Merge pull request #33063 from pmorie/node-e2e

Automatic merge from submit-queue

Make node E2E tests more transparent

Add some logging and minor code reorganization to make the node E2E tests a little more transparent and understandable.
Kubernetes Submit Queue 2016-09-22 08:22:11 -07:00 committed by GitHub
commit db07433782
6 changed files with 145 additions and 92 deletions


@@ -168,6 +168,7 @@ test-e2e: ginkgo generated_files
 #   INSTANCE_METADATA: For REMOTE=true and running on GCE only.
 #   GUBERNATOR: For REMOTE=true only. Produce link to Gubernator to view logs.
 #     Defaults to false.
+#   PARALLELISM: The number of ginkgo nodes to run. Defaults to 8.
 #
 # Example:
 #   make test-e2e-node FOCUS=Kubelet SKIP=container


@@ -141,8 +141,9 @@ if [ $remote = true ] ; then
   exit $?
 else
-  # Refresh sudo credentials if not running on GCE.
+  # Refresh sudo credentials for local run
   if ! ping -c 1 -q metadata.google.internal &> /dev/null; then
+    echo "Updating sudo credentials"
     sudo -v || exit 1
   fi
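The check above detects GCE by probing the metadata server with a single ping. For illustration, a minimal Go sketch of the same probe; the helper name `runningOnGCE` is hypothetical and not part of the Kubernetes tree:

```go
package main

import (
	"fmt"
	"os/exec"
)

// runningOnGCE reports whether the GCE metadata server answers one ping,
// mirroring the shell check in the diff above.
func runningOnGCE() bool {
	return exec.Command("ping", "-c", "1", "-q", "metadata.google.internal").Run() == nil
}

func main() {
	if !runningOnGCE() {
		fmt.Println("Updating sudo credentials")
		// The real script would run `sudo -v` here to refresh credentials.
	}
}
```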


@@ -47,6 +47,7 @@ func main() {
 	if err != nil {
 		glog.Fatalf("Failed to get build output directory: %v", err)
 	}
+	glog.Infof("Got build output dir: %v", outputDir)
 	ginkgo := filepath.Join(outputDir, "ginkgo")
 	test := filepath.Join(outputDir, "e2e_node.test")
 	runCommand(ginkgo, *ginkgoFlags, test, "--", *testFlags)
@@ -54,6 +55,7 @@ func main() {
 }

 func runCommand(name string, args ...string) error {
+	glog.Infof("Running command: %v %v", name, strings.Join(args, " "))
 	cmd := exec.Command("sh", "-c", strings.Join(append([]string{name}, args...), " "))
 	cmd.Stdout = os.Stdout
 	cmd.Stderr = os.Stderr
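`runCommand` collapses the binary and its flags into a single `sh -c` invocation and inherits the caller's stdout/stderr. A self-contained sketch of the same pattern, with glog swapped for the standard library logger as an assumption of the sketch:

```go
package main

import (
	"log"
	"os"
	"os/exec"
	"strings"
)

// runCommand mirrors the pattern above: join the command name and args into
// one shell command line, and stream output to the caller's stdout/stderr.
func runCommand(name string, args ...string) error {
	log.Printf("Running command: %v %v", name, strings.Join(args, " "))
	cmd := exec.Command("sh", "-c", strings.Join(append([]string{name}, args...), " "))
	cmd.Stdout = os.Stdout
	cmd.Stderr = os.Stderr
	return cmd.Run()
}

func main() {
	if err := runCommand("echo", "hello", "world"); err != nil {
		log.Fatal(err)
	}
}
```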


@@ -57,7 +57,7 @@ func (a *APIServer) Start() error {
 		}
 	}()
-	err = readinessCheck([]string{apiserverHealthCheckURL}, errCh)
+	err = readinessCheck("apiserver", []string{apiserverHealthCheckURL}, errCh)
 	if err != nil {
 		return err
 	}


@@ -118,7 +118,7 @@ func (e *EtcdServer) Start() error {
 		errCh <- srv.Serve(l)
 	}(e.clientListen)
-	err = readinessCheck([]string{etcdHealthCheckURL}, errCh)
+	err = readinessCheck("etcd", []string{etcdHealthCheckURL}, errCh)
 	if err != nil {
 		return err
 	}


@@ -44,12 +44,13 @@ import (
 // TODO(random-liu): Move this file to a separate package.
 var serverStartTimeout = flag.Duration("server-start-timeout", time.Second*120, "Time to wait for each server to become healthy.")

-// E2EServices starts and stops e2e services in a separate process. The test uses it to start and
-// stop all e2e services.
+// E2EServices starts and stops e2e services in a separate process. The test
+// uses it to start and stop all e2e services.
 type E2EServices struct {
 	services *server
 }

+// NewE2EServices returns a new E2EServices instance.
 func NewE2EServices() *E2EServices {
 	return &E2EServices{}
 }
@@ -57,11 +58,12 @@ func NewE2EServices() *E2EServices {
 // services.log is the combined log of all services
 const servicesLogFile = "services.log"

-// Start starts the e2e services in another process, it returns when all e2e
-// services are ready.
+// Start starts the e2e services in another process by calling back into the
+// test binary. Returns when all e2e services are ready or an error.
+//
 // We want to statically link e2e services into the test binary, but we don't
-// want their glog to pollute the test result. So we run the binary in run-
-// services-mode to start e2e services in another process.
+// want their glog output to pollute the test result. So we run the binary in
+// run-services-mode to start e2e services in another process.
 func (e *E2EServices) Start() error {
 	var err error
 	// Create the manifest path for kubelet.
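The comment above describes a self-re-exec pattern: the test binary launches a copy of itself with an internal flag so the services' glog output stays out of the test process. A minimal sketch of the idea under that assumption; the flag name and log destination here are illustrative, not the exact ones in the tree:

```go
package main

import (
	"flag"
	"log"
	"os"
	"os/exec"
)

var runServicesMode = flag.Bool("run-services-mode", false,
	"internal: run e2e services instead of tests")

func main() {
	flag.Parse()
	if *runServicesMode {
		log.Println("child: starting services...") // services would run here
		return
	}
	// Parent: re-exec ourselves so the services' logging lands in a
	// separate process (and could be redirected to services.log).
	cmd := exec.Command(os.Args[0], "--run-services-mode")
	cmd.Stdout = os.Stdout
	cmd.Stderr = os.Stderr
	if err := cmd.Start(); err != nil {
		log.Fatalf("failed to start services process: %v", err)
	}
	log.Printf("services process pid: %d", cmd.Process.Pid)
}
```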
@@ -155,18 +157,20 @@ func getHealthCheckURLs() []string {
 	}
 }

-// e2eService is used internally in this file to start e2e services in current process.
+// e2eService manages e2e services in current process.
 type e2eService struct {
 	services []*server
 	rmDirs   []string
 	logFiles map[string]logFileData

-	// All statically linked e2e services
+	// statically linked e2e services
 	etcdServer   *EtcdServer
 	apiServer    *APIServer
 	nsController *NamespaceController
 }

+// logFileData holds data about logfiles to fetch with a journalctl command or
+// symlink from a node's file system.
 type logFileData struct {
 	files             []string
 	journalctlCommand []string
@@ -206,6 +210,7 @@ func (es *e2eService) run() error {
 	return nil
 }

+// start starts the test's embedded services or returns an error.
 func (es *e2eService) start() error {
 	glog.Info("Starting e2e services...")
 	err := es.startEtcd()
@@ -218,7 +223,7 @@ func (es *e2eService) start() error {
 		return err
 	}

-	s, err := es.startKubeletServer()
+	s, err := es.startKubelet()
 	if err != nil {
 		return err
 	}
@@ -232,8 +237,9 @@ func (es *e2eService) start() error {
 	return nil
 }

-// Get logs of interest either via journalctl or by creating sym links.
-// Since we scp files from the remote directory, symlinks will be treated as normal files and file contents will be copied over.
+// getLogFiles gets logs of interest either via journalctl or by creating sym
+// links. Since we scp files from the remote directory, symlinks will be
+// treated as normal files and file contents will be copied over.
 func (es *e2eService) getLogFiles() {
 	// Nothing to do if report dir is not specified.
 	if framework.TestContext.ReportDir == "" {
@@ -244,7 +250,7 @@ func (es *e2eService) getLogFiles() {
 	for targetFileName, logFileData := range es.logFiles {
 		targetLink := path.Join(framework.TestContext.ReportDir, targetFileName)
 		if journaldFound {
-			// Skip log files that do not have an equivalent in journald based machines.
+			// Skip log files that do not have an equivalent in journald-based machines.
 			if len(logFileData.journalctlCommand) == 0 {
 				continue
 			}
@@ -273,59 +279,55 @@
 	}
 }

-func copyLogFile(src, target string) error {
-	// If not a journald based distro, then just symlink files.
-	if out, err := exec.Command("sudo", "cp", src, target).CombinedOutput(); err != nil {
-		return fmt.Errorf("failed to copy %q to %q: %v, %v", src, target, out, err)
-	}
-	if out, err := exec.Command("sudo", "chmod", "a+r", target).CombinedOutput(); err != nil {
-		return fmt.Errorf("failed to make log file %q world readable: %v, %v", target, out, err)
-	}
-	return nil
-}
-
-func isJournaldAvailable() bool {
-	_, err := exec.LookPath("journalctl")
-	return err == nil
-}
-
+// stop stops the embedded e2e services.
 func (es *e2eService) stop() {
 	glog.Info("Stopping e2e services...")
 	es.getLogFiles()
-	// TODO(random-liu): Use a loop to stop all services after introducing service interface.
-	// Stop namespace controller
+	// TODO(random-liu): Use a loop to stop all services after introducing
+	// service interface.
+	glog.Info("Stopping namespace controller")
 	if es.nsController != nil {
 		if err := es.nsController.Stop(); err != nil {
 			glog.Errorf("Failed to stop %q: %v", es.nsController.Name(), err)
 		}
 	}
-	// Stop apiserver
+	glog.Info("Stopping API server")
 	if es.apiServer != nil {
 		if err := es.apiServer.Stop(); err != nil {
 			glog.Errorf("Failed to stop %q: %v", es.apiServer.Name(), err)
 		}
 	}
 	for _, s := range es.services {
+		glog.Info("Stopping service %q", s.name)
 		if err := s.kill(); err != nil {
 			glog.Errorf("Failed to stop %v: %v", s.name, err)
 		}
 	}
-	// Stop etcd
+	glog.Info("Stopping etcd")
 	if es.etcdServer != nil {
 		if err := es.etcdServer.Stop(); err != nil {
 			glog.Errorf("Failed to stop %q: %v", es.etcdServer.Name(), err)
 		}
 	}
 	for _, d := range es.rmDirs {
+		glog.Info("Deleting directory %v", d)
 		err := os.RemoveAll(d)
 		if err != nil {
 			glog.Errorf("Failed to delete directory %s.\n%v", d, err)
 		}
 	}
 	glog.Info("E2E services stopped.")
 }

+// startEtcd starts the embedded etcd instance or returns an error.
 func (es *e2eService) startEtcd() error {
+	glog.Info("Starting etcd")
 	dataDir, err := ioutil.TempDir("", "node-e2e")
 	if err != nil {
 		return err
@@ -336,17 +338,24 @@ func (es *e2eService) startEtcd() error {
 	return es.etcdServer.Start()
 }

+// startApiServer starts the embedded API server or returns an error.
 func (es *e2eService) startApiServer() error {
+	glog.Info("Starting API server")
 	es.apiServer = NewAPIServer()
 	return es.apiServer.Start()
 }

+// startNamespaceController starts the embedded namespace controller or returns an error.
 func (es *e2eService) startNamespaceController() error {
+	glog.Info("Starting namespace controller")
 	es.nsController = NewNamespaceController()
 	return es.nsController.Start()
 }

-func (es *e2eService) startKubeletServer() (*server, error) {
+// startKubelet starts the Kubelet in a separate process or returns an error
+// if the Kubelet fails to start.
+func (es *e2eService) startKubelet() (*server, error) {
+	glog.Info("Starting kubelet")
 	var killCommand, restartCommand *exec.Cmd
 	cmdArgs := []string{}
 	if systemdRun, err := exec.LookPath("systemd-run"); err == nil {
@@ -417,11 +426,12 @@ func (es *e2eService) startKubeletServer() (*server, error) {
 		restartCommand,
 		[]string{kubeletHealthCheckURL},
 		"kubelet.log",
-		true)
+		true /* restartOnExit */)
 	return server, server.start()
 }

-// server manages a server started and killed with commands.
+// A server manages a separate server process started and killed with
+// commands.
 type server struct {
 	// name is the name of the server, it is only used for logging.
 	name string
@@ -447,14 +457,16 @@ type server struct {
 	ackStopRestartingCh <-chan bool
 }

-func newServer(name string, start, kill, restart *exec.Cmd, urls []string, filename string, restartOnExit bool) *server {
+// newServer returns a new server with the given name, commands, health check
+// URLs, etc.
+func newServer(name string, start, kill, restart *exec.Cmd, urls []string, outputFileName string, restartOnExit bool) *server {
 	return &server{
 		name:            name,
 		startCommand:    start,
 		killCommand:     kill,
 		restartCommand:  restart,
 		healthCheckUrls: urls,
-		outFilename:     filename,
+		outFilename:     outputFileName,
 		restartOnExit:   restartOnExit,
 	}
 }
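To make the constructor concrete, here is a trimmed-down, runnable mirror of the server struct and newServer from the diff; the commands and health check URL are placeholders standing in for the real kubelet lifecycle commands built in startKubelet:

```go
package main

import (
	"fmt"
	"os/exec"
)

// A cut-down mirror of the server struct in the diff; field names follow it.
type server struct {
	name            string
	startCommand    *exec.Cmd
	killCommand     *exec.Cmd
	restartCommand  *exec.Cmd
	healthCheckUrls []string
	outFilename     string
	restartOnExit   bool
}

func newServer(name string, start, kill, restart *exec.Cmd, urls []string, outputFileName string, restartOnExit bool) *server {
	return &server{
		name:            name,
		startCommand:    start,
		killCommand:     kill,
		restartCommand:  restart,
		healthCheckUrls: urls,
		outFilename:     outputFileName,
		restartOnExit:   restartOnExit,
	}
}

func main() {
	// Placeholder commands and URL; not the real ones from the diff.
	s := newServer(
		"kubelet",
		exec.Command("sleep", "3600"),
		exec.Command("true"),
		exec.Command("true"),
		[]string{"http://127.0.0.1:10255/healthz"},
		"kubelet.log",
		true, // restartOnExit
	)
	fmt.Printf("constructed server %q logging to %s\n", s.name, s.outFilename)
}
```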
@@ -472,54 +484,15 @@ func (s *server) String() string {
 		commandToString(s.startCommand), commandToString(s.killCommand), commandToString(s.restartCommand), s.healthCheckUrls, s.outFilename)
 }

-// readinessCheck checks whether services are ready via the health check urls. Once there is
-// an error in errCh, the function will stop waiting and return the error.
-// TODO(random-liu): Move this to util
-func readinessCheck(urls []string, errCh <-chan error) error {
-	endTime := time.Now().Add(*serverStartTimeout)
-	blockCh := make(chan error)
-	defer close(blockCh)
-	for endTime.After(time.Now()) {
-		select {
-		// We *always* want to run the health check if there is no error on the channel.
-		// With systemd, reads from errCh report nil because cmd.Run() waits
-		// on systemd-run, rather than the service process. systemd-run quickly
-		// exits with status 0, causing the channel to be closed with no error. In
-		// this case, you want to wait for the health check to complete, rather
-		// than returning from readinessCheck as soon as the channel is closed.
-		case err, ok := <-errCh:
-			if ok { // The channel is not closed, this is a real error
-				if err != nil { // If there is an error, return it
-					return err
-				}
-				// If not, keep checking readiness.
-			} else { // The channel is closed, this is only a zero value.
-				// Replace the errCh with blockCh to avoid busy loop,
-				// and keep checking readiness.
-				errCh = blockCh
-			}
-		case <-time.After(time.Second):
-			ready := true
-			for _, url := range urls {
-				resp, err := http.Head(url)
-				if err != nil || resp.StatusCode != http.StatusOK {
-					ready = false
-					break
-				}
-			}
-			if ready {
-				return nil
-			}
-		}
-	}
-	return fmt.Errorf("e2e service readiness check timeout %v", *serverStartTimeout)
-}
-
+// start starts the server by running its commands, monitors it with a health
+// check, and ensures that it is restarted if applicable.
+//
 // Note: restartOnExit == true requires len(s.healthCheckUrls) > 0 to work properly.
 func (s *server) start() error {
-	glog.Infof("Start server %q with command %q", s.name, commandToString(s.startCommand))
+	glog.Infof("Starting server %q with command %q", s.name, commandToString(s.startCommand))
 	errCh := make(chan error)

+	// Set up restart channels if the server is configured for restart on exit.
 	var stopRestartingCh, ackStopRestartingCh chan bool
 	if s.restartOnExit {
 		if len(s.healthCheckUrls) == 0 {
@@ -533,6 +506,7 @@ func (s *server) start() error {
 		s.ackStopRestartingCh = ackStopRestartingCh
 	}

+	// This goroutine actually runs the start command for the server.
 	go func() {
 		defer close(errCh)
@@ -542,6 +516,8 @@ func (s *server) start() error {
 		if err != nil {
 			errCh <- fmt.Errorf("failed to create file %q for `%s` %v.", outPath, s, err)
 			return
+		} else {
+			glog.Infof("Output file for server %q: %v", s.name, outfile.Name())
 		}
 		defer outfile.Close()
 		defer outfile.Sync()
@@ -569,25 +545,29 @@ func (s *server) start() error {
 			return
 		}
 		if !s.restartOnExit {
+			glog.Infof("Waiting for server %q start command to complete", s.name)
 			// If we aren't planning on restarting, ok to Wait() here to release resources.
 			// Otherwise, we Wait() in the restart loop.
 			err = s.startCommand.Wait()
 			if err != nil {
-				errCh <- fmt.Errorf("failed to run %s: %v", s, err)
+				errCh <- fmt.Errorf("failed to run start command for server %q: %v", s.name, err)
 				return
 			}
 		} else {
-			// New stuff
 			usedStartCmd := true
 			for {
+				glog.Infof("Running health check for service %q", s.name)
 				// Wait for an initial health check to pass, so that we are sure the server started.
-				err := readinessCheck(s.healthCheckUrls, nil)
+				err := readinessCheck(s.name, s.healthCheckUrls, nil)
 				if err != nil {
 					if usedStartCmd {
+						glog.Infof("Waiting for server %q start command to complete after initial health check failed", s.name)
 						s.startCommand.Wait() // Release resources if necessary.
 					}
 					// This should not happen, immediately stop the e2eService process.
 					glog.Fatalf("restart loop readinessCheck failed for %s", s)
+				} else {
+					glog.Infof("Initial health check passed for service %q", s.name)
 				}

 				// Initial health check passed, wait until a health check fails again.
@@ -612,7 +592,8 @@ func (s *server) start() error {
 					usedStartCmd = false
 				}
 				if s.restartCommand != nil {
-					// Always make a fresh copy of restartCommand before running, we may have to restart multiple times
+					// Always make a fresh copy of restartCommand before
+					// running, we may have to restart multiple times
 					s.restartCommand = &exec.Cmd{
 						Path: s.restartCommand.Path,
 						Args: s.restartCommand.Args,
@@ -624,10 +605,13 @@ func (s *server) start() error {
 						ExtraFiles:  s.restartCommand.ExtraFiles,
 						SysProcAttr: s.restartCommand.SysProcAttr,
 					}
-					err = s.restartCommand.Run() // Run and wait for exit. This command is assumed to have short duration, e.g. systemctl restart
+					// Run and wait for exit. This command is assumed to have
+					// short duration, e.g. systemctl restart
+					glog.Infof("Restarting server %q with restart command", s.name)
+					err = s.restartCommand.Run()
 					if err != nil {
 						// This should not happen, immediately stop the e2eService process.
-						glog.Fatalf("restarting %s with restartCommand failed. Error: %v.", s, err)
+						glog.Fatalf("restarting server %s with restartCommand failed. Error: %v.", s, err)
 					}
 				} else {
 					s.startCommand = &exec.Cmd{
@@ -641,20 +625,22 @@ func (s *server) start() error {
 						ExtraFiles:  s.startCommand.ExtraFiles,
 						SysProcAttr: s.startCommand.SysProcAttr,
 					}
+					glog.Infof("Restarting server %q with start command", s.name)
 					err = s.startCommand.Start()
 					usedStartCmd = true
 					if err != nil {
 						// This should not happen, immediately stop the e2eService process.
-						glog.Fatalf("restarting %s with startCommand failed. Error: %v.", s, err)
+						glog.Fatalf("Restarting %s with startCommand failed. Error: %v.", s, err)
 					}
 				}
 			}
 		}
 	}()

-	return readinessCheck(s.healthCheckUrls, errCh)
+	return readinessCheck(s.name, s.healthCheckUrls, errCh)
 }

+// kill runs the server's kill command.
 func (s *server) kill() error {
 	glog.Infof("Kill server %q", s.name)
 	name := s.name
@@ -731,3 +717,66 @@ func (s *server) kill() error {
 func adjustConfigForSystemd(config string) string {
 	return strings.Replace(config, "%", "%%", -1)
 }
+
+func copyLogFile(src, target string) error {
+	// If not a journald based distro, then just symlink files.
+	if out, err := exec.Command("sudo", "cp", src, target).CombinedOutput(); err != nil {
+		return fmt.Errorf("failed to copy %q to %q: %v, %v", src, target, out, err)
+	}
+	if out, err := exec.Command("sudo", "chmod", "a+r", target).CombinedOutput(); err != nil {
+		return fmt.Errorf("failed to make log file %q world readable: %v, %v", target, out, err)
+	}
+	return nil
+}
+
+// isJournaldAvailable returns whether the system executing the tests uses
+// journald.
+func isJournaldAvailable() bool {
+	_, err := exec.LookPath("journalctl")
+	return err == nil
+}
+
+// readinessCheck checks whether services are ready via the supplied health
+// check URLs. Once there is an error in errCh, the function will stop waiting
+// and return the error.
+// TODO(random-liu): Move this to util
+func readinessCheck(name string, urls []string, errCh <-chan error) error {
+	glog.Infof("Running readiness check for service %q", name)
+	endTime := time.Now().Add(*serverStartTimeout)
+	blockCh := make(chan error)
+	defer close(blockCh)
+	for endTime.After(time.Now()) {
+		select {
+		// We *always* want to run the health check if there is no error on the channel.
+		// With systemd, reads from errCh report nil because cmd.Run() waits
+		// on systemd-run, rather than the service process. systemd-run quickly
+		// exits with status 0, causing the channel to be closed with no error. In
+		// this case, you want to wait for the health check to complete, rather
+		// than returning from readinessCheck as soon as the channel is closed.
+		case err, ok := <-errCh:
+			if ok { // The channel is not closed, this is a real error
+				if err != nil { // If there is an error, return it
+					return err
+				}
+				// If not, keep checking readiness.
+			} else { // The channel is closed, this is only a zero value.
+				// Replace the errCh with blockCh to avoid busy loop,
+				// and keep checking readiness.
+				errCh = blockCh
+			}
+		case <-time.After(time.Second):
+			ready := true
+			for _, url := range urls {
+				resp, err := http.Head(url)
+				if err != nil || resp.StatusCode != http.StatusOK {
+					ready = false
+					break
+				}
+			}
+			if ready {
+				return nil
+			}
+		}
+	}
+	return fmt.Errorf("e2e service %q readiness check timeout %v", name, *serverStartTimeout)
+}
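The blockCh swap in readinessCheck is the subtle part: once errCh is closed (as happens when systemd-run exits immediately with status 0), swapping in a channel that is never closed keeps the select from busy-looping on the closed channel. A standalone sketch of the pattern, with the HTTP probe stubbed out by a counter-based readiness function:

```go
package main

import (
	"fmt"
	"time"
)

// waitReady polls a readiness function once per second until it succeeds, an
// error arrives on errCh, or the timeout expires. When errCh is closed, it is
// swapped for a channel that never closes so the loop keeps health checking
// instead of spinning on zero values.
func waitReady(ready func() bool, errCh <-chan error, timeout time.Duration) error {
	endTime := time.Now().Add(timeout)
	blockCh := make(chan error)
	defer close(blockCh)
	for endTime.After(time.Now()) {
		select {
		case err, ok := <-errCh:
			if ok {
				if err != nil {
					return err // a real error from the server process
				}
			} else {
				errCh = blockCh // closed channel: block here from now on
			}
		case <-time.After(time.Second):
			if ready() {
				return nil
			}
		}
	}
	return fmt.Errorf("readiness check timed out after %v", timeout)
}

func main() {
	errCh := make(chan error)
	close(errCh) // simulate systemd-run exiting cleanly right away
	polls := 0
	err := waitReady(func() bool { polls++; return polls >= 2 }, errCh, 10*time.Second)
	fmt.Println("ready:", err == nil, "polls:", polls)
}
```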