Merge pull request #27167 from euank/fix-node-e2e-coreos-kubelet-cgroup-detection

Automatic merge from submit-queue

Fix node e2e coreos kubelet cgroup detection

Fixes #26979 #26431 

The root issue, as best I can tell, is that cgroup detection does not work when the kubelet is started under an ssh session and the systemd `*Accounting` variables are set. I added additional logging and noted differences between the cgroup slice names cadvisor returns and the ones the kubelet detects for itself.
This difference does not occur when the kubelet runs properly under a systemd unit, which is also the more common and sane environment.
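
For illustration, a minimal sketch (not the kubelet's actual detection code) of what "detecting its own cgroup" amounts to: a process can read /proc/self/cgroup, and under an ssh session the paths typically name a session scope (e.g. ".../session-42.scope", an illustrative name), whereas under a systemd unit they name the unit's slice, which is what cadvisor reports:

package main

import (
	"fmt"
	"io/ioutil"
)

func main() {
	// Each line is "hierarchy-ID:controller-list:cgroup-path"; the path is the
	// part that differs between an ssh session and a systemd unit.
	data, err := ioutil.ReadFile("/proc/self/cgroup")
	if err != nil {
		fmt.Println("cgroup detection failed:", err)
		return
	}
	fmt.Print(string(data))
}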

See also discussion in #26903

cc @derekwaynecarr @vishh @pwittrock
k8s-merge-robot 2016-06-13 11:54:05 -07:00 committed by GitHub
commit d9f788532f
4 changed files with 73 additions and 34 deletions

View File

@@ -20,6 +20,7 @@ import (
 	"flag"
 	"fmt"
 	"io/ioutil"
+	"math/rand"
 	"net/http"
 	"os"
 	"os/exec"
@@ -37,16 +38,28 @@ var serverStartTimeout = flag.Duration("server-start-timeout", time.Second*120,
 var reportDir = flag.String("report-dir", "", "Path to the directory where the JUnit XML reports should be saved. Default is empty, which doesn't generate these reports.")
 
 type e2eService struct {
-	etcdCmd             *exec.Cmd
+	etcdCmd             *killCmd
 	etcdDataDir         string
-	apiServerCmd        *exec.Cmd
-	kubeletCmd          *exec.Cmd
+	apiServerCmd        *killCmd
+	kubeletCmd          *killCmd
 	kubeletStaticPodDir string
 	nodeName            string
+	logFiles            map[string]logFileData
+}
+
+type logFileData struct {
+	files             []string
+	journalctlCommand []string
 }
 
 func newE2eService(nodeName string) *e2eService {
-	return &e2eService{nodeName: nodeName}
+	// Special log files that need to be collected for additional debugging.
+	var logFiles = map[string]logFileData{
+		"kern.log":   {[]string{"/var/log/kern.log"}, []string{"-k"}},
+		"docker.log": {[]string{"/var/log/docker.log", "/var/log/upstart/docker.log"}, []string{"-u", "docker"}},
+	}
+	return &e2eService{nodeName: nodeName, logFiles: logFiles}
 }
 
 func (es *e2eService) start() error {
@@ -81,22 +94,12 @@ func (es *e2eService) start() error {
 // Get logs of interest either via journalctl or by creating sym links.
 // Since we scp files from the remote directory, symlinks will be treated as normal files and file contents will be copied over.
 func (es *e2eService) getLogFiles() {
-	// Special log files that need to be collected for additional debugging.
-	type logFileData struct {
-		files             []string
-		journalctlCommand []string
-	}
-	var logFiles = map[string]logFileData{
-		"kern.log":   {[]string{"/var/log/kern.log"}, []string{"-k"}},
-		"docker.log": {[]string{"/var/log/docker.log", "/var/log/upstart/docker.log"}, []string{"-u", "docker"}},
-	}
 	// Nothing to do if report dir is not specified.
 	if *reportDir == "" {
 		return
 	}
 	journaldFound := isJournaldAvailable()
-	for targetFileName, logFileData := range logFiles {
+	for targetFileName, logFileData := range es.logFiles {
 		targetLink := path.Join(*reportDir, targetFileName)
 		if journaldFound {
 			// Skip log files that do not have an equivalent in journald based machines.
@@ -144,7 +147,7 @@ func isJournaldAvailable() bool {
 }
 
 func (es *e2eService) stop() {
-	if err := es.stopService("kubelet", es.kubeletCmd); err != nil {
+	if err := es.stopService(es.kubeletCmd); err != nil {
 		glog.Errorf("Failed to stop kubelet: %v", err)
 	}
 	if es.kubeletStaticPodDir != "" {
@@ -153,10 +156,10 @@ func (es *e2eService) stop() {
 			glog.Errorf("Failed to delete kubelet static pod directory %s.\n%v", es.kubeletStaticPodDir, err)
 		}
 	}
-	if err := es.stopService("kube-apiserver", es.apiServerCmd); err != nil {
+	if err := es.stopService(es.apiServerCmd); err != nil {
 		glog.Errorf("Failed to stop kube-apiserver: %v", err)
 	}
-	if err := es.stopService("etcd", es.etcdCmd); err != nil {
+	if err := es.stopService(es.etcdCmd); err != nil {
 		glog.Errorf("Failed to stop etcd: %v", err)
 	}
 	if es.etcdDataDir != "" {
@@ -167,7 +170,7 @@ func (es *e2eService) stop() {
 	}
 }
 
-func (es *e2eService) startEtcd() (*exec.Cmd, error) {
+func (es *e2eService) startEtcd() (*killCmd, error) {
 	dataDir, err := ioutil.TempDir("", "node-e2e")
 	if err != nil {
 		return nil, err
@@ -181,10 +184,10 @@ func (es *e2eService) startEtcd() (*exec.Cmd, error) {
 		"http://127.0.0.1:4001/v2/keys/", // Trailing slash is required,
 		cmd,
 		"etcd.log")
-	return cmd, es.startServer(hcc)
+	return &killCmd{name: "etcd", cmd: cmd}, es.startServer(hcc)
 }
 
-func (es *e2eService) startApiServer() (*exec.Cmd, error) {
+func (es *e2eService) startApiServer() (*killCmd, error) {
 	cmd := exec.Command("sudo", getApiServerBin(),
 		"--etcd-servers", "http://127.0.0.1:4001",
 		"--insecure-bind-address", "0.0.0.0",
@@ -197,16 +200,32 @@
 		"http://127.0.0.1:8080/healthz",
 		cmd,
 		"kube-apiserver.log")
-	return cmd, es.startServer(hcc)
+	return &killCmd{name: "kube-apiserver", cmd: cmd}, es.startServer(hcc)
 }
 
-func (es *e2eService) startKubeletServer() (*exec.Cmd, error) {
+func (es *e2eService) startKubeletServer() (*killCmd, error) {
 	dataDir, err := ioutil.TempDir("", "node-e2e-pod")
 	if err != nil {
 		return nil, err
 	}
 	es.kubeletStaticPodDir = dataDir
-	cmd := exec.Command("sudo", getKubeletServerBin(),
+	var killOverride *exec.Cmd
+	cmdArgs := []string{}
+	if systemdRun, err := exec.LookPath("systemd-run"); err == nil {
+		// On systemd services, detection of a service / unit works reliably while
+		// detection of a process started from an ssh session does not work.
+		// Since kubelet will typically be run as a service it also makes more
+		// sense to test it that way.
+		unitName := fmt.Sprintf("kubelet-%d.service", rand.Int31())
+		cmdArgs = append(cmdArgs, systemdRun, "--unit="+unitName, getKubeletServerBin())
+		killOverride = exec.Command("sudo", "systemctl", "kill", unitName)
+		es.logFiles["kubelet.log"] = logFileData{
+			journalctlCommand: []string{"-u", unitName},
+		}
+	} else {
+		cmdArgs = append(cmdArgs, getKubeletServerBin())
+	}
+	cmdArgs = append(cmdArgs,
 		"--api-servers", "http://127.0.0.1:8080",
 		"--address", "0.0.0.0",
 		"--port", "10250",
@@ -218,11 +237,12 @@ func (es *e2eService) startKubeletServer() (*exec.Cmd, error) {
 		"--file-check-frequency", "10s", // Check file frequently so tests won't wait too long
 		"--v", "8", "--logtostderr",
 	)
+	cmd := exec.Command("sudo", cmdArgs...)
 	hcc := newHealthCheckCommand(
 		"http://127.0.0.1:10255/healthz",
 		cmd,
 		"kubelet.log")
-	return cmd, es.startServer(hcc)
+	return &killCmd{name: "kubelet", cmd: cmd, override: killOverride}, es.startServer(hcc)
 }
 
 func (es *e2eService) startServer(cmd *healthCheckCommand) error {
@@ -279,7 +299,27 @@ func (es *e2eService) startServer(cmd *healthCheckCommand) error {
 	return fmt.Errorf("Timeout waiting for service %s", cmd)
 }
 
-func (es *e2eService) stopService(name string, cmd *exec.Cmd) error {
+func (es *e2eService) stopService(cmd *killCmd) error {
+	return cmd.Kill()
+}
+
+// killCmd is a struct to kill a given cmd. The cmd member specifies a command
+// to find the pid of and attempt to kill.
+// If the override field is set, that will be used instead to kill the command.
+// name is only used for logging.
+type killCmd struct {
+	name     string
+	cmd      *exec.Cmd
+	override *exec.Cmd
+}
+
+func (k *killCmd) Kill() error {
+	if k.override != nil {
+		return k.override.Run()
+	}
+	name := k.name
+	cmd := k.cmd
 	if cmd == nil || cmd.Process == nil {
 		glog.V(2).Infof("%s not running", name)
 		return nil
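
For context, a minimal standalone sketch of the transient-unit pattern the kubelet startup code above uses (assuming a systemd host with sudo; the unit name and the `sleep 60` payload are illustrative):

package main

import (
	"fmt"
	"math/rand"
	"os/exec"
)

func main() {
	// A unique unit name lets systemd track the process as a proper service,
	// so cgroup detection sees a service slice rather than a session scope.
	unitName := fmt.Sprintf("demo-%d.service", rand.Int31())
	start := exec.Command("sudo", "systemd-run", "--unit="+unitName, "sleep", "60")
	if err := start.Run(); err != nil {
		fmt.Println("failed to start unit:", err)
		return
	}
	// Stopping via the unit name kills the unit's whole cgroup, which is more
	// reliable than tracking the pid of a process launched through sudo.
	kill := exec.Command("sudo", "systemctl", "kill", unitName)
	if err := kill.Run(); err != nil {
		fmt.Println("failed to stop unit:", err)
	}
}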

View File

@@ -41,6 +41,11 @@ EOF
   sudo systemctl daemon-reload
 fi
 
+# For coreos, disable updates
+if $(sudo systemctl status update-engine &>/dev/null); then
+  sudo systemctl mask update-engine locksmithd
+fi
+
 # Fixup sudoers require tty
 sudo grep -q "# Defaults requiretty" /etc/sudoers
 if [ $? -ne 0 ] ; then

View File

@@ -3,10 +3,7 @@ GCE_HOSTS=
 # To copy an image between projects:
 # `gcloud compute --project <to-project> disks create <image name> --image=https://www.googleapis.com/compute/v1/projects/<from-project>/global/images/<image-name>`
 # `gcloud compute --project <to-project> images create <image-name> --source-disk=<image-name>`
-# GCE_IMAGES=e2e-node-ubuntu-trusty-docker10-image,e2e-node-ubuntu-trusty-docker9-image,e2e-node-ubuntu-trusty-docker8-image,e2e-node-coreos-stable20160531-image,e2e-node-containervm-v20160321-image
-# Testing disabled on the following images:
-# e2e-node-coreos-stable20160531-image - Github Issue #26903
-GCE_IMAGES=e2e-node-ubuntu-trusty-docker10-image,e2e-node-ubuntu-trusty-docker9-image,e2e-node-ubuntu-trusty-docker8-image,e2e-node-containervm-v20160321-image
+GCE_IMAGES=e2e-node-ubuntu-trusty-docker10-image,e2e-node-ubuntu-trusty-docker9-image,e2e-node-ubuntu-trusty-docker8-image,e2e-node-coreos-stable20160531-image,e2e-node-containervm-v20160321-image
 GCE_ZONE=us-central1-f
 GCE_PROJECT=kubernetes-jenkins
 GCE_IMAGE_PROJECT=kubernetes-jenkins

View File

@@ -3,10 +3,7 @@ GCE_HOSTS=
 # To copy an image between projects:
 # `gcloud compute --project <to-project> disks create <image name> --image=https://www.googleapis.com/compute/v1/projects/<from-project>/global/images/<image-name>`
 # `gcloud compute --project <to-project> images create <image-name> --source-disk=<image-name>`
-# GCE_IMAGES=e2e-node-ubuntu-trusty-docker10-image,e2e-node-ubuntu-trusty-docker9-image,e2e-node-ubuntu-trusty-docker8-image,e2e-node-coreos-stable20160531-image,e2e-node-containervm-v20160321-image
-# Testing disabled on the following images:
-# e2e-node-coreos-stable20160531-image - Github Issue #26903
-GCE_IMAGES=e2e-node-ubuntu-trusty-docker10-image,e2e-node-ubuntu-trusty-docker9-image,e2e-node-ubuntu-trusty-docker8-image,e2e-node-containervm-v20160321-image
+GCE_IMAGES=e2e-node-ubuntu-trusty-docker10-image,e2e-node-ubuntu-trusty-docker9-image,e2e-node-ubuntu-trusty-docker8-image,e2e-node-coreos-stable20160531-image,e2e-node-containervm-v20160321-image
 GCE_ZONE=us-central1-f
 GCE_PROJECT=kubernetes-jenkins-pull
 GCE_IMAGE_PROJECT=kubernetes-jenkins-pull