Merge pull request #27167 from euank/fix-node-e2e-coreos-kubelet-cgroup-detection

Automatic merge from submit-queue

Fix node e2e coreos kubelet cgroup detection

Fixes #26979 #26431 

The root issue, as best I can tell, is that cgroup detection does not work when the kubelet is started from an ssh session and the systemd `*Accounting` variables are set. I added extra logging and found that the cgroup slice names cadvisor returns differ from the ones the kubelet detects for itself.
This difference does not occur when the kubelet runs under a proper systemd unit, which is also the more common and saner environment.

See also discussion in #26903

cc @derekwaynecarr @vishh @pwittrock
This commit is contained in:
k8s-merge-robot 2016-06-13 11:54:05 -07:00 committed by GitHub
commit d9f788532f
4 changed files with 73 additions and 34 deletions

View File

@ -20,6 +20,7 @@ import (
"flag"
"fmt"
"io/ioutil"
"math/rand"
"net/http"
"os"
"os/exec"
@ -37,16 +38,28 @@ var serverStartTimeout = flag.Duration("server-start-timeout", time.Second*120,
var reportDir = flag.String("report-dir", "", "Path to the directory where the JUnit XML reports should be saved. Default is empty, which doesn't generate these reports.")
type e2eService struct {
etcdCmd *exec.Cmd
etcdCmd *killCmd
etcdDataDir string
apiServerCmd *exec.Cmd
kubeletCmd *exec.Cmd
apiServerCmd *killCmd
kubeletCmd *killCmd
kubeletStaticPodDir string
nodeName string
logFiles map[string]logFileData
}
// logFileData describes where one log of interest can be collected from.
type logFileData struct {
// files lists candidate on-disk paths of the log (e.g. "/var/log/kern.log").
files []string
// journalctlCommand holds the journalctl arguments that select the same log
// on journald-based machines (e.g. "-k", or "-u", "docker").
journalctlCommand []string
}
func newE2eService(nodeName string) *e2eService {
return &e2eService{nodeName: nodeName}
// Special log files that need to be collected for additional debugging.
var logFiles = map[string]logFileData{
"kern.log": {[]string{"/var/log/kern.log"}, []string{"-k"}},
"docker.log": {[]string{"/var/log/docker.log", "/var/log/upstart/docker.log"}, []string{"-u", "docker"}},
}
return &e2eService{nodeName: nodeName, logFiles: logFiles}
}
func (es *e2eService) start() error {
@ -81,22 +94,12 @@ func (es *e2eService) start() error {
// Get logs of interest either via journalctl or by creating sym links.
// Since we scp files from the remote directory, symlinks will be treated as normal files and file contents will be copied over.
func (es *e2eService) getLogFiles() {
// Special log files that need to be collected for additional debugging.
type logFileData struct {
files []string
journalctlCommand []string
}
var logFiles = map[string]logFileData{
"kern.log": {[]string{"/var/log/kern.log"}, []string{"-k"}},
"docker.log": {[]string{"/var/log/docker.log", "/var/log/upstart/docker.log"}, []string{"-u", "docker"}},
}
// Nothing to do if report dir is not specified.
if *reportDir == "" {
return
}
journaldFound := isJournaldAvailable()
for targetFileName, logFileData := range logFiles {
for targetFileName, logFileData := range es.logFiles {
targetLink := path.Join(*reportDir, targetFileName)
if journaldFound {
// Skip log files that do not have an equivalent in journald based machines.
@ -144,7 +147,7 @@ func isJournaldAvailable() bool {
}
func (es *e2eService) stop() {
if err := es.stopService("kubelet", es.kubeletCmd); err != nil {
if err := es.stopService(es.kubeletCmd); err != nil {
glog.Errorf("Failed to stop kubelet: %v", err)
}
if es.kubeletStaticPodDir != "" {
@ -153,10 +156,10 @@ func (es *e2eService) stop() {
glog.Errorf("Failed to delete kubelet static pod directory %s.\n%v", es.kubeletStaticPodDir, err)
}
}
if err := es.stopService("kube-apiserver", es.apiServerCmd); err != nil {
if err := es.stopService(es.apiServerCmd); err != nil {
glog.Errorf("Failed to stop kube-apiserver: %v", err)
}
if err := es.stopService("etcd", es.etcdCmd); err != nil {
if err := es.stopService(es.etcdCmd); err != nil {
glog.Errorf("Failed to stop etcd: %v", err)
}
if es.etcdDataDir != "" {
@ -167,7 +170,7 @@ func (es *e2eService) stop() {
}
}
func (es *e2eService) startEtcd() (*exec.Cmd, error) {
func (es *e2eService) startEtcd() (*killCmd, error) {
dataDir, err := ioutil.TempDir("", "node-e2e")
if err != nil {
return nil, err
@ -181,10 +184,10 @@ func (es *e2eService) startEtcd() (*exec.Cmd, error) {
"http://127.0.0.1:4001/v2/keys/", // Trailing slash is required,
cmd,
"etcd.log")
return cmd, es.startServer(hcc)
return &killCmd{name: "etcd", cmd: cmd}, es.startServer(hcc)
}
func (es *e2eService) startApiServer() (*exec.Cmd, error) {
func (es *e2eService) startApiServer() (*killCmd, error) {
cmd := exec.Command("sudo", getApiServerBin(),
"--etcd-servers", "http://127.0.0.1:4001",
"--insecure-bind-address", "0.0.0.0",
@ -197,16 +200,32 @@ func (es *e2eService) startApiServer() (*exec.Cmd, error) {
"http://127.0.0.1:8080/healthz",
cmd,
"kube-apiserver.log")
return cmd, es.startServer(hcc)
return &killCmd{name: "kube-apiserver", cmd: cmd}, es.startServer(hcc)
}
func (es *e2eService) startKubeletServer() (*exec.Cmd, error) {
func (es *e2eService) startKubeletServer() (*killCmd, error) {
dataDir, err := ioutil.TempDir("", "node-e2e-pod")
if err != nil {
return nil, err
}
es.kubeletStaticPodDir = dataDir
cmd := exec.Command("sudo", getKubeletServerBin(),
var killOverride *exec.Cmd
cmdArgs := []string{}
if systemdRun, err := exec.LookPath("systemd-run"); err == nil {
// On systemd hosts, detection of a service / unit works reliably, while
// detection of a process started from an ssh session does not.
// Since the kubelet will typically be run as a service, it also makes more
// sense to test it that way.
unitName := fmt.Sprintf("kubelet-%d.service", rand.Int31())
cmdArgs = append(cmdArgs, systemdRun, "--unit="+unitName, getKubeletServerBin())
killOverride = exec.Command("sudo", "systemctl", "kill", unitName)
es.logFiles["kubelet.log"] = logFileData{
journalctlCommand: []string{"-u", unitName},
}
} else {
cmdArgs = append(cmdArgs, getKubeletServerBin())
}
cmdArgs = append(cmdArgs,
"--api-servers", "http://127.0.0.1:8080",
"--address", "0.0.0.0",
"--port", "10250",
@ -218,11 +237,12 @@ func (es *e2eService) startKubeletServer() (*exec.Cmd, error) {
"--file-check-frequency", "10s", // Check file frequently so tests won't wait too long
"--v", "8", "--logtostderr",
)
cmd := exec.Command("sudo", cmdArgs...)
hcc := newHealthCheckCommand(
"http://127.0.0.1:10255/healthz",
cmd,
"kubelet.log")
return cmd, es.startServer(hcc)
return &killCmd{name: "kubelet", cmd: cmd, override: killOverride}, es.startServer(hcc)
}
func (es *e2eService) startServer(cmd *healthCheckCommand) error {
@ -279,7 +299,27 @@ func (es *e2eService) startServer(cmd *healthCheckCommand) error {
return fmt.Errorf("Timeout waiting for service %s", cmd)
}
func (es *e2eService) stopService(name string, cmd *exec.Cmd) error {
func (es *e2eService) stopService(cmd *killCmd) error {
return cmd.Kill()
}
// killCmd describes how to stop a command that was started as a service.
//
// By default the pid of cmd is looked up and an attempt is made to kill it.
// If override is set, Kill runs override instead and cmd is left untouched.
type killCmd struct {
// name identifies the service; it is only used for logging.
name string
// cmd is the command whose process should be killed.
cmd *exec.Cmd
// override, when non-nil, is the command Kill runs in place of killing cmd.
override *exec.Cmd
}
func (k *killCmd) Kill() error {
if k.override != nil {
return k.override.Run()
}
name := k.name
cmd := k.cmd
if cmd == nil || cmd.Process == nil {
glog.V(2).Infof("%s not running", name)
return nil

View File

@ -41,6 +41,11 @@ EOF
sudo systemctl daemon-reload
fi
# For CoreOS, disable updates: when `systemctl status update-engine` succeeds,
# mask both update-engine and locksmithd.
# The command's exit status is tested directly; wrapping it in $(...) would
# instead try to execute its (empty) output as a command.
if sudo systemctl status update-engine &>/dev/null; then
  sudo systemctl mask update-engine locksmithd
fi
# Fixup sudoers require tty
sudo grep -q "# Defaults requiretty" /etc/sudoers
if [ $? -ne 0 ] ; then

View File

@ -3,10 +3,7 @@ GCE_HOSTS=
# To copy an image between projects:
# `gcloud compute --project <to-project> disks create <image name> --image=https://www.googleapis.com/compute/v1/projects/<from-project>/global/images/<image-name>`
# `gcloud compute --project <to-project> images create <image-name> --source-disk=<image-name>`
#
# Testing disabled on the following images:
# e2e-node-coreos-stable20160531-image - Github Issue #26903
GCE_IMAGES=e2e-node-ubuntu-trusty-docker10-image,e2e-node-ubuntu-trusty-docker9-image,e2e-node-ubuntu-trusty-docker8-image,e2e-node-containervm-v20160321-image
GCE_IMAGES=e2e-node-ubuntu-trusty-docker10-image,e2e-node-ubuntu-trusty-docker9-image,e2e-node-ubuntu-trusty-docker8-image,e2e-node-coreos-stable20160531-image,e2e-node-containervm-v20160321-image
GCE_ZONE=us-central1-f
GCE_PROJECT=kubernetes-jenkins
GCE_IMAGE_PROJECT=kubernetes-jenkins

View File

@ -3,10 +3,7 @@ GCE_HOSTS=
# To copy an image between projects:
# `gcloud compute --project <to-project> disks create <image name> --image=https://www.googleapis.com/compute/v1/projects/<from-project>/global/images/<image-name>`
# `gcloud compute --project <to-project> images create <image-name> --source-disk=<image-name>`
#
# Testing disabled on the following images:
# e2e-node-coreos-stable20160531-image - Github Issue #26903
GCE_IMAGES=e2e-node-ubuntu-trusty-docker10-image,e2e-node-ubuntu-trusty-docker9-image,e2e-node-ubuntu-trusty-docker8-image,e2e-node-containervm-v20160321-image
GCE_IMAGES=e2e-node-ubuntu-trusty-docker10-image,e2e-node-ubuntu-trusty-docker9-image,e2e-node-ubuntu-trusty-docker8-image,e2e-node-coreos-stable20160531-image,e2e-node-containervm-v20160321-image
GCE_ZONE=us-central1-f
GCE_PROJECT=kubernetes-jenkins-pull
GCE_IMAGE_PROJECT=kubernetes-jenkins-pull