mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-08-07 11:13:48 +00:00
Merge pull request #27167 from euank/fix-node-e2e-coreos-kubelet-cgroup-detection
Automatic merge from submit-queue. Fix node e2e coreos kubelet cgroup detection. Fixes #26979, #26431. The root issue, as best I can tell, is that cgroup detection does not work when the kubelet is started under an ssh session and the systemd `*Accounting` variables are set. I added additional logging and noted some differences in the cgroup slice names between those that cadvisor returns and those that the kubelet detects for itself. This difference does not occur if the kubelet is properly running under a unit. That environment is also a more common and sane environment. See also the discussion in #26903. cc @derekwaynecarr @vishh @pwittrock
This commit is contained in:
commit
d9f788532f
@ -20,6 +20,7 @@ import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"math/rand"
|
||||
"net/http"
|
||||
"os"
|
||||
"os/exec"
|
||||
@ -37,16 +38,28 @@ var serverStartTimeout = flag.Duration("server-start-timeout", time.Second*120,
|
||||
var reportDir = flag.String("report-dir", "", "Path to the directory where the JUnit XML reports should be saved. Default is empty, which doesn't generate these reports.")
|
||||
|
||||
type e2eService struct {
|
||||
etcdCmd *exec.Cmd
|
||||
etcdCmd *killCmd
|
||||
etcdDataDir string
|
||||
apiServerCmd *exec.Cmd
|
||||
kubeletCmd *exec.Cmd
|
||||
apiServerCmd *killCmd
|
||||
kubeletCmd *killCmd
|
||||
kubeletStaticPodDir string
|
||||
nodeName string
|
||||
logFiles map[string]logFileData
|
||||
}
|
||||
|
||||
type logFileData struct {
|
||||
files []string
|
||||
journalctlCommand []string
|
||||
}
|
||||
|
||||
func newE2eService(nodeName string) *e2eService {
|
||||
return &e2eService{nodeName: nodeName}
|
||||
// Special log files that need to be collected for additional debugging.
|
||||
var logFiles = map[string]logFileData{
|
||||
"kern.log": {[]string{"/var/log/kern.log"}, []string{"-k"}},
|
||||
"docker.log": {[]string{"/var/log/docker.log", "/var/log/upstart/docker.log"}, []string{"-u", "docker"}},
|
||||
}
|
||||
|
||||
return &e2eService{nodeName: nodeName, logFiles: logFiles}
|
||||
}
|
||||
|
||||
func (es *e2eService) start() error {
|
||||
@ -81,22 +94,12 @@ func (es *e2eService) start() error {
|
||||
// Get logs of interest either via journalctl or by creating sym links.
|
||||
// Since we scp files from the remote directory, symlinks will be treated as normal files and file contents will be copied over.
|
||||
func (es *e2eService) getLogFiles() {
|
||||
// Special log files that need to be collected for additional debugging.
|
||||
type logFileData struct {
|
||||
files []string
|
||||
journalctlCommand []string
|
||||
}
|
||||
var logFiles = map[string]logFileData{
|
||||
"kern.log": {[]string{"/var/log/kern.log"}, []string{"-k"}},
|
||||
"docker.log": {[]string{"/var/log/docker.log", "/var/log/upstart/docker.log"}, []string{"-u", "docker"}},
|
||||
}
|
||||
|
||||
// Nothing to do if report dir is not specified.
|
||||
if *reportDir == "" {
|
||||
return
|
||||
}
|
||||
journaldFound := isJournaldAvailable()
|
||||
for targetFileName, logFileData := range logFiles {
|
||||
for targetFileName, logFileData := range es.logFiles {
|
||||
targetLink := path.Join(*reportDir, targetFileName)
|
||||
if journaldFound {
|
||||
// Skip log files that do not have an equivalent in journald based machines.
|
||||
@ -144,7 +147,7 @@ func isJournaldAvailable() bool {
|
||||
}
|
||||
|
||||
func (es *e2eService) stop() {
|
||||
if err := es.stopService("kubelet", es.kubeletCmd); err != nil {
|
||||
if err := es.stopService(es.kubeletCmd); err != nil {
|
||||
glog.Errorf("Failed to stop kubelet: %v", err)
|
||||
}
|
||||
if es.kubeletStaticPodDir != "" {
|
||||
@ -153,10 +156,10 @@ func (es *e2eService) stop() {
|
||||
glog.Errorf("Failed to delete kubelet static pod directory %s.\n%v", es.kubeletStaticPodDir, err)
|
||||
}
|
||||
}
|
||||
if err := es.stopService("kube-apiserver", es.apiServerCmd); err != nil {
|
||||
if err := es.stopService(es.apiServerCmd); err != nil {
|
||||
glog.Errorf("Failed to stop kube-apiserver: %v", err)
|
||||
}
|
||||
if err := es.stopService("etcd", es.etcdCmd); err != nil {
|
||||
if err := es.stopService(es.etcdCmd); err != nil {
|
||||
glog.Errorf("Failed to stop etcd: %v", err)
|
||||
}
|
||||
if es.etcdDataDir != "" {
|
||||
@ -167,7 +170,7 @@ func (es *e2eService) stop() {
|
||||
}
|
||||
}
|
||||
|
||||
func (es *e2eService) startEtcd() (*exec.Cmd, error) {
|
||||
func (es *e2eService) startEtcd() (*killCmd, error) {
|
||||
dataDir, err := ioutil.TempDir("", "node-e2e")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
@ -181,10 +184,10 @@ func (es *e2eService) startEtcd() (*exec.Cmd, error) {
|
||||
"http://127.0.0.1:4001/v2/keys/", // Trailing slash is required,
|
||||
cmd,
|
||||
"etcd.log")
|
||||
return cmd, es.startServer(hcc)
|
||||
return &killCmd{name: "etcd", cmd: cmd}, es.startServer(hcc)
|
||||
}
|
||||
|
||||
func (es *e2eService) startApiServer() (*exec.Cmd, error) {
|
||||
func (es *e2eService) startApiServer() (*killCmd, error) {
|
||||
cmd := exec.Command("sudo", getApiServerBin(),
|
||||
"--etcd-servers", "http://127.0.0.1:4001",
|
||||
"--insecure-bind-address", "0.0.0.0",
|
||||
@ -197,16 +200,32 @@ func (es *e2eService) startApiServer() (*exec.Cmd, error) {
|
||||
"http://127.0.0.1:8080/healthz",
|
||||
cmd,
|
||||
"kube-apiserver.log")
|
||||
return cmd, es.startServer(hcc)
|
||||
return &killCmd{name: "kube-apiserver", cmd: cmd}, es.startServer(hcc)
|
||||
}
|
||||
|
||||
func (es *e2eService) startKubeletServer() (*exec.Cmd, error) {
|
||||
func (es *e2eService) startKubeletServer() (*killCmd, error) {
|
||||
dataDir, err := ioutil.TempDir("", "node-e2e-pod")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
es.kubeletStaticPodDir = dataDir
|
||||
cmd := exec.Command("sudo", getKubeletServerBin(),
|
||||
var killOverride *exec.Cmd
|
||||
cmdArgs := []string{}
|
||||
if systemdRun, err := exec.LookPath("systemd-run"); err == nil {
|
||||
// On systemd systems, detection of a service / unit works reliably while
|
||||
// detection of a process started from an ssh session does not work.
|
||||
// Since kubelet will typically be run as a service it also makes more
|
||||
// sense to test it that way.
|
||||
unitName := fmt.Sprintf("kubelet-%d.service", rand.Int31())
|
||||
cmdArgs = append(cmdArgs, systemdRun, "--unit="+unitName, getKubeletServerBin())
|
||||
killOverride = exec.Command("sudo", "systemctl", "kill", unitName)
|
||||
es.logFiles["kubelet.log"] = logFileData{
|
||||
journalctlCommand: []string{"-u", unitName},
|
||||
}
|
||||
} else {
|
||||
cmdArgs = append(cmdArgs, getKubeletServerBin())
|
||||
}
|
||||
cmdArgs = append(cmdArgs,
|
||||
"--api-servers", "http://127.0.0.1:8080",
|
||||
"--address", "0.0.0.0",
|
||||
"--port", "10250",
|
||||
@ -218,11 +237,12 @@ func (es *e2eService) startKubeletServer() (*exec.Cmd, error) {
|
||||
"--file-check-frequency", "10s", // Check file frequently so tests won't wait too long
|
||||
"--v", "8", "--logtostderr",
|
||||
)
|
||||
cmd := exec.Command("sudo", cmdArgs...)
|
||||
hcc := newHealthCheckCommand(
|
||||
"http://127.0.0.1:10255/healthz",
|
||||
cmd,
|
||||
"kubelet.log")
|
||||
return cmd, es.startServer(hcc)
|
||||
return &killCmd{name: "kubelet", cmd: cmd, override: killOverride}, es.startServer(hcc)
|
||||
}
|
||||
|
||||
func (es *e2eService) startServer(cmd *healthCheckCommand) error {
|
||||
@ -279,7 +299,27 @@ func (es *e2eService) startServer(cmd *healthCheckCommand) error {
|
||||
return fmt.Errorf("Timeout waiting for service %s", cmd)
|
||||
}
|
||||
|
||||
func (es *e2eService) stopService(name string, cmd *exec.Cmd) error {
|
||||
func (es *e2eService) stopService(cmd *killCmd) error {
|
||||
return cmd.Kill()
|
||||
}
|
||||
|
||||
// killCmd is a struct to kill a given cmd. The cmd member specifies a command
|
||||
// to find the pid of and attempt to kill.
|
||||
// If the override field is set, that will be used instead to kill the command.
|
||||
// name is only used for logging.
|
||||
type killCmd struct {
|
||||
name string
|
||||
cmd *exec.Cmd
|
||||
override *exec.Cmd
|
||||
}
|
||||
|
||||
func (k *killCmd) Kill() error {
|
||||
if k.override != nil {
|
||||
return k.override.Run()
|
||||
}
|
||||
name := k.name
|
||||
cmd := k.cmd
|
||||
|
||||
if cmd == nil || cmd.Process == nil {
|
||||
glog.V(2).Infof("%s not running", name)
|
||||
return nil
|
||||
|
@ -41,6 +41,11 @@ EOF
|
||||
sudo systemctl daemon-reload
|
||||
fi
|
||||
|
||||
# For coreos, disable updates
|
||||
if $(sudo systemctl status update-engine &>/dev/null); then
|
||||
sudo systemctl mask update-engine locksmithd
|
||||
fi
|
||||
|
||||
# Fixup sudoers require tty
|
||||
sudo grep -q "# Defaults requiretty" /etc/sudoers
|
||||
if [ $? -ne 0 ] ; then
|
||||
|
@ -3,10 +3,7 @@ GCE_HOSTS=
|
||||
# To copy an image between projects:
|
||||
# `gcloud compute --project <to-project> disks create <image name> --image=https://www.googleapis.com/compute/v1/projects/<from-project>/global/images/<image-name>`
|
||||
# `gcloud compute --project <to-project> images create <image-name> --source-disk=<image-name>`
|
||||
#
|
||||
# Testing disabled on the following images:
|
||||
# e2e-node-coreos-stable20160531-image - Github Issue #26903
|
||||
GCE_IMAGES=e2e-node-ubuntu-trusty-docker10-image,e2e-node-ubuntu-trusty-docker9-image,e2e-node-ubuntu-trusty-docker8-image,e2e-node-containervm-v20160321-image
|
||||
GCE_IMAGES=e2e-node-ubuntu-trusty-docker10-image,e2e-node-ubuntu-trusty-docker9-image,e2e-node-ubuntu-trusty-docker8-image,e2e-node-coreos-stable20160531-image,e2e-node-containervm-v20160321-image
|
||||
GCE_ZONE=us-central1-f
|
||||
GCE_PROJECT=kubernetes-jenkins
|
||||
GCE_IMAGE_PROJECT=kubernetes-jenkins
|
||||
|
@ -3,10 +3,7 @@ GCE_HOSTS=
|
||||
# To copy an image between projects:
|
||||
# `gcloud compute --project <to-project> disks create <image name> --image=https://www.googleapis.com/compute/v1/projects/<from-project>/global/images/<image-name>`
|
||||
# `gcloud compute --project <to-project> images create <image-name> --source-disk=<image-name>`
|
||||
#
|
||||
# Testing disabled on the following images:
|
||||
# e2e-node-coreos-stable20160531-image - Github Issue #26903
|
||||
GCE_IMAGES=e2e-node-ubuntu-trusty-docker10-image,e2e-node-ubuntu-trusty-docker9-image,e2e-node-ubuntu-trusty-docker8-image,e2e-node-containervm-v20160321-image
|
||||
GCE_IMAGES=e2e-node-ubuntu-trusty-docker10-image,e2e-node-ubuntu-trusty-docker9-image,e2e-node-ubuntu-trusty-docker8-image,e2e-node-coreos-stable20160531-image,e2e-node-containervm-v20160321-image
|
||||
GCE_ZONE=us-central1-f
|
||||
GCE_PROJECT=kubernetes-jenkins-pull
|
||||
GCE_IMAGE_PROJECT=kubernetes-jenkins-pull
|
||||
|
Loading…
Reference in New Issue
Block a user