diff --git a/cluster/log-dump.sh b/cluster/log-dump.sh
new file mode 100755
index 00000000000..630bd3f4895
--- /dev/null
+++ b/cluster/log-dump.sh
@@ -0,0 +1,95 @@
+#!/bin/bash
+
+# Copyright 2016 The Kubernetes Authors All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Call this to dump all master and node logs into the folder specified in $1
+# (defaults to _artifacts). Only works if the provider supports SSH.
+
+set -o errexit
+set -o nounset
+set -o pipefail
+
+KUBE_ROOT=$(dirname "${BASH_SOURCE}")/..
+: "${KUBE_CONFIG_FILE:="config-test.sh"}"
+
+source "${KUBE_ROOT}/cluster/kube-env.sh"
+source "${KUBE_ROOT}/cluster/kube-util.sh"
+
+readonly report_dir="${1:-_artifacts}"
+echo "Dumping master and node logs to ${report_dir}"
+
+# Saves a single output of running a given command ($2) on a given node ($1)
+# into a given local file ($3). Does not fail if the ssh command fails for any
+# reason, just prints an error to stderr.
+function save-log() {
+  local -r node_name="${1}"
+  local -r cmd="${2}"
+  local -r output_file="${3}"
+  if ! ssh-to-node "${node_name}" "${cmd}" > "${output_file}"; then
+    echo "${cmd} failed for ${node_name}" >&2
+  fi
+}
+
+# Saves logs common to master and nodes. The node name is in $1 and the
+# directory/name prefix is in $2. Assumes KUBERNETES_PROVIDER is set.
+function save-common-logs() {
+  local -r node_name="${1}"
+  local -r prefix="${2}"
+  echo "Dumping logs for ${node_name}"
+  save-log "${node_name}" "cat /var/log/kern.log" "${prefix}-kern.log"
+  save-log "${node_name}" "cat /var/log/docker.log" "${prefix}-docker.log"
+  if [[ "${KUBERNETES_PROVIDER}" == "gce" ]]; then
+    save-log "${node_name}" "cat /var/log/startupscript.log" "${prefix}-startupscript.log"
+  fi
+  if ssh-to-node "${node_name}" "sudo systemctl status kubelet.service" &> /dev/null; then
+    save-log "${node_name}" "sudo journalctl --output=cat -u kubelet.service" "${prefix}-kubelet.log"
+  else
+    save-log "${node_name}" "cat /var/log/kubelet.log" "${prefix}-kubelet.log"
+    save-log "${node_name}" "cat /var/log/supervisor/supervisord.log" "${prefix}-supervisord.log"
+    save-log "${node_name}" "cat /var/log/supervisor/kubelet-stdout.log" "${prefix}-supervisord-kubelet-stdout.log"
+    save-log "${node_name}" "cat /var/log/supervisor/kubelet-stderr.log" "${prefix}-supervisord-kubelet-stderr.log"
+  fi
+}
+
+readonly master_ssh_supported_providers="gce aws kubemark"
+readonly node_ssh_supported_providers="gce gke aws"
+
+if [[ ! "${master_ssh_supported_providers}" =~ "${KUBERNETES_PROVIDER}" ]]; then
+  echo "Master SSH not supported for ${KUBERNETES_PROVIDER}"
+elif ! detect-master &> /dev/null; then
+  echo "Master not detected. Is the cluster up?"
+else
+  echo "Master Name: ${MASTER_NAME}"
+  readonly master_prefix="${report_dir}/${MASTER_NAME}"
+  save-log "${MASTER_NAME}" "cat /var/log/kube-apiserver.log" "${master_prefix}-kube-apiserver.log"
+  save-log "${MASTER_NAME}" "cat /var/log/kube-scheduler.log" "${master_prefix}-kube-scheduler.log"
+  save-log "${MASTER_NAME}" "cat /var/log/kube-controller-manager.log" "${master_prefix}-kube-controller-manager.log"
+  save-log "${MASTER_NAME}" "cat /var/log/etcd.log" "${master_prefix}-kube-etcd.log"
+  save-common-logs "${MASTER_NAME}" "${master_prefix}"
+fi
+
+detect-node-names &> /dev/null
+if [[ ! "${node_ssh_supported_providers}" =~ "${KUBERNETES_PROVIDER}" ]]; then
+  echo "Node SSH not supported for ${KUBERNETES_PROVIDER}"
+elif [[ "${#NODE_NAMES[@]}" -eq 0 ]]; then
+  echo "Nodes not detected. Is the cluster up?"
+else
+  echo "Node Names: ${NODE_NAMES[*]}"
+  for node_name in "${NODE_NAMES[@]}"; do
+    node_prefix="${report_dir}/${node_name}"
+    save-log "${node_name}" "cat /var/log/kube-proxy.log" "${node_prefix}-kube-proxy.log"
+    save-common-logs "${node_name}" "${node_prefix}"
+  done
+fi
diff --git a/hack/jenkins/e2e-runner.sh b/hack/jenkins/e2e-runner.sh
index 5834451df0b..997ea60c0a5 100755
--- a/hack/jenkins/e2e-runner.sh
+++ b/hack/jenkins/e2e-runner.sh
@@ -193,7 +193,15 @@ if [[ "${gcp_list_resources}" == "true" ]]; then
     ${gcp_list_resources_script} > "${gcp_resources_before}"
 fi
 if [[ "${E2E_UP,,}" == "true" ]]; then
-    go run ./hack/e2e.go ${E2E_OPT:-} -v --up
+    # We want to try to gather logs even if kube-up fails, so collect the
+    # result here and fail after dumping logs if it's nonzero.
+    go run ./hack/e2e.go ${E2E_OPT:-} -v --up && up_result="$?" || up_result="$?"
+    if [[ "${up_result}" -ne 0 ]]; then
+        if [[ -x "cluster/log-dump.sh" ]]; then
+            ./cluster/log-dump.sh "${ARTIFACTS}"
+        fi
+        exit "${up_result}"
+    fi
     go run ./hack/e2e.go -v --ctl="version --match-server-version=false"
     if [[ "${gcp_list_resources}" == "true" ]]; then
         ${gcp_list_resources_script} > "${gcp_resources_cluster_up}"
diff --git a/hack/jenkins/job-configs/global.yaml b/hack/jenkins/job-configs/global.yaml
index 179f50213a0..9fc4f226fb9 100644
--- a/hack/jenkins/job-configs/global.yaml
+++ b/hack/jenkins/job-configs/global.yaml
@@ -11,9 +11,9 @@
 - publisher:
     name: gcs-uploader
     publishers:
-        # Use our script for build artifacts, since it's more flexible.
         - postbuildscript:
             builders:
+                # Use our script for build artifacts, since it's more flexible.
                 - shell: |
                     if [[ -x ./hack/jenkins/upload-to-gcs.sh ]]; then
                         ./hack/jenkins/upload-to-gcs.sh
diff --git a/hack/verify-flags/exceptions.txt b/hack/verify-flags/exceptions.txt
index d7e41963144..421b1e258c6 100644
--- a/hack/verify-flags/exceptions.txt
+++ b/hack/verify-flags/exceptions.txt
@@ -5,7 +5,6 @@ cluster/aws/templates/configure-vm-aws.sh: # We set the hostname_override to th
 cluster/aws/templates/configure-vm-aws.sh: api_servers: '${API_SERVERS}'
 cluster/aws/templates/configure-vm-aws.sh: env-to-grains "hostname_override"
 cluster/aws/templates/configure-vm-aws.sh: env-to-grains "runtime_config"
-cluster/aws/templates/salt-minion.sh:# We set the hostname_override to the full EC2 private dns name
 cluster/centos/util.sh: local node_ip=${node#*@}
 cluster/gce/configure-vm.sh: advertise_address: '${EXTERNAL_IP}'
 cluster/gce/configure-vm.sh: api_servers: '${KUBERNETES_MASTER_NAME}'
@@ -50,6 +49,10 @@ cluster/juju/charms/trusty/kubernetes/hooks/network-relation-changed: for k i
 cluster/juju/charms/trusty/kubernetes/hooks/network-relation-changed: if api_servers:
 cluster/lib/logging.sh: local source_file=${BASH_SOURCE[$frame_no]}
 cluster/lib/logging.sh: local source_file=${BASH_SOURCE[$stack_skip]}
+cluster/log-dump.sh: for node_name in "${NODE_NAMES[@]}"; do
+cluster/log-dump.sh: local -r node_name="${1}"
+cluster/log-dump.sh: local -r node_name="${1}"
+cluster/log-dump.sh:readonly report_dir="${1:-_artifacts}"
 cluster/mesos/docker/km/build.sh: km_path=$(find-binary km darwin/amd64)
 cluster/rackspace/util.sh: local node_ip=$(nova show --minimal ${NODE_NAMES[$i]} \
 cluster/saltbase/salt/kube-addons/kube-addons.sh:# Create admission_control objects if defined before any other addon services. If the limits
@@ -81,7 +84,6 @@ docs/getting-started-guides/coreos/azure/lib/deployment_logic/kubernetes.js: re
 docs/getting-started-guides/coreos/azure/lib/deployment_logic/kubernetes.js: return cloud_config.process_template(input_file, output_file, function(data) {
 docs/getting-started-guides/coreos/azure/lib/deployment_logic/kubernetes.js: var write_files_extra = cloud_config.write_files_from('addons', '/etc/kubernetes/addons');
 docs/getting-started-guides/coreos/azure/lib/deployment_logic/kubernetes.js:var cloud_config = require('../cloud_config.js');
-docs/getting-started-guides/docker-multinode/skydns-rc.yaml.in: - -kube_master_url=http://{kube_server_url}:8080
 examples/cluster-dns/images/frontend/client.py: service_address = socket.gethostbyname(hostname)
 examples/vitess/env.sh: node_ip=$(get_node_ip)
 hack/jenkins/job-builder-image/Dockerfile:# JJB configuration lives in /etc/jenkins_jobs/jenkins_jobs.ini
@@ -91,11 +93,10 @@ hack/jenkins/update-jobs.sh: # jenkins_jobs.ini contains administrative credent
 hack/jenkins/update-jobs.sh: if [[ -e jenkins_jobs.ini ]]; then
 hack/local-up-cluster.sh: runtime_config="--runtime-config=${RUNTIME_CONFIG}"
 hack/local-up-cluster.sh: runtime_config=""
+pkg/kubelet/network/hairpin/hairpin.go: hairpinModeRelativePath = "hairpin_mode"
 pkg/kubelet/qos/memory_policy_test.go: t.Errorf("oom_score_adj should be between %d and %d, but was %d", test.lowOOMScoreAdj, test.highOOMScoreAdj, oomScoreAdj)
 pkg/kubelet/qos/memory_policy_test.go: highOOMScoreAdj int // The min oom_score_adj score the container should be assigned.
 pkg/kubelet/qos/memory_policy_test.go: lowOOMScoreAdj int // The max oom_score_adj score the container should be assigned.
-pkg/util/oom/oom_linux.go: err = fmt.Errorf("failed to read oom_score_adj: %v", readErr)
-pkg/util/oom/oom_linux.go: err = fmt.Errorf("failed to set oom_score_adj to %d: %v", oomScoreAdj, writeErr)
 pkg/util/oom/oom_linux.go: return fmt.Errorf("invalid PID %d specified for oom_score_adj", pid)
 pkg/util/oom/oom_linux.go: oomScoreAdjPath := path.Join("/proc", pidStr, "oom_score_adj")
 pkg/util/oom/oom_linux.go:// Writes 'value' to /proc//oom_score_adj for all processes in cgroup cgroupName.
@@ -110,4 +111,3 @@ test/e2e/host_path.go: fmt.Sprintf("--retry_time=%d", retryDuration),
 test/images/mount-tester/mt.go: flag.BoolVar(&breakOnExpectedContent, "break_on_expected_content", true, "Break out of loop on expected content, (use with --file_content_in_loop flag only)")
 test/images/mount-tester/mt.go: flag.IntVar(&retryDuration, "retry_time", 180, "Retry time during the loop")
 test/images/mount-tester/mt.go: flag.StringVar(&readFileContentInLoopPath, "file_content_in_loop", "", "Path to read the file content in loop from")
-pkg/kubelet/network/hairpin/hairpin.go: hairpinModeRelativePath = "hairpin_mode"