From d94a2b39d9b638899306c823bbcca6497102f1c2 Mon Sep 17 00:00:00 2001 From: Lantao Liu Date: Wed, 2 May 2018 01:41:55 -0700 Subject: [PATCH 1/3] Install and use crictl in gce kube-up.sh Signed-off-by: Lantao Liu --- cluster/gce/gci/configure-helper.sh | 19 ++++++---- cluster/gce/gci/configure.sh | 33 +++++++++++++++++ cluster/gce/gci/health-monitor.sh | 35 +++++++++++++------ cluster/gce/gci/master.yaml | 8 ++--- cluster/gce/gci/node.yaml | 8 ++--- .../gce/manifests/e2e-image-puller.manifest | 20 +++++++---- cluster/gce/util.sh | 1 - 7 files changed, 92 insertions(+), 32 deletions(-) diff --git a/cluster/gce/gci/configure-helper.sh b/cluster/gce/gci/configure-helper.sh index ad12bc637f1..be561cab636 100644 --- a/cluster/gce/gci/configure-helper.sh +++ b/cluster/gce/gci/configure-helper.sh @@ -2089,10 +2089,16 @@ function start-fluentd-resource-update { wait-for-apiserver-and-update-fluentd & } -# Update {{ container-runtime }} with actual container runtime name. +# Update {{ container-runtime }} with actual container runtime name, +# and {{ container-runtime-endpoint }} with actual container runtime +# endpoint. function update-container-runtime { - local -r configmap_yaml="$1" - sed -i -e "s@{{ *container_runtime *}}@${CONTAINER_RUNTIME_NAME:-docker}@g" "${configmap_yaml}" + local -r file="$1" + local -r container_runtime_endpoint="${CONTAINER_RUNTIME_ENDPOINT:-unix:///var/run/dockershim.sock}" + sed -i \ + -e "s@{{ *container_runtime *}}@${CONTAINER_RUNTIME_NAME:-docker}@g" \ + -e "s@{{ *container_runtime_endpoint *}}@${container_runtime_endpoint#unix://}@g" \ + "${file}" } # Remove configuration in yaml file if node journal is not enabled. @@ -2375,8 +2381,9 @@ EOF # Starts an image-puller - used in test clusters. 
function start-image-puller { echo "Start image-puller" - cp "${KUBE_HOME}/kube-manifests/kubernetes/gci-trusty/e2e-image-puller.manifest" \ - /etc/kubernetes/manifests/ + local -r e2e_image_puller_manifest="${KUBE_HOME}/kube-manifests/kubernetes/gci-trusty/e2e-image-puller.manifest" + update-container-runtime "${e2e_image_puller_manifest}" + cp "${e2e_image_puller_manifest}" /etc/kubernetes/manifests/ } # Setups manifests for ingress controller and gce-specific policies for service controller. @@ -2590,4 +2597,4 @@ if [[ "$#" -eq 1 && "${1}" == "--source-only" ]]; then : else main "${@}" -fi \ No newline at end of file +fi diff --git a/cluster/gce/gci/configure.sh b/cluster/gce/gci/configure.sh index 14cc0a1e93e..ef40ad63d5f 100644 --- a/cluster/gce/gci/configure.sh +++ b/cluster/gce/gci/configure.sh @@ -28,6 +28,8 @@ DEFAULT_CNI_VERSION="v0.6.0" DEFAULT_CNI_SHA1="d595d3ded6499a64e8dac02466e2f5f2ce257c9f" DEFAULT_NPD_VERSION="v0.4.1" DEFAULT_NPD_SHA1="a57a3fe64cab8a18ec654f5cef0aec59dae62568" +DEFAULT_CRICTL_VERSION="v1.0.0-beta.1" +DEFAULT_CRICTL_SHA1="6816982ea1b83506945ce02949199171fee17b0b" DEFAULT_MOUNTER_TAR_SHA="8003b798cf33c7f91320cd6ee5cec4fa22244571" ### @@ -234,6 +236,34 @@ function install-cni-binaries { rm -f "${KUBE_HOME}/${cni_tar}" } +# Install crictl binary. 
+function install-crictl { + if [[ -n "${CRICTL_VERSION:-}" ]]; then + local -r crictl_version="${CRICTL_VERSION}" + local -r crictl_sha1="${CRICTL_TAR_HASH}" + else + local -r crictl_version="${DEFAULT_CRICTL_VERSION}" + local -r crictl_sha1="${DEFAULT_CRICTL_SHA1}" + fi + local -r crictl="crictl-${crictl_version}-linux-amd64" + + if is-preloaded "${crictl}" "${crictl_sha1}"; then + echo "crictl is preloaded" + return + fi + + echo "Downloading crictl" + local -r crictl_path="https://storage.googleapis.com/kubernetes-release/crictl" + download-or-bust "${crictl_sha1}" "${crictl_path}/${crictl}" + mv "${KUBE_HOME}/${crictl}" "${KUBE_BIN}/crictl" + chmod a+x "${KUBE_BIN}/crictl" + + # Create crictl config file. + cat > /etc/crictl.yaml < /dev/null; then - echo "Docker daemon failed!" - pkill docker +function container_runtime_monitoring { + # Container runtime startup takes time. Make initial attempts before starting + # killing the container runtime. + local -r max_attempts=5 + local attempt=1 + local -r crictl="${KUBE_HOME}/bin/crictl" + local -r container_runtime="${CONTAINER_RUNTIME_NAME:-docker}" + until timeout 60 "${crictl}" pods > /dev/null; do + if (( attempt == max_attempts )); then + echo "Max attempt ${max_attempts} reached! Proceeding to monitor container runtime healthiness." + break + fi + echo "$attempt initial attempt \"${crictl} pods\"! Trying again in $attempt seconds..." + sleep "$(( attempt++ ))" + done + while true; do + if ! timeout 60 "${crictl}" pods > /dev/null; then + echo "Container runtime ${container_runtime} failed!" + systemctl kill --kill-who=main "${container_runtime}" # Wait for a while, as we don't want to kill it again before it is really up. sleep 120 else @@ -48,7 +62,7 @@ function kubelet_monitoring { # Print the response and/or errors. echo $output echo "Kubelet is unhealthy!" - pkill kubelet + systemctl kill kubelet # Wait for a while, as we don't want to kill it again before it is really up. 
sleep 60 else @@ -60,11 +74,12 @@ function kubelet_monitoring { ############## Main Function ################ if [[ "$#" -ne 1 ]]; then - echo "Usage: health-monitor.sh <docker/kubelet>" + echo "Usage: health-monitor.sh <container-runtime/kubelet>" exit 1 fi -KUBE_ENV="/home/kubernetes/kube-env" +KUBE_HOME="/home/kubernetes" +KUBE_ENV="${KUBE_HOME}/kube-env" if [[ ! -e "${KUBE_ENV}" ]]; then echo "The ${KUBE_ENV} file does not exist!! Terminate health monitoring" exit 1 @@ -74,8 +89,8 @@ SLEEP_SECONDS=10 component=$1 echo "Start kubernetes health monitoring for ${component}" source "${KUBE_ENV}" -if [[ "${component}" == "docker" ]]; then - docker_monitoring +if [[ "${component}" == "container-runtime" ]]; then + container_runtime_monitoring elif [[ "${component}" == "kubelet" ]]; then kubelet_monitoring else diff --git a/cluster/gce/gci/master.yaml b/cluster/gce/gci/master.yaml index 6d7d8e8e1a4..7a74113afab 100644 --- a/cluster/gce/gci/master.yaml +++ b/cluster/gce/gci/master.yaml @@ -40,12 +40,12 @@ write_files: [Install] WantedBy=kubernetes.target - - path: /etc/systemd/system/kube-docker-monitor.service + - path: /etc/systemd/system/kube-container-runtime-monitor.service permissions: 0644 owner: root content: | [Unit] - Description=Kubernetes health monitoring for docker + Description=Kubernetes health monitoring for container runtime After=kube-master-configuration.service [Service] @@ -54,7 +54,7 @@ write_files: RemainAfterExit=yes RemainAfterExit=yes ExecStartPre=/bin/chmod 544 /home/kubernetes/bin/health-monitor.sh - ExecStart=/home/kubernetes/bin/health-monitor.sh docker + ExecStart=/home/kubernetes/bin/health-monitor.sh container-runtime [Install] WantedBy=kubernetes.target @@ -120,7 +120,7 @@ runcmd: - systemctl daemon-reload - systemctl enable kube-master-installation.service - systemctl enable kube-master-configuration.service - - systemctl enable kube-docker-monitor.service + - systemctl enable kube-container-runtime-monitor.service - systemctl enable kubelet-monitor.service - systemctl enable
kube-logrotate.timer - systemctl enable kube-logrotate.service diff --git a/cluster/gce/gci/node.yaml b/cluster/gce/gci/node.yaml index e04018e3453..fe73c05bdbe 100644 --- a/cluster/gce/gci/node.yaml +++ b/cluster/gce/gci/node.yaml @@ -40,12 +40,12 @@ write_files: [Install] WantedBy=kubernetes.target - - path: /etc/systemd/system/kube-docker-monitor.service + - path: /etc/systemd/system/kube-container-runtime-monitor.service permissions: 0644 owner: root content: | [Unit] - Description=Kubernetes health monitoring for docker + Description=Kubernetes health monitoring for container runtime After=kube-node-configuration.service [Service] @@ -54,7 +54,7 @@ write_files: RemainAfterExit=yes RemainAfterExit=yes ExecStartPre=/bin/chmod 544 /home/kubernetes/bin/health-monitor.sh - ExecStart=/home/kubernetes/bin/health-monitor.sh docker + ExecStart=/home/kubernetes/bin/health-monitor.sh container-runtime [Install] WantedBy=kubernetes.target @@ -120,7 +120,7 @@ runcmd: - systemctl daemon-reload - systemctl enable kube-node-installation.service - systemctl enable kube-node-configuration.service - - systemctl enable kube-docker-monitor.service + - systemctl enable kube-container-runtime-monitor.service - systemctl enable kubelet-monitor.service - systemctl enable kube-logrotate.timer - systemctl enable kube-logrotate.service diff --git a/cluster/gce/manifests/e2e-image-puller.manifest b/cluster/gce/manifests/e2e-image-puller.manifest index aba8eef6de7..6d3ba6fb898 100644 --- a/cluster/gce/manifests/e2e-image-puller.manifest +++ b/cluster/gce/manifests/e2e-image-puller.manifest @@ -76,14 +76,16 @@ spec: gcr.io/kubernetes-e2e-test-images/volume-rbd:0.1 k8s.gcr.io/zookeeper-install-3.5.0-alpha:e2e gcr.io/google_samples/gb-redisslave:nonexistent - ; do echo $(date '+%X') pulling $i; docker pull $i 1>/dev/null; done; exit 0; + ; do echo $(date '+%X') pulling $i; crictl pull $i 1>/dev/null; done; exit 0; securityContext: privileged: true volumeMounts: - - mountPath: 
/var/run/docker.sock + - mountPath: {{ container_runtime_endpoint }} name: socket - - mountPath: /usr/bin/docker - name: docker + - mountPath: /usr/bin/crictl + name: crictl + - mountPath: /etc/crictl.yaml + name: config # Add a container that runs a health-check - name: nethealth-check resources: @@ -98,13 +100,17 @@ spec: - "/usr/bin/nethealth || true" volumes: - hostPath: - path: /var/run/docker.sock + path: {{ container_runtime_endpoint }} type: Socket name: socket - hostPath: - path: /usr/bin/docker + path: /home/kubernetes/bin/crictl type: File - name: docker + name: crictl + - hostPath: + path: /etc/crictl.yaml + type: File + name: config # This pod is really fire-and-forget. restartPolicy: OnFailure # This pod needs hostNetworking for true VM perf measurement as well as avoiding cbr0 issues diff --git a/cluster/gce/util.sh b/cluster/gce/util.sh index 60960e84720..c2f97be6ece 100755 --- a/cluster/gce/util.sh +++ b/cluster/gce/util.sh @@ -673,7 +673,6 @@ function construct-kubelet-flags { if [[ -n "${CONTAINER_RUNTIME:-}" ]]; then flags+=" --container-runtime=${CONTAINER_RUNTIME}" fi - # TODO(mtaufen): CONTAINER_RUNTIME_ENDPOINT seems unused; delete it? if [[ -n "${CONTAINER_RUNTIME_ENDPOINT:-}" ]]; then flags+=" --container-runtime-endpoint=${CONTAINER_RUNTIME_ENDPOINT}" fi From 884e08e33c007b7166c5c7e693a96c6a94a853b1 Mon Sep 17 00:00:00 2001 From: Lantao Liu Date: Wed, 2 May 2018 15:51:07 -0700 Subject: [PATCH 2/3] Collect logs for health monitor services. 
Signed-off-by: Lantao Liu --- cluster/log-dump/log-dump.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cluster/log-dump/log-dump.sh b/cluster/log-dump/log-dump.sh index f61c6de34dc..c1f1d9feb8b 100755 --- a/cluster/log-dump/log-dump.sh +++ b/cluster/log-dump/log-dump.sh @@ -50,7 +50,7 @@ readonly gce_logfiles="startupscript" readonly kern_logfile="kern" readonly initd_logfiles="docker" readonly supervisord_logfiles="kubelet supervisor/supervisord supervisor/kubelet-stdout supervisor/kubelet-stderr supervisor/docker-stdout supervisor/docker-stderr" -readonly systemd_services="kubelet ${LOG_DUMP_SYSTEMD_SERVICES:-docker}" +readonly systemd_services="kubelet kubelet-monitor kube-container-runtime-monitor ${LOG_DUMP_SYSTEMD_SERVICES:-docker}" # Limit the number of concurrent node connections so that we don't run out of # file descriptors for large clusters. From f952b093a7faa62323617a684e7f6a58d863f42f Mon Sep 17 00:00:00 2001 From: Lantao Liu Date: Thu, 3 May 2018 17:18:21 -0700 Subject: [PATCH 3/3] Still use `docker ps` for docker health monitoring. Signed-off-by: Lantao Liu --- cluster/gce/gci/health-monitor.sh | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/cluster/gce/gci/health-monitor.sh b/cluster/gce/gci/health-monitor.sh index f342fc1ec43..f5517cf6451 100644 --- a/cluster/gce/gci/health-monitor.sh +++ b/cluster/gce/gci/health-monitor.sh @@ -25,24 +25,32 @@ set -o pipefail # We simply kill the process when there is a failure. Another systemd service will # automatically restart the process. function container_runtime_monitoring { - # Container runtime startup takes time. Make initial attempts before starting - # killing the container runtime. 
local -r max_attempts=5 local attempt=1 local -r crictl="${KUBE_HOME}/bin/crictl" - local -r container_runtime="${CONTAINER_RUNTIME_NAME:-docker}" - until timeout 60 "${crictl}" pods > /dev/null; do + local -r container_runtime_name="${CONTAINER_RUNTIME_NAME:-docker}" + # We still need to use `docker ps` when container runtime is "docker". This is because + # dockershim is still part of kubelet today. When kubelet is down, crictl pods + # will also fail, and docker will be killed. This is undesirable especially when + # docker live restore is disabled. + local healthcheck_command="docker ps" + if [[ "${CONTAINER_RUNTIME:-docker}" != "docker" ]]; then + healthcheck_command="${crictl} pods" + fi + # Container runtime startup takes time. Make initial attempts before starting + # killing the container runtime. + until timeout 60 ${healthcheck_command} > /dev/null; do if (( attempt == max_attempts )); then echo "Max attempt ${max_attempts} reached! Proceeding to monitor container runtime healthiness." break fi - echo "$attempt initial attempt \"${crictl} pods\"! Trying again in $attempt seconds..." - sleep "$(( attempt++ ))" + echo "$attempt initial attempt \"${healthcheck_command}\"! Trying again in $attempt seconds..." + sleep "$(( 2 ** attempt++ ))" done while true; do - if ! timeout 60 "${crictl}" pods > /dev/null; then - echo "Container runtime ${container_runtime} failed!" - systemctl kill --kill-who=main "${container_runtime}" + if ! timeout 60 ${healthcheck_command} > /dev/null; then + echo "Container runtime ${container_runtime_name} failed!" + systemctl kill --kill-who=main "${container_runtime_name}" # Wait for a while, as we don't want to kill it again before it is really up. sleep 120 else