From e28251ec383b44a29cd9b32b0d6ee301c9753aed Mon Sep 17 00:00:00 2001 From: Sergey Kanzhelev Date: Wed, 7 Aug 2024 21:23:38 +0000 Subject: [PATCH] remove health-monitor service as it does more harm than good --- build/lib/release.sh | 1 - cluster/gce/gci/configure.sh | 2 - cluster/gce/gci/health-monitor.sh | 99 ------------------------------- cluster/gce/gci/master.yaml | 38 ------------ cluster/gce/gci/node.yaml | 40 ------------- 5 files changed, 180 deletions(-) delete mode 100644 cluster/gce/gci/health-monitor.sh diff --git a/build/lib/release.sh b/build/lib/release.sh index 34b226c27ee..1bf19257d53 100644 --- a/build/lib/release.sh +++ b/build/lib/release.sh @@ -410,7 +410,6 @@ function kube::release::package_kube_manifests_tarball() { if [[ -e "${KUBE_ROOT}/cluster/gce/gci/gke-internal-configure-helper.sh" ]]; then cp "${KUBE_ROOT}/cluster/gce/gci/gke-internal-configure-helper.sh" "${dst_dir}/" fi - cp "${KUBE_ROOT}/cluster/gce/gci/health-monitor.sh" "${dst_dir}/health-monitor.sh" # Merge GCE-specific addons with general purpose addons. for d in cluster/addons cluster/gce/addons; do find "${KUBE_ROOT}/${d}" \( \( -name \*.yaml -o -name \*.yaml.in -o -name \*.json \) -a ! \( -name \*demo\* \) \) -print0 | "${TAR}" c --transform "s|${KUBE_ROOT#/*}/${d}||" --null -T - | "${TAR}" x -C "${dst_dir}" diff --git a/cluster/gce/gci/configure.sh b/cluster/gce/gci/configure.sh index c8572ca6ce2..09ff3e6fd66 100644 --- a/cluster/gce/gci/configure.sh +++ b/cluster/gce/gci/configure.sh @@ -418,8 +418,6 @@ function install-kube-manifests { cp "${dst_dir}/kubernetes/gci-trusty/gke-internal-configure-helper.sh" "${KUBE_BIN}/" fi - cp "${dst_dir}/kubernetes/gci-trusty/health-monitor.sh" "${KUBE_BIN}/health-monitor.sh" - rm -f "${KUBE_HOME}/${manifests_tar}" rm -f "${KUBE_HOME}/${manifests_tar}.sha512" } diff --git a/cluster/gce/gci/health-monitor.sh b/cluster/gce/gci/health-monitor.sh deleted file mode 100644 index 4f4e6752944..00000000000 --- a/cluster/gce/gci/health-monitor.sh +++ /dev/null @@ -1,99 +0,0 @@ -#!/usr/bin/env bash - -# Copyright 2016 The Kubernetes Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# This script is for master and node instance health monitoring, which is -# packed in kube-manifest tarball. It is executed through a systemd service -# in cluster/gce/gci/.yaml. The env variables come from an env -# file provided by the systemd service. - -set -o nounset -set -o pipefail - -# We simply kill the process when there is a failure. Another systemd service will -# automatically restart the process. -function container_runtime_monitoring { - local -r max_attempts=5 - local attempt=1 - local -r crictl="${KUBE_HOME}/bin/crictl" - local -r container_runtime_name="${CONTAINER_RUNTIME_NAME:-containerd}" - local -r healthcheck_command=("${crictl}" pods) - # Container runtime startup takes time. Make initial attempts before starting - # killing the container runtime. - until timeout 60 "${healthcheck_command[@]}" > /dev/null; do - if (( attempt == max_attempts )); then - echo "Max attempt ${max_attempts} reached! Proceeding to monitor container runtime healthiness." - break - fi - echo "$attempt initial attempt \"${healthcheck_command[*]}\"! Trying again in $attempt seconds..." - sleep "$(( 2 ** attempt++ ))" - done - while true; do - if ! timeout 60 "${healthcheck_command[@]}" > /dev/null; then - echo "Container runtime ${container_runtime_name} failed!" - systemctl kill --kill-who=main "${container_runtime_name}" - # Wait for a while, as we don't want to kill it again before it is really up. - sleep 120 - else - sleep "${SLEEP_SECONDS}" - fi - done -} - -function kubelet_monitoring { - echo "Wait for 2 minutes for kubelet to be functional" - # TODO(andyzheng0831): replace it with a more reliable method if possible. - sleep 120 - local -r max_seconds=10 - local output="" - while true; do - if ! output=$(curl -m "${max_seconds}" -f -s -S http://127.0.0.1:10248/healthz 2>&1); then - # Print the response and/or errors. - echo "${output}" - echo "Kubelet is unhealthy!" - systemctl kill kubelet - # Wait for a while, as we don't want to kill it again before it is really up. - sleep 60 - else - sleep "${SLEEP_SECONDS}" - fi - done -} - - -############## Main Function ################ -if [[ "$#" -ne 1 ]]; then - echo "Usage: health-monitor.sh " - exit 1 -fi - -KUBE_HOME="/home/kubernetes" -KUBE_ENV="${KUBE_HOME}/kube-env" -if [[ ! -e "${KUBE_ENV}" ]]; then - echo "The ${KUBE_ENV} file does not exist!! Terminate health monitoring" - exit 1 -fi - -SLEEP_SECONDS=10 -component=$1 -echo "Start kubernetes health monitoring for ${component}" -source "${KUBE_ENV}" -if [[ "${component}" == "container-runtime" ]]; then - container_runtime_monitoring -elif [[ "${component}" == "kubelet" ]]; then - kubelet_monitoring -else - echo "Health monitoring for component \"${component}\" is not supported!" -fi diff --git a/cluster/gce/gci/master.yaml b/cluster/gce/gci/master.yaml index 153296d5031..75cf541c98f 100644 --- a/cluster/gce/gci/master.yaml +++ b/cluster/gce/gci/master.yaml @@ -90,42 +90,6 @@ write_files: [Install] WantedBy=kubernetes.target - - path: /etc/systemd/system/kube-container-runtime-monitor.service - permissions: 0644 - owner: root - content: | - [Unit] - Description=Kubernetes health monitoring for container runtime - After=kube-master-configuration.service - - [Service] - Restart=always - RestartSec=10 - RemainAfterExit=yes - ExecStartPre=/bin/chmod 544 /home/kubernetes/bin/health-monitor.sh - ExecStart=/home/kubernetes/bin/health-monitor.sh container-runtime - - [Install] - WantedBy=kubernetes.target - - - path: /etc/systemd/system/kubelet-monitor.service - permissions: 0644 - owner: root - content: | - [Unit] - Description=Kubernetes health monitoring for kubelet - After=kube-master-configuration.service - - [Service] - Restart=always - RestartSec=10 - RemainAfterExit=yes - ExecStartPre=/bin/chmod 544 /home/kubernetes/bin/health-monitor.sh - ExecStart=/home/kubernetes/bin/health-monitor.sh kubelet - - [Install] - WantedBy=kubernetes.target - - path: /etc/systemd/system/kube-logrotate.timer permissions: 0644 owner: root @@ -170,8 +134,6 @@ runcmd: - systemctl enable kube-master-installation.service - systemctl enable kube-master-internal-route.service - systemctl enable kube-master-configuration.service - - systemctl enable kube-container-runtime-monitor.service - - systemctl enable kubelet-monitor.service - systemctl enable kube-logrotate.timer - systemctl enable kube-logrotate.service - systemctl enable kubernetes.target diff --git a/cluster/gce/gci/node.yaml b/cluster/gce/gci/node.yaml index 8b173dabbcf..3d03df211b7 100644 --- a/cluster/gce/gci/node.yaml +++ b/cluster/gce/gci/node.yaml @@ -41,44 +41,6 @@ write_files: [Install] WantedBy=kubernetes.target - - path: /etc/systemd/system/kube-container-runtime-monitor.service - permissions: 0644 - owner: root - content: | - [Unit] - Description=Kubernetes health monitoring for container runtime - After=kube-node-configuration.service - - [Service] - Restart=always - RestartSec=10 - RemainAfterExit=yes - RemainAfterExit=yes - ExecStartPre=/bin/chmod 544 /home/kubernetes/bin/health-monitor.sh - ExecStart=/home/kubernetes/bin/health-monitor.sh container-runtime - - [Install] - WantedBy=kubernetes.target - - - path: /etc/systemd/system/kubelet-monitor.service - permissions: 0644 - owner: root - content: | - [Unit] - Description=Kubernetes health monitoring for kubelet - After=kube-node-configuration.service - - [Service] - Restart=always - RestartSec=10 - RemainAfterExit=yes - RemainAfterExit=yes - ExecStartPre=/bin/chmod 544 /home/kubernetes/bin/health-monitor.sh - ExecStart=/home/kubernetes/bin/health-monitor.sh kubelet - - [Install] - WantedBy=kubernetes.target - - path: /etc/systemd/system/kube-logrotate.timer permissions: 0644 owner: root @@ -128,8 +90,6 @@ runcmd: - systemctl daemon-reload - systemctl enable kube-node-installation.service - systemctl enable kube-node-configuration.service - - systemctl enable kube-container-runtime-monitor.service - - systemctl enable kubelet-monitor.service - systemctl enable kube-logrotate.timer - systemctl enable kube-logrotate.service - systemctl enable kubernetes.target