mirror of
https://github.com/k3s-io/kubernetes.git
synced 2025-07-22 11:21:47 +00:00
Merge pull request #63357 from Random-Liu/install-and-use-crictl
Automatic merge from submit-queue (batch tested with PRs 63167, 63357). If you want to cherry-pick this change to another branch, please follow the instructions <a href="https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md">here</a>. Install and use crictl in gce kube-up.sh Download and use crictl in gce kube-up.sh. This PR: 1. Downloads crictl `v1.0.0-beta.0` onto the node, which supports CRI v1alpha2. We'll upgrade it to `v1.0.0-beta.1` soon after the release is cut. 2. Change `kube-docker-monitor` to `kube-container-runtime-monitor`, and let it use `crictl` to do health monitoring. 3. Change `e2e-image-puller` to use `crictl`. Because of https://github.com/kubernetes/kubernetes/issues/63355, it doesn't work now. But in `crictl v1.0.0-beta.1`, we are going to statically link it, and the `e2e-image-puller` should work again. 4. Use `systemctl kill --kill-who=main` instead of `pkill`, the reason is that: a. `pkill docker` will send `SIGTERM` to all processes including `dockerd`, `docker-containerd`, `docker-containerd-shim`. This is not a problem for Docker 17.03 CE, because `containerd-shim` in containerd 0.2.x doesn't exit with SIGERM (see [code](https://github.com/containerd/containerd/blob/v0.2.x/containerd-shim/main.go#L123)). However, `containerd-shim` in containerd 1.0+ does exit with SIGTERM (see [code](https://github.com/containerd/containerd/blob/master/cmd/containerd-shim/main_unix.go#L200)). This means that `pkill docker` and `pkill containerd` will kill all shim processes for Docker 17.11+ and containerd 1.0+. b. We can use `pkill -x` instead. However, docker systemd service name is `docker`, but daemon process name is `dockerd`. We have to introduce another environment variable to specify "daemon process name". Given so, it seems easier to just use `systemctl kill` which only requires systemd service name. `systemctl kill --kill-who=main` will make sure only main process receives SIGTERM. Signed-off-by: Lantao Liu <lantaol@google.com> /cc @filbranden @yujuhong @feiskyer @mrunalp @kubernetes/sig-node-pr-reviews @kubernetes/sig-cluster-lifecycle-pr-reviews **Release note**: ```release-note Kubernetes cluster on GCE have crictl installed now. Users can use it to help debug their node. The documentation of crictl can be found https://github.com/kubernetes-incubator/cri-tools/blob/master/docs/crictl.md. ```
This commit is contained in:
commit
7b8bb6e7d3
@ -2114,10 +2114,16 @@ function start-fluentd-resource-update {
|
||||
wait-for-apiserver-and-update-fluentd &
|
||||
}
|
||||
|
||||
# Update {{ container-runtime }} with actual container runtime name.
|
||||
# Update {{ container-runtime }} with actual container runtime name,
|
||||
# and {{ container-runtime-endpoint }} with actual container runtime
|
||||
# endpoint.
|
||||
function update-container-runtime {
|
||||
local -r configmap_yaml="$1"
|
||||
sed -i -e "s@{{ *container_runtime *}}@${CONTAINER_RUNTIME_NAME:-docker}@g" "${configmap_yaml}"
|
||||
local -r file="$1"
|
||||
local -r container_runtime_endpoint="${CONTAINER_RUNTIME_ENDPOINT:-unix:///var/run/dockershim.sock}"
|
||||
sed -i \
|
||||
-e "s@{{ *container_runtime *}}@${CONTAINER_RUNTIME_NAME:-docker}@g" \
|
||||
-e "s@{{ *container_runtime_endpoint *}}@${container_runtime_endpoint#unix://}@g" \
|
||||
"${file}"
|
||||
}
|
||||
|
||||
# Remove configuration in yaml file if node journal is not enabled.
|
||||
@ -2399,8 +2405,9 @@ EOF
|
||||
# Starts an image-puller - used in test clusters.
|
||||
function start-image-puller {
|
||||
echo "Start image-puller"
|
||||
cp "${KUBE_HOME}/kube-manifests/kubernetes/gci-trusty/e2e-image-puller.manifest" \
|
||||
/etc/kubernetes/manifests/
|
||||
local -r e2e_image_puller_manifest="${KUBE_HOME}/kube-manifests/kubernetes/gci-trusty/e2e-image-puller.manifest"
|
||||
update-container-runtime "${e2e_image_puller_manifest}"
|
||||
cp "${e2e_image_puller_manifest}" /etc/kubernetes/manifests/
|
||||
}
|
||||
|
||||
# Setups manifests for ingress controller and gce-specific policies for service controller.
|
||||
|
@ -28,6 +28,8 @@ DEFAULT_CNI_VERSION="v0.6.0"
|
||||
DEFAULT_CNI_SHA1="d595d3ded6499a64e8dac02466e2f5f2ce257c9f"
|
||||
DEFAULT_NPD_VERSION="v0.4.1"
|
||||
DEFAULT_NPD_SHA1="a57a3fe64cab8a18ec654f5cef0aec59dae62568"
|
||||
DEFAULT_CRICTL_VERSION="v1.0.0-beta.1"
|
||||
DEFAULT_CRICTL_SHA1="6816982ea1b83506945ce02949199171fee17b0b"
|
||||
DEFAULT_MOUNTER_TAR_SHA="8003b798cf33c7f91320cd6ee5cec4fa22244571"
|
||||
###
|
||||
|
||||
@ -234,6 +236,34 @@ function install-cni-binaries {
|
||||
rm -f "${KUBE_HOME}/${cni_tar}"
|
||||
}
|
||||
|
||||
# Install crictl binary.
|
||||
function install-crictl {
|
||||
if [[ -n "${CRICTL_VERSION:-}" ]]; then
|
||||
local -r crictl_version="${CRICTL_VERSION}"
|
||||
local -r crictl_sha1="${CRICTL_TAR_HASH}"
|
||||
else
|
||||
local -r crictl_version="${DEFAULT_CRICTL_VERSION}"
|
||||
local -r crictl_sha1="${DEFAULT_CRICTL_SHA1}"
|
||||
fi
|
||||
local -r crictl="crictl-${crictl_version}-linux-amd64"
|
||||
|
||||
if is-preloaded "${crictl}" "${crictl_sha1}"; then
|
||||
echo "crictl is preloaded"
|
||||
return
|
||||
fi
|
||||
|
||||
echo "Downloading crictl"
|
||||
local -r crictl_path="https://storage.googleapis.com/kubernetes-release/crictl"
|
||||
download-or-bust "${crictl_sha1}" "${crictl_path}/${crictl}"
|
||||
mv "${KUBE_HOME}/${crictl}" "${KUBE_BIN}/crictl"
|
||||
chmod a+x "${KUBE_BIN}/crictl"
|
||||
|
||||
# Create crictl config file.
|
||||
cat > /etc/crictl.yaml <<EOF
|
||||
runtime-endpoint: ${CONTAINER_RUNTIME_ENDPOINT:-unix:///var/run/dockershim.sock}
|
||||
EOF
|
||||
}
|
||||
|
||||
function install-kube-manifests {
|
||||
# Put kube-system pods manifests in ${KUBE_HOME}/kube-manifests/.
|
||||
local dst_dir="${KUBE_HOME}/kube-manifests"
|
||||
@ -370,6 +400,9 @@ function install-kube-binary-config {
|
||||
remount-flexvolume-directory "${VOLUME_PLUGIN_DIR}"
|
||||
fi
|
||||
|
||||
# Install crictl on each node.
|
||||
install-crictl
|
||||
|
||||
# Clean up.
|
||||
rm -rf "${KUBE_HOME}/kubernetes"
|
||||
rm -f "${KUBE_HOME}/${server_binary_tar}"
|
||||
|
@ -24,11 +24,33 @@ set -o pipefail
|
||||
|
||||
# We simply kill the process when there is a failure. Another systemd service will
|
||||
# automatically restart the process.
|
||||
function docker_monitoring {
|
||||
while [ 1 ]; do
|
||||
if ! timeout 60 docker ps > /dev/null; then
|
||||
echo "Docker daemon failed!"
|
||||
pkill docker
|
||||
function container_runtime_monitoring {
|
||||
local -r max_attempts=5
|
||||
local attempt=1
|
||||
local -r crictl="${KUBE_HOME}/bin/crictl"
|
||||
local -r container_runtime_name="${CONTAINER_RUNTIME_NAME:-docker}"
|
||||
# We still need to use `docker ps` when container runtime is "docker". This is because
|
||||
# dockershim is still part of kubelet today. When kubelet is down, crictl pods
|
||||
# will also fail, and docker will be killed. This is undesirable especially when
|
||||
# docker live restore is disabled.
|
||||
local healthcheck_command="docker ps"
|
||||
if [[ "${CONTAINER_RUNTIME:-docker}" != "docker" ]]; then
|
||||
healthcheck_command="${crictl} pods"
|
||||
fi
|
||||
# Container runtime startup takes time. Make initial attempts before starting
|
||||
# killing the container runtime.
|
||||
until timeout 60 ${healthcheck_command} > /dev/null; do
|
||||
if (( attempt == max_attempts )); then
|
||||
echo "Max attempt ${max_attempts} reached! Proceeding to monitor container runtime healthiness."
|
||||
break
|
||||
fi
|
||||
echo "$attempt initial attempt \"${healthcheck_command}\"! Trying again in $attempt seconds..."
|
||||
sleep "$(( 2 ** attempt++ ))"
|
||||
done
|
||||
while true; do
|
||||
if ! timeout 60 ${healthcheck_command} > /dev/null; then
|
||||
echo "Container runtime ${container_runtime_name} failed!"
|
||||
systemctl kill --kill-who=main "${container_runtime_name}"
|
||||
# Wait for a while, as we don't want to kill it again before it is really up.
|
||||
sleep 120
|
||||
else
|
||||
@ -48,7 +70,7 @@ function kubelet_monitoring {
|
||||
# Print the response and/or errors.
|
||||
echo $output
|
||||
echo "Kubelet is unhealthy!"
|
||||
pkill kubelet
|
||||
systemctl kill kubelet
|
||||
# Wait for a while, as we don't want to kill it again before it is really up.
|
||||
sleep 60
|
||||
else
|
||||
@ -60,11 +82,12 @@ function kubelet_monitoring {
|
||||
|
||||
############## Main Function ################
|
||||
if [[ "$#" -ne 1 ]]; then
|
||||
echo "Usage: health-monitor.sh <docker/kubelet>"
|
||||
echo "Usage: health-monitor.sh <container-runtime/kubelet>"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
KUBE_ENV="/home/kubernetes/kube-env"
|
||||
KUBE_HOME="/home/kubernetes"
|
||||
KUBE_ENV="${KUBE_HOME}/kube-env"
|
||||
if [[ ! -e "${KUBE_ENV}" ]]; then
|
||||
echo "The ${KUBE_ENV} file does not exist!! Terminate health monitoring"
|
||||
exit 1
|
||||
@ -74,8 +97,8 @@ SLEEP_SECONDS=10
|
||||
component=$1
|
||||
echo "Start kubernetes health monitoring for ${component}"
|
||||
source "${KUBE_ENV}"
|
||||
if [[ "${component}" == "docker" ]]; then
|
||||
docker_monitoring
|
||||
if [[ "${component}" == "container-runtime" ]]; then
|
||||
container_runtime_monitoring
|
||||
elif [[ "${component}" == "kubelet" ]]; then
|
||||
kubelet_monitoring
|
||||
else
|
||||
|
@ -40,12 +40,12 @@ write_files:
|
||||
[Install]
|
||||
WantedBy=kubernetes.target
|
||||
|
||||
- path: /etc/systemd/system/kube-docker-monitor.service
|
||||
- path: /etc/systemd/system/kube-container-runtime-monitor.service
|
||||
permissions: 0644
|
||||
owner: root
|
||||
content: |
|
||||
[Unit]
|
||||
Description=Kubernetes health monitoring for docker
|
||||
Description=Kubernetes health monitoring for container runtime
|
||||
After=kube-master-configuration.service
|
||||
|
||||
[Service]
|
||||
@ -54,7 +54,7 @@ write_files:
|
||||
RemainAfterExit=yes
|
||||
RemainAfterExit=yes
|
||||
ExecStartPre=/bin/chmod 544 /home/kubernetes/bin/health-monitor.sh
|
||||
ExecStart=/home/kubernetes/bin/health-monitor.sh docker
|
||||
ExecStart=/home/kubernetes/bin/health-monitor.sh container-runtime
|
||||
|
||||
[Install]
|
||||
WantedBy=kubernetes.target
|
||||
@ -120,7 +120,7 @@ runcmd:
|
||||
- systemctl daemon-reload
|
||||
- systemctl enable kube-master-installation.service
|
||||
- systemctl enable kube-master-configuration.service
|
||||
- systemctl enable kube-docker-monitor.service
|
||||
- systemctl enable kube-container-runtime-monitor.service
|
||||
- systemctl enable kubelet-monitor.service
|
||||
- systemctl enable kube-logrotate.timer
|
||||
- systemctl enable kube-logrotate.service
|
||||
|
@ -40,12 +40,12 @@ write_files:
|
||||
[Install]
|
||||
WantedBy=kubernetes.target
|
||||
|
||||
- path: /etc/systemd/system/kube-docker-monitor.service
|
||||
- path: /etc/systemd/system/kube-container-runtime-monitor.service
|
||||
permissions: 0644
|
||||
owner: root
|
||||
content: |
|
||||
[Unit]
|
||||
Description=Kubernetes health monitoring for docker
|
||||
Description=Kubernetes health monitoring for container runtime
|
||||
After=kube-node-configuration.service
|
||||
|
||||
[Service]
|
||||
@ -54,7 +54,7 @@ write_files:
|
||||
RemainAfterExit=yes
|
||||
RemainAfterExit=yes
|
||||
ExecStartPre=/bin/chmod 544 /home/kubernetes/bin/health-monitor.sh
|
||||
ExecStart=/home/kubernetes/bin/health-monitor.sh docker
|
||||
ExecStart=/home/kubernetes/bin/health-monitor.sh container-runtime
|
||||
|
||||
[Install]
|
||||
WantedBy=kubernetes.target
|
||||
@ -120,7 +120,7 @@ runcmd:
|
||||
- systemctl daemon-reload
|
||||
- systemctl enable kube-node-installation.service
|
||||
- systemctl enable kube-node-configuration.service
|
||||
- systemctl enable kube-docker-monitor.service
|
||||
- systemctl enable kube-container-runtime-monitor.service
|
||||
- systemctl enable kubelet-monitor.service
|
||||
- systemctl enable kube-logrotate.timer
|
||||
- systemctl enable kube-logrotate.service
|
||||
|
@ -76,14 +76,16 @@ spec:
|
||||
gcr.io/kubernetes-e2e-test-images/volume-rbd:0.1
|
||||
k8s.gcr.io/zookeeper-install-3.5.0-alpha:e2e
|
||||
gcr.io/google_samples/gb-redisslave:nonexistent
|
||||
; do echo $(date '+%X') pulling $i; docker pull $i 1>/dev/null; done; exit 0;
|
||||
; do echo $(date '+%X') pulling $i; crictl pull $i 1>/dev/null; done; exit 0;
|
||||
securityContext:
|
||||
privileged: true
|
||||
volumeMounts:
|
||||
- mountPath: /var/run/docker.sock
|
||||
- mountPath: {{ container_runtime_endpoint }}
|
||||
name: socket
|
||||
- mountPath: /usr/bin/docker
|
||||
name: docker
|
||||
- mountPath: /usr/bin/crictl
|
||||
name: crictl
|
||||
- mountPath: /etc/crictl.yaml
|
||||
name: config
|
||||
# Add a container that runs a health-check
|
||||
- name: nethealth-check
|
||||
resources:
|
||||
@ -98,13 +100,17 @@ spec:
|
||||
- "/usr/bin/nethealth || true"
|
||||
volumes:
|
||||
- hostPath:
|
||||
path: /var/run/docker.sock
|
||||
path: {{ container_runtime_endpoint }}
|
||||
type: Socket
|
||||
name: socket
|
||||
- hostPath:
|
||||
path: /usr/bin/docker
|
||||
path: /home/kubernetes/bin/crictl
|
||||
type: File
|
||||
name: docker
|
||||
name: crictl
|
||||
- hostPath:
|
||||
path: /etc/crictl.yaml
|
||||
type: File
|
||||
name: config
|
||||
# This pod is really fire-and-forget.
|
||||
restartPolicy: OnFailure
|
||||
# This pod needs hostNetworking for true VM perf measurement as well as avoiding cbr0 issues
|
||||
|
@ -673,7 +673,6 @@ function construct-kubelet-flags {
|
||||
if [[ -n "${CONTAINER_RUNTIME:-}" ]]; then
|
||||
flags+=" --container-runtime=${CONTAINER_RUNTIME}"
|
||||
fi
|
||||
# TODO(mtaufen): CONTAINER_RUNTIME_ENDPOINT seems unused; delete it?
|
||||
if [[ -n "${CONTAINER_RUNTIME_ENDPOINT:-}" ]]; then
|
||||
flags+=" --container-runtime-endpoint=${CONTAINER_RUNTIME_ENDPOINT}"
|
||||
fi
|
||||
|
@ -50,7 +50,7 @@ readonly gce_logfiles="startupscript"
|
||||
readonly kern_logfile="kern"
|
||||
readonly initd_logfiles="docker"
|
||||
readonly supervisord_logfiles="kubelet supervisor/supervisord supervisor/kubelet-stdout supervisor/kubelet-stderr supervisor/docker-stdout supervisor/docker-stderr"
|
||||
readonly systemd_services="kubelet ${LOG_DUMP_SYSTEMD_SERVICES:-docker}"
|
||||
readonly systemd_services="kubelet kubelet-monitor kube-container-runtime-monitor ${LOG_DUMP_SYSTEMD_SERVICES:-docker}"
|
||||
|
||||
# Limit the number of concurrent node connections so that we don't run out of
|
||||
# file descriptors for large clusters.
|
||||
|
Loading…
Reference in New Issue
Block a user