From da23396e22fcc4f55000cf82309334f17bfbebac Mon Sep 17 00:00:00 2001 From: Tomoe Sugihara Date: Mon, 21 May 2018 11:38:41 +0900 Subject: [PATCH] Dump Stack when docker fails on healthcheck Send SIGUSR1 to dockerd to save stack dump of docker daemon in order to be able to investigate why docker daemon was unresponsive to health check done by `docker ps`. See https://github.com/moby/moby/blob/master/daemon/daemon.go on how docker sets up a trap for SIGUSR1 with `setupDumpStackTrap()` --- cluster/gce/gci/health-monitor.sh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cluster/gce/gci/health-monitor.sh b/cluster/gce/gci/health-monitor.sh index cbf96dd3142..b2bca3d2a8b 100644 --- a/cluster/gce/gci/health-monitor.sh +++ b/cluster/gce/gci/health-monitor.sh @@ -50,6 +50,12 @@ function container_runtime_monitoring { while true; do if ! timeout 60 ${healthcheck_command} > /dev/null; then echo "Container runtime ${container_runtime_name} failed!" + if [[ "$container_runtime_name" == "docker" ]]; then + # Dump stack of docker daemon for investigation. + # Log file name looks like goroutine-stacks-TIMESTAMP and will be saved to + # the exec root directory, which is /var/run/docker/ on Ubuntu and COS. + pkill -SIGUSR1 dockerd + fi systemctl kill --kill-who=main "${container_runtime_name}" # Wait for a while, as we don't want to kill it again before it is really up. sleep 120