Periodically fetch logexported nodes instead of sleeping

This commit is contained in:
wojtekt 2018-06-15 15:55:03 +02:00
parent 0f9dfca8e7
commit 43d217f904

View File

@ -281,6 +281,24 @@ function dump_nodes() {
fi
}
# Collect names of nodes which didn't run logexporter successfully.
# Note: This step is O(#nodes^2) as we check if each node is present in the list of succeeded nodes.
# Making it linear would add code complexity without much benefit (as it just takes ~1s for 5k nodes).
# Assumes:
#   NODE_NAMES
#   gcs_artifacts_dir
# Sets:
#   NON_LOGEXPORTED_NODES
# Returns:
#   non-zero if listing the GCS marker-file registry fails.
function find_non_logexported_nodes() {
  # Scratch variable; keep it out of the global namespace. Declared separately
  # from the assignment so the command's exit status isn't masked by 'local'.
  local succeeded_nodes
  succeeded_nodes=$(gsutil ls "${gcs_artifacts_dir}/logexported-nodes-registry") || return 1
  echo "Successfully listed marker files for successful nodes"
  NON_LOGEXPORTED_NODES=()
  for node in "${NODE_NAMES[@]}"; do
    # Quoted RHS of =~ means a literal substring match against the listing.
    # NOTE(review): a node name that is a prefix of another (e.g. node-1 vs
    # node-10) can false-positive here; accepted trade-off per the note above.
    if [[ ! "${succeeded_nodes}" =~ "${node}" ]]; then
      NON_LOGEXPORTED_NODES+=("${node}")
    fi
  done
}
function dump_nodes_with_logexporter() {
echo "Detecting nodes in the cluster"
detect-node-names &> /dev/null
@ -312,14 +330,27 @@ function dump_nodes_with_logexporter() {
return
fi
# Give some time for the pods to finish uploading logs. # Periodically fetch list of already logexported nodes to verify
sleep "${logexport_sleep_seconds}" # if we aren't already done.
start="$(date +%s)"
while true; do
now="$(date +%s)"
if [[ $((now - start)) -gt ${logexport_sleep_seconds} ]]; then
echo "Waiting for all nodes to be logexported timed out."
break
fi
if find_non_logexported_nodes; then
if [[ -z "${NON_LOGEXPORTED_NODES:-}" ]]; then
break
fi
fi
sleep 15
done
# List registry of marker files (of nodes whose logexporter succeeded) from GCS.
local nodes_succeeded
for retry in {1..10}; do
if nodes_succeeded=$(gsutil ls ${gcs_artifacts_dir}/logexported-nodes-registry); then if find_non_logexported_nodes; then
echo "Successfully listed marker files for successful nodes"
break
else
echo "Attempt ${retry} failed to list marker files for succeessful nodes" echo "Attempt ${retry} failed to list marker files for succeessful nodes"
@ -333,15 +364,10 @@ function dump_nodes_with_logexporter() {
fi
done
# Collect names of nodes which didn't run logexporter successfully.
# Note: This step is O(#nodes^2) as we check if each node is present in the list of succeeded nodes.
# Making it linear would add code complexity without much benefit (as it just takes ~1s for 5k nodes).
failed_nodes=()
for node in "${NODE_NAMES[@]}"; do for node in "${NON_LOGEXPORTED_NODES[@]:-}"; do
if [[ ! "${nodes_succeeded}" =~ "${node}" ]]; then echo "Logexporter didn't succeed on node ${node}. Queuing it for logdump through SSH."
echo "Logexporter didn't succeed on node ${node}. Queuing it for logdump through SSH." failed_nodes+=("${node}")
failed_nodes+=("${node}")
fi
done
# Delete the logexporter resources and dump logs for the failed nodes (if any) through SSH.