diff --git a/cluster/log-dump/log-dump.sh b/cluster/log-dump/log-dump.sh
index 719ee31aba0..4dd312b96fd 100755
--- a/cluster/log-dump/log-dump.sh
+++ b/cluster/log-dump/log-dump.sh
@@ -281,6 +281,24 @@ function dump_nodes() {
   fi
 }
 
+# Collect names of nodes which didn't run logexporter successfully.
+# Note: This step is O(#nodes^2) as we check if each node is present in the list of succeeded nodes.
+# Making it linear would add code complexity without much benefit (as it just takes ~1s for 5k nodes).
+# Assumes:
+#   NODE_NAMES
+# Sets:
+#   NON_LOGEXPORTED_NODES
+function find_non_logexported_nodes() {
+  succeeded_nodes=$(gsutil ls ${gcs_artifacts_dir}/logexported-nodes-registry) || return 1
+  echo "Successfully listed marker files for successful nodes"
+  NON_LOGEXPORTED_NODES=()
+  for node in "${NODE_NAMES[@]}"; do
+    if [[ ! "${succeeded_nodes}" =~ "${node}" ]]; then
+      NON_LOGEXPORTED_NODES+=("${node}")
+    fi
+  done
+}
+
 function dump_nodes_with_logexporter() {
   echo "Detecting nodes in the cluster"
   detect-node-names &> /dev/null
@@ -312,14 +330,27 @@ function dump_nodes_with_logexporter() {
     return
   fi
 
-  # Give some time for the pods to finish uploading logs.
-  sleep "${logexport_sleep_seconds}"
+  # Periodically fetch list of already logexported nodes to verify
+  # if we aren't already done.
+  start="$(date +%s)"
+  while true; do
+    now="$(date +%s)"
+    if [[ $((now - start)) -gt ${logexport_sleep_seconds} ]]; then
+      echo "Waiting for all nodes to be logexported timed out."
+      break
+    fi
+    if find_non_logexported_nodes; then
+      if [[ -z "${NON_LOGEXPORTED_NODES:-}" ]]; then
+        break
+      fi
+    fi
+    sleep 15
+  done
 
   # List registry of marker files (of nodes whose logexporter succeeded) from GCS.
   local nodes_succeeded
   for retry in {1..10}; do
-    if nodes_succeeded=$(gsutil ls ${gcs_artifacts_dir}/logexported-nodes-registry); then
-      echo "Successfully listed marker files for successful nodes"
+    if find_non_logexported_nodes; then
       break
     else
       echo "Attempt ${retry} failed to list marker files for succeessful nodes"
@@ -333,15 +364,10 @@ function dump_nodes_with_logexporter() {
     fi
   done
 
-  # Collect names of nodes which didn't run logexporter successfully.
-  # Note: This step is O(#nodes^2) as we check if each node is present in the list of succeeded nodes.
-  # Making it linear would add code complexity without much benefit (as it just takes ~1s for 5k nodes).
   failed_nodes=()
-  for node in "${NODE_NAMES[@]}"; do
-    if [[ ! "${nodes_succeeded}" =~ "${node}" ]]; then
-      echo "Logexporter didn't succeed on node ${node}. Queuing it for logdump through SSH."
-      failed_nodes+=("${node}")
-    fi
+  for node in "${NON_LOGEXPORTED_NODES[@]:-}"; do
+    echo "Logexporter didn't succeed on node ${node}. Queuing it for logdump through SSH."
+    failed_nodes+=("${node}")
   done
 
   # Delete the logexporter resources and dump logs for the failed nodes (if any) through SSH.
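
For reviewers who want to exercise the new flow in isolation, below is a minimal, standalone sketch of the poll-until-done-or-timeout pattern the diff introduces. It is not code from log-dump.sh: check_remaining_nodes, REMAINING_NODES, timeout_seconds, and poll_interval are hypothetical stand-ins for find_non_logexported_nodes, NON_LOGEXPORTED_NODES, logexport_sleep_seconds, and the hard-coded 15-second sleep.

#!/usr/bin/env bash
# Standalone sketch of the "poll until done or timeout" pattern added in the diff.
# check_remaining_nodes is a hypothetical stand-in for find_non_logexported_nodes:
# it sets REMAINING_NODES to the nodes still missing, or returns non-zero on error.

timeout_seconds=300   # plays the role of logexport_sleep_seconds
poll_interval=15      # same interval the diff uses between registry checks

check_remaining_nodes() {
  # Hypothetical stub: pretend nodes finish over time (randomized here for demo).
  REMAINING_NODES=()
  if (( RANDOM % 4 != 0 )); then
    REMAINING_NODES=("node-1")
  fi
}

start="$(date +%s)"
while true; do
  now="$(date +%s)"
  if (( now - start > timeout_seconds )); then
    echo "Timed out waiting for all nodes."
    break
  fi
  # Stop early as soon as the check succeeds and nothing is left to wait for.
  if check_remaining_nodes && [[ ${#REMAINING_NODES[@]} -eq 0 ]]; then
    echo "All nodes done after $((now - start))s."
    break
  fi
  sleep "${poll_interval}"
done

The point of the pattern, as in the diff itself: the early break keeps the common case fast (a few polls), while the timeout bounds the wait at the same logexport_sleep_seconds the old unconditional sleep used, so the worst case is no slower than before.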