mirror of
				https://github.com/k3s-io/kubernetes.git
				synced 2025-10-31 05:40:42 +00:00 
			
		
		
		
	Periodically fetch logexported nodes instead of sleeping
This commit is contained in:
		| @@ -281,6 +281,24 @@ function dump_nodes() { | |||||||
|   fi |   fi | ||||||
| } | } | ||||||
|  |  | ||||||
|  | # Collect names of nodes which didn't run logexporter successfully. | ||||||
|  | # Note: This step is O(#nodes^2) as we check if each node is present in the list of succeeded nodes. | ||||||
|  | # Making it linear would add code complexity without much benefit (as it just takes ~1s for 5k nodes). | ||||||
|  | # Assumes: | ||||||
|  | #   NODE_NAMES | ||||||
|  | # Sets: | ||||||
|  | #   NON_LOGEXPORTED_NODES | ||||||
|  | function find_non_logexported_nodes() { | ||||||
|  |   succeeded_nodes=$(gsutil ls ${gcs_artifacts_dir}/logexported-nodes-registry) || return 1 | ||||||
|  |   echo "Successfully listed marker files for successful nodes" | ||||||
|  |   NON_LOGEXPORTED_NODES=() | ||||||
|  |   for node in "${NODE_NAMES[@]}"; do | ||||||
|  |     if [[ ! "${succeeded_nodes}" =~ "${node}" ]]; then | ||||||
|  |       NON_LOGEXPORTED_NODES+=("${node}") | ||||||
|  |     fi | ||||||
|  |   done | ||||||
|  | } | ||||||
|  |  | ||||||
| function dump_nodes_with_logexporter() { | function dump_nodes_with_logexporter() { | ||||||
|   echo "Detecting nodes in the cluster" |   echo "Detecting nodes in the cluster" | ||||||
|   detect-node-names &> /dev/null |   detect-node-names &> /dev/null | ||||||
| @@ -312,14 +330,27 @@ function dump_nodes_with_logexporter() { | |||||||
|     return |     return | ||||||
|   fi |   fi | ||||||
|  |  | ||||||
|   # Give some time for the pods to finish uploading logs. |   # Periodically fetch list of already logexported nodes to verify | ||||||
|   sleep "${logexport_sleep_seconds}" |   # if we aren't already done. | ||||||
|  |   start="$(date +%s)" | ||||||
|  |   while true; do | ||||||
|  |     now="$(date +%s)" | ||||||
|  |     if [[ $((now - start)) -gt ${logexport_sleep_seconds} ]]; then | ||||||
|  |       echo "Waiting for all nodes to be logexported timed out." | ||||||
|  |       break | ||||||
|  |     fi | ||||||
|  |     if find_non_logexported_nodes; then | ||||||
|  |       if [[ -z "${NON_LOGEXPORTED_NODES:-}" ]]; then | ||||||
|  |         break | ||||||
|  |       fi | ||||||
|  |     fi | ||||||
|  |     sleep 15 | ||||||
|  |   done | ||||||
|  |  | ||||||
|   # List registry of marker files (of nodes whose logexporter succeeded) from GCS. |   # List registry of marker files (of nodes whose logexporter succeeded) from GCS. | ||||||
|   local nodes_succeeded |   local nodes_succeeded | ||||||
|   for retry in {1..10}; do |   for retry in {1..10}; do | ||||||
|     if nodes_succeeded=$(gsutil ls ${gcs_artifacts_dir}/logexported-nodes-registry); then |     if find_non_logexported_nodes; then | ||||||
|       echo "Successfully listed marker files for successful nodes" |  | ||||||
|       break |       break | ||||||
|     else |     else | ||||||
|       echo "Attempt ${retry} failed to list marker files for succeessful nodes" |       echo "Attempt ${retry} failed to list marker files for succeessful nodes" | ||||||
| @@ -333,15 +364,10 @@ function dump_nodes_with_logexporter() { | |||||||
|     fi |     fi | ||||||
|   done |   done | ||||||
|  |  | ||||||
|   # Collect names of nodes which didn't run logexporter successfully. |  | ||||||
|   # Note: This step is O(#nodes^2) as we check if each node is present in the list of succeeded nodes. |  | ||||||
|   # Making it linear would add code complexity without much benefit (as it just takes ~1s for 5k nodes). |  | ||||||
|   failed_nodes=() |   failed_nodes=() | ||||||
|   for node in "${NODE_NAMES[@]}"; do |   for node in "${NON_LOGEXPORTED_NODES[@]:-}"; do | ||||||
|     if [[ ! "${nodes_succeeded}" =~ "${node}" ]]; then |     echo "Logexporter didn't succeed on node ${node}. Queuing it for logdump through SSH." | ||||||
|       echo "Logexporter didn't succeed on node ${node}. Queuing it for logdump through SSH." |     failed_nodes+=("${node}") | ||||||
|       failed_nodes+=("${node}") |  | ||||||
|     fi |  | ||||||
|   done |   done | ||||||
|  |  | ||||||
|   # Delete the logexporter resources and dump logs for the failed nodes (if any) through SSH. |   # Delete the logexporter resources and dump logs for the failed nodes (if any) through SSH. | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user