From de442ef8604305e9cf762b2d8733a9d3fcd9480f Mon Sep 17 00:00:00 2001
From: Piotr Tabor <ptab@google.com>
Date: Fri, 7 May 2021 11:45:31 +0200
Subject: [PATCH] Retry hostname->IP: [Errno -2] Name or service not known

During cluster configuration, the hostname is getting resolved to IP,
as etcd requires IP address as listening address.

Due to connectivity flakes or delayed network inititalization, sometimes
the IP fails to be resolved to a name with following error:
```
[Errno -2] Name or service not known
```
that leads to attempt to run etcd with empty flag.

The PR adds a proper retry (up to 5 minutes) in case the connectivity
problems happens.

I considered alternatives like: `getent hosts foo`, but unfortunetelly thay
can return IPv6 that etcd is not ready for (yet).
---
 cluster/gce/gci/configure-helper.sh | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/cluster/gce/gci/configure-helper.sh b/cluster/gce/gci/configure-helper.sh
index aee4ff19fef..e4537856419 100644
--- a/cluster/gce/gci/configure-helper.sh
+++ b/cluster/gce/gci/configure-helper.sh
@@ -1776,7 +1776,27 @@ function start-kube-proxy {
 # $5: pod name, which should be either etcd or etcd-events
 function prepare-etcd-manifest {
   local host_name=${ETCD_HOSTNAME:-$(hostname -s)}
-  local -r host_ip=$(python3 -c "import socket;print(socket.gethostbyname(\"${host_name}\"))")
+
+  local resolve_host_script_py='
+import socket
+import time
+import sys
+
+timeout_sec=300
+
+def resolve(host):
+  for attempt in range(timeout_sec):
+    try:
+      print(socket.gethostbyname(host))
+      break
+    except Exception as e:
+      sys.stderr.write("error: resolving host %s to IP failed: %s\n" % (host, e))
+      time.sleep(1)
+      continue
+
+'
+
+  local -r host_ip=$(python3 -c "${resolve_host_script_py}"$'\n'"resolve(\"${host_name}\")")
   local etcd_cluster=""
   local cluster_state="new"
   local etcd_protocol="http"