Better handling for DNS resolution failures
diff --git a/xos/hpc_observer/hpc_watcher.py b/xos/hpc_observer/hpc_watcher.py
index 15adce9..9eb8afe 100644
--- a/xos/hpc_observer/hpc_watcher.py
+++ b/xos/hpc_observer/hpc_watcher.py
@@ -390,11 +390,15 @@
ip = sliver.get_public_ip()
if not ip:
- ip = socket.gethostbyname(sliver.node.name)
+ try:
+ ip = socket.gethostbyname(sliver.node.name)
+ except:
+ self.set_status(sliver, service, "watcher.DNS", "dns resolution failure")
+ continue
- #if not ip:
- # self.set_status(sliver, service, "watcher.DNS", "no public IP")
- # continue
+ if not ip:
+ self.set_status(sliver, service, "watcher.DNS", "no IP address")
+ continue
checks = HpcHealthCheck.objects.filter(kind="dns")
if not checks:
@@ -518,7 +522,15 @@
for sliver in slivers:
ip = sliver.get_public_ip()
if not ip:
- ip = socket.gethostbyname(sliver.node.name)
+ try:
+ ip = socket.gethostbyname(sliver.node.name)
+ except:
+ self.set_status(sliver, service, "watcher.watcher", "dns resolution failure")
+ continue
+
+ if not ip:
+ self.set_status(sliver, service, "watcher.watcher", "no IP address")
+ continue
port = 8015
if ("redir" in sliver.slice.name):