Add HPC heartbeat check to hpc_watcher
diff --git a/xos/hpc_observer/hpc_watcher.py b/xos/hpc_observer/hpc_watcher.py
index 79d212e..7dcd079 100644
--- a/xos/hpc_observer/hpc_watcher.py
+++ b/xos/hpc_observer/hpc_watcher.py
@@ -9,6 +9,9 @@
from requestrouter.models import *
django.setup()
import time
+import pycurl
+import traceback
+from StringIO import StringIO
from dnslib.dns import DNSRecord,DNSHeader,DNSQuestion,QTYPE
from dnslib.digparser import DigParser
@@ -81,7 +84,6 @@
for record in a.ar:
if (record.rtype==QTYPE.A):
found_a_record=True
- print record
if not found_a_record:
job["status"] = "%s,No A records" % domain
@@ -93,14 +95,73 @@
job["status"] = "success"
+class HpcHeartbeat(Thread):
+ def __init__(self, queue):
+ Thread.__init__(self)
+ self.queue = queue
+ self.daemon = True
+ self.start()
+
+ def run(self):
+ while True:
+ job = self.queue.get_job()
+ self.handle_job(job)
+ self.queue.submit_result(job)
+
+ def curl_error_message(self, e):
+ if e.args[0] == 6:
+ return "couldn't resolve host"
+ if e.args[0] == 7:
+ return "failed to connect"
+ return "curl error %d" % e.args[0]
+
+ def handle_job(self, job):
+ server = job["server"]
+ port = job["port"]
+
+ try:
+ buffer = StringIO()
+ c = pycurl.Curl()
+
+ c.setopt(c.URL, "http://%s:%s/heartbeat" % (server, port))
+ c.setopt(c.WRITEDATA, buffer)
+ c.setopt(c.HTTPHEADER, ['host: hpc-heartbeat', 'X-heartbeat: 1'])
+ c.setopt(c.TIMEOUT, 10)
+ c.setopt(c.CONNECTTIMEOUT, 10)
+ c.setopt(c.NOSIGNAL, 1)
+
+ try:
+ c.perform()
+ response_code = c.getinfo(c.RESPONSE_CODE)
+ except Exception, e:
+ #traceback.print_exc()
+ job["status"] = self.curl_error_message(e)
+ return
+ finally:
+ c.close()
+
+ if response_code != 200:
+ job["status"] = "error response %d" % c.getinfo(c.RESPONSE_CODE)
+ return
+
+ except Exception, e:
+ job["status"] = "Exception: %s" % str(e)
+ return
+
+ job["status"] = "success"
+
class HpcWatcher:
def __init__(self):
+ # Create one work queue per probe type and ten worker objects for each.
+ # The workers are not kept in variables; they are handed only their queue.
self.resolver_queue = WorkQueue()
for i in range(0,10):
DnsResolver(queue = self.resolver_queue)
+ # Heartbeat probes get their own queue. Each HpcHeartbeat starts itself
+ # as a daemon thread inside its constructor.
+ self.heartbeat_queue = WorkQueue()
+ for i in range(0, 10):
+ HpcHeartbeat(queue = self.heartbeat_queue)
+
def set_status(self, sliver, service, kind, msg):
- print sliver.node.name, kind, msg
+ #print sliver.node.name, kind, msg
sliver.has_error = (msg!="success")
sliver_type = ContentType.objects.get_for_model(sliver)
@@ -131,13 +192,9 @@
self.set_status(sliver, service, "watcher.DNS", "no public IP")
continue
- for domain in ["onlab1.vicci.org"]:
- q = DNSRecord(q=DNSQuestion(domain, getattr(QTYPE,"A")))
-
+ for domain in ["onlab.vicci.org"]:
self.resolver_queue.submit_job({"domain": domain, "server": ip, "port": 53, "sliver": sliver})
- print self.resolver_queue.outstanding
-
while self.resolver_queue.outstanding > 0:
result = self.resolver_queue.get_result()
sliver = result["sliver"]
@@ -148,6 +205,22 @@
if not sliver.has_error:
self.set_status(sliver, service, "watcher.DNS", "success")
+ def probe_hpc(self, service, slivers):
+ # Submit a heartbeat job (port 8009, addressed by node name) for the
+ # given slivers, wait for the results, and record a "watcher.HPC-hb"
+ # status for each sliver through set_status().
+ for sliver in slivers:
+ # Start from a clean flag; set_status() sets has_error for any message
+ # other than "success".
+ sliver.has_error = False
+
+ self.heartbeat_queue.submit_job({"server": sliver.node.name, "port": 8009, "sliver": sliver})
+
+ # Collect results while the queue still reports outstanding jobs.
+ while self.heartbeat_queue.outstanding > 0:
+ result = self.heartbeat_queue.get_result()
+ sliver = result["sliver"]
+ # Record a failure only if this sliver has no error recorded yet.
+ if (result["status"]!="success") and (not sliver.has_error):
+ self.set_status(sliver, service, "watcher.HPC-hb", result["status"])
+
+ # Any sliver without a recorded failure is reported as healthy.
+ for sliver in slivers:
+ if not sliver.has_error:
+ self.set_status(sliver, service, "watcher.HPC-hb", "success")
+
def get_service_slices(self, service, kind):
try:
slices = service.slices.all()
@@ -166,7 +239,18 @@
for slice in self.get_service_slices(rrService, "dnsdemux"):
self.check_request_routers(rrService, slice.slivers.all())
+ for hpcService in HpcService.objects.all():
+ for slice in self.get_service_slices(hpcService, "hpc"):
+ self.probe_hpc(hpcService, slice.slivers.all())
+
+ def run_loop(self):
+ # Run the checks forever, pausing 10 seconds after each run_once() pass.
+ # There is no exit condition here.
+ while True:
+ self.run_once()
+ time.sleep(10)
if __name__ == "__main__":
- HpcWatcher().run_once()
+ # "--once" on the command line performs a single pass; otherwise the
+ # watcher keeps polling through run_loop().
+ # NOTE(review): sys is not imported in the hunks shown here - confirm it
+ # is imported earlier in the file.
+ if "--once" in sys.argv:
+ HpcWatcher().run_once()
+ else:
+ HpcWatcher().run_loop()