check hpc heartbeat
diff --git a/xos/hpc_observer/hpc_watcher.py b/xos/hpc_observer/hpc_watcher.py
index 79d212e..7dcd079 100644
--- a/xos/hpc_observer/hpc_watcher.py
+++ b/xos/hpc_observer/hpc_watcher.py
@@ -9,6 +9,9 @@
 from requestrouter.models import *

 django.setup()
 import time
+import pycurl
+import traceback
+from StringIO import StringIO
 
 from dnslib.dns import DNSRecord,DNSHeader,DNSQuestion,QTYPE
 from dnslib.digparser import DigParser
@@ -81,7 +84,6 @@
             for record in a.ar:
                 if (record.rtype==QTYPE.A):
                     found_a_record=True
-                    print record
 
             if not found_a_record:
                 job["status"] =  "%s,No A records" % domain
@@ -93,14 +95,73 @@
 
         job["status"] = "success"
 
+class HpcHeartbeat(Thread):
+    """Daemon worker thread that runs HTTP heartbeat probes pulled from a work queue."""
+    def __init__(self, queue):
+        Thread.__init__(self)
+        self.queue = queue
+        self.daemon = True
+        self.start()
+
+    def run(self):
+        while True:
+            job = self.queue.get_job()
+            self.handle_job(job)
+            self.queue.submit_result(job)
+
+    def curl_error_message(self, e):
+        # pycurl.error carries (code, message); 6 and 7 are libcurl's
+        # CURLE_COULDNT_RESOLVE_HOST and CURLE_COULDNT_CONNECT.
+        known = {6: "couldn't resolve host", 7: "failed to connect"}
+        return known.get(e.args[0], "curl error %d" % e.args[0])
+
+    def handle_job(self, job):
+        """Probe the heartbeat URL of job["server"]; record the outcome in job["status"]."""
+        server, port = job["server"], job["port"]
+        try:
+            body = StringIO()   # response body is collected but never inspected
+            c = pycurl.Curl()
+
+            c.setopt(c.URL, "http://%s:%s/heartbeat" % (server, port))
+            c.setopt(c.WRITEDATA, body)
+            c.setopt(c.HTTPHEADER, ['host: hpc-heartbeat', 'X-heartbeat: 1'])
+            c.setopt(c.TIMEOUT, 10)
+            c.setopt(c.CONNECTTIMEOUT, 10)
+            c.setopt(c.NOSIGNAL, 1)
+
+            try:
+                c.perform()
+                response_code = c.getinfo(c.RESPONSE_CODE)
+            except pycurl.error as e:
+                # transfer-level failure; any other exception falls through to the outer handler
+                job["status"] = self.curl_error_message(e)
+                return
+            finally:
+                c.close()
+
+            if response_code != 200:
+                # the handle is closed by now, so report the code saved before close()
+                job["status"] = "error response %d" % response_code
+                return
+
+        except Exception as e:
+            job["status"] = "Exception: %s" % str(e)
+            return
+
+        job["status"] = "success"
+
 class HpcWatcher:
     def __init__(self):
         self.resolver_queue = WorkQueue()
         for i in range(0,10):
             DnsResolver(queue = self.resolver_queue)
 
+        self.heartbeat_queue = WorkQueue()   # job queue shared by the ten HpcHeartbeat workers below
+        for i in range(0, 10):
+            HpcHeartbeat(queue = self.heartbeat_queue)
+
     def set_status(self, sliver, service, kind, msg):
-        print sliver.node.name, kind, msg
+        #print sliver.node.name, kind, msg
         sliver.has_error = (msg!="success")
 
         sliver_type = ContentType.objects.get_for_model(sliver)
@@ -131,13 +192,9 @@
                 self.set_status(sliver, service, "watcher.DNS", "no public IP")
                 continue
 
-            for domain in ["onlab1.vicci.org"]:
-                q = DNSRecord(q=DNSQuestion(domain, getattr(QTYPE,"A")))
-
+            for domain in ["onlab.vicci.org"]:
                 self.resolver_queue.submit_job({"domain": domain, "server": ip, "port": 53, "sliver": sliver})
 
-        print self.resolver_queue.outstanding
-
         while self.resolver_queue.outstanding > 0:
             result = self.resolver_queue.get_result()
             sliver = result["sliver"]
@@ -148,6 +205,22 @@
             if not sliver.has_error:
                 self.set_status(sliver, service, "watcher.DNS", "success")
 
+    def probe_hpc(self, service, slivers):
+        """Heartbeat-probe every sliver on port 8009 and record a "watcher.HPC-hb" status."""
+        for candidate in slivers:
+            candidate.has_error = False
+            self.heartbeat_queue.submit_job({"server": candidate.node.name, "port": 8009, "sliver": candidate})
+
+        while self.heartbeat_queue.outstanding > 0:
+            outcome = self.heartbeat_queue.get_result()
+            probed = outcome["sliver"]
+            if not (outcome["status"] == "success" or probed.has_error):
+                self.set_status(probed, service, "watcher.HPC-hb", outcome["status"])
+
+        for candidate in slivers:
+            if not candidate.has_error:
+                self.set_status(candidate, service, "watcher.HPC-hb", "success")
+
     def get_service_slices(self, service, kind):
         try:
             slices = service.slices.all()
@@ -166,7 +239,18 @@
             for slice in self.get_service_slices(rrService, "dnsdemux"):
                 self.check_request_routers(rrService, slice.slivers.all())
 
+        for hpcService in HpcService.objects.all():
+            for slice in self.get_service_slices(hpcService, "hpc"):
+                self.probe_hpc(hpcService, slice.slivers.all())
+
+    def run_loop(self):
+        while True:   # never exits on its own
+            self.run_once()
+            time.sleep(10)   # wait 10 seconds before the next run_once() pass
 
 if __name__ == "__main__":
-    HpcWatcher().run_once()
+    if "--once" in sys.argv:   # NOTE(review): no sys import is visible in these hunks - confirm it exists
+        HpcWatcher().run_once()
+    else:
+        HpcWatcher().run_loop()   # default: keep probing in a loop