hpc watcher tool
diff --git a/xos/hpc_observer/hpc_watcher.py b/xos/hpc_observer/hpc_watcher.py
new file mode 100644
index 0000000..16f1ef1
--- /dev/null
+++ b/xos/hpc_observer/hpc_watcher.py
@@ -0,0 +1,172 @@
+import os
+import sys
+sys.path.append("/opt/xos")
+os.environ.setdefault("DJANGO_SETTINGS_MODULE", "xos.settings")
+import django
+from django.contrib.contenttypes.models import ContentType
+from core.models import *

+from hpc.models import *

+from requestrouter.models import *

+django.setup()
+import time
+
+from dnslib.dns import DNSRecord,DNSHeader,DNSQuestion,QTYPE
+from dnslib.digparser import DigParser
+
+from threading import Thread, Condition
+
+class WorkQueue:
+    def __init__(self):
+        self.job_cv = Condition()
+        self.jobs = []
+        self.result_cv = Condition()
+        self.results = []
+        self.outstanding = 0
+
+    def get_job(self):
+        self.job_cv.acquire()
+        while not self.jobs:
+            self.job_cv.wait()
+        result = self.jobs.pop()
+        self.job_cv.release()
+        return result
+
+    def submit_job(self, job):
+        self.job_cv.acquire()
+        self.jobs.append(job)
+        self.job_cv.notify()
+        self.job_cv.release()
+        self.outstanding = self.outstanding + 1
+
+    def get_result(self):
+        self.result_cv.acquire()
+        while not self.results:
+            self.result_cv.wait()
+        result = self.results.pop()
+        self.result_cv.release()
+        self.outstanding = self.outstanding - 1
+        return result
+
+    def submit_result(self, result):
+        self.result_cv.acquire()
+        self.results.append(result)
+        self.result_cv.notify()
+        self.result_cv.release()
+
+class DnsResolver(Thread):
+    def __init__(self, queue):
+        Thread.__init__(self)
+        self.queue = queue
+        self.daemon = True
+        self.start()
+
+    def run(self):
+        while True:
+            job = self.queue.get_job()
+            self.handle_job(job)
+            self.queue.submit_result(job)
+
+    def handle_job(self, job):
+        domain = job["domain"]
+        server = job["server"]
+        port = job["port"]
+
+        try:
+            q = DNSRecord(q=DNSQuestion(domain, getattr(QTYPE,"A")))
+
+            a_pkt = q.send(server, port, tcp=False, timeout=10)
+            a = DNSRecord.parse(a_pkt)
+
+            found_a_record = False
+            for record in a.ar:
+                if (record.rtype==QTYPE.A):
+                    found_a_record=True
+                    print record
+
+            if not found_a_record:
+                job["status"] =  "%s,No A records" % domain
+                return
+
+        except Exception, e:
+            job["status"] = "%s,Exception: %s" % (domain, str(e))
+            return
+
+        job["status"] = "success"
+
+class HpcWatcher:
+    def __init__(self):
+        self.resolver_queue = WorkQueue()
+        for i in range(0,10):
+            DnsResolver(queue = self.resolver_queue)
+
+    def set_status(self, sliver, service, kind, msg):
+        print sliver.node.name, kind, msg
+        sliver.has_error = (msg!="success")
+
+        sliver_type = ContentType.objects.get_for_model(sliver)
+
+        t = Tag.objects.filter(service=service, name=kind+".msg", content_type__pk=sliver_type.id, object_id=sliver.id)
+        if t:
+            t=t[0]
+            if (t.value != msg):
+                t.value = msg
+                t.save()
+        else:
+            Tag(service=service, name=kind+".msg", content_object = sliver, value=msg).save()
+
+        t = Tag.objects.filter(service=service, name=kind+".time", content_type__pk=sliver_type.id, object_id=sliver.id)
+        if t:
+            t=t[0]
+            t.value = str(time.time())
+            t.save()
+        else:
+            Tag(service=service, name=kind+".time", content_object = sliver, value=str(time.time())).save()
+
+    def check_request_routers(self, service, slivers):
+        for sliver in slivers:
+            sliver.has_error = False
+
+            ip = sliver.get_public_ip(sliver)
+            if not ip:
+                self.set_status(sliver, service, "watcher.DNS", "no public IP")
+                continue
+
+            for domain in ["onlab1.vicci.org"]:
+                q = DNSRecord(q=DNSQuestion(domain, getattr(QTYPE,"A")))
+
+                self.resolver_queue.submit_job({"domain": domain, "server": ip, "port": 53, "sliver": sliver})
+
+        print self.resolver_queue.outstanding
+
+        while self.resolver_queue.outstanding > 0:
+            result = self.resolver_queue.get_result()
+            sliver = result["sliver"]
+            if (result["status"]!="success") and (not sliver.has_error):
+                self.set_status(sliver, service, "watcher.DNS", result["status"])
+
+        for sliver in slivers:
+            if not sliver.has_error:
+                self.set_status(sliver, service, "watcher.DNS", "success")
+
+    def get_service_slices(self, service, kind):
+        try:
+            slices = service.slices.all()
+        except:
+            # buggy data model
+            slices = service.service.all()
+
+        return [x for x in slices if (kind in x.name)]
+
+    def run_once(self):
+        for hpcService in HpcService.objects.all():
+            for slice in self.get_service_slices(hpcService, "dnsdemux"):
+                self.check_request_routers(hpcService, slice.slivers.all())
+
+        for rrService in RequestRouterService.objects.all():
+            for slice in self.get_service_slices(rrService, "dnsdemux"):
+                self.check_request_routers(rrService, slice.slivers.all())
+
+
+if __name__ == "__main__":
+    HpcWatcher().run_once()
+