| import re |
| import base64 |
| import requests |
| import urllib |
| import json |
| import httplib2 |
| import threading |
| import os |
| import sys |
| import time |
| import traceback |
| |
| from apiclient.discovery import build |
| from apiclient.errors import HttpError |
| from oauth2client.client import AccessTokenRefreshError |
| from oauth2client.client import OAuth2WebServerFlow |
| from oauth2client.client import flow_from_clientsecrets |
| from oauth2client.file import Storage |
from oauth2client.tools import run_flow, run
| |
| """ |
| yum -y install python-httplib2 |
| easy_install python_gflags |
| easy_install google_api_python_client |
| """ |
| |
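# Numeric id of the Google Cloud project under which the BigQuery query jobs are run.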
| PROJECT_NUMBER = '549187599759' |
| |
| try: |
| FLOW = flow_from_clientsecrets('/opt/planetstack/hpc_wizard/client_secrets.json', |
| scope='https://www.googleapis.com/auth/bigquery') |
| BIGQUERY_AVAILABLE = True |
except Exception:
| print >> sys.stderr, "exception while initializing bigquery flow" |
| traceback.print_exc() |
| FLOW = None |
| BIGQUERY_AVAILABLE = False |
| |
| MINUTE_MS = 60*1000 |
| HOUR_MS = 60*60*1000 |
| |
# globals holding cached per-table column-name mappings (friendly name -> BigQuery field, and the reverse)
| mappings = {} |
| reverse_mappings = {} |
| |
def to_number(s):
	# Coerce a BigQuery result value (returned as a string) to an int or float;
	# fall back to 0 for anything that cannot be parsed as a number.
	try:
		if "." in str(s):
			return float(s)
		else:
			return int(s)
	except (ValueError, TypeError):
		return 0
| |
| class MappingException(Exception): |
| pass |
| |
| class BigQueryAnalytics: |
| def __init__(self, table = "demoevents"): |
| self.projectName = "vicci" |
| self.tableName = table |
| |
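	# Fetch the column-name mapping for this table from the allocation service
	# and rebuild both the forward (friendly name -> BigQuery field) and the
	# reverse lookup tables.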
| def reload_mapping(self): |
| global mappings, reverse_mappings |
| mappings[self.tableName] = json.loads(self.fetch_mapping(table=self.tableName)) |
| reverse_mappings[self.tableName] = {v:k for k, v in mappings[self.tableName].items()} |
| |
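	# Retrieve the name-mapping JSON for the given table from the
	# cloud-scrutiny allocation service; raises on any non-200 response.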
| def fetch_mapping(self, m=0, table="events"): |
| req = 'http://cloud-scrutiny.appspot.com/command?action=get_allocations&multiplexer=%d&table=%s'% (m,table) |
| resp = requests.get(req) |
| if (resp.status_code==200): |
| return resp.text |
| else: |
| raise Exception('Error accessing register allocations: %d'%resp.status_code) |
| |
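	# Substitute %-prefixed tokens in the query with their BigQuery field names
	# (reloading the mapping once if a token is unknown), then submit the query
	# synchronously through the BigQuery API and return the raw response.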
| def run_query_raw(self, query): |
| try: |
| file("/tmp/query_log","a").write("query %s\n" % query) |
| except: |
| pass |
| |
		# matches %-prefixed tokens that will be remapped to BigQuery field names
		p = re.compile('%[a-zA-Z_]*')
| |
| try: |
| query = p.sub(self.remap, query) |
| except MappingException: |
| self.reload_mapping() |
| query = p.sub(self.remap, query) |
| |
| try: |
| file("/tmp/query_log","a").write("remapped query %s\n" % query) |
| except: |
| pass |
| |
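		# Load cached OAuth2 credentials; if they are missing or invalid, run the
		# (interactive) OAuth flow to obtain and store new ones.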
| storage = Storage('/opt/planetstack/hpc_wizard/bigquery_credentials.dat') |
| credentials = storage.get() |
| |
| if credentials is None or credentials.invalid: |
| credentials = run(FLOW, storage) |
| |
| http = httplib2.Http() |
| http = credentials.authorize(http) |
| |
| service = build('bigquery', 'v2', http=http) |
| |
| body = {"query": query, |
| "timeoutMs": 60000} |
| response = service.jobs().query(projectId=PROJECT_NUMBER, body=body).execute() |
| |
| return response |
| |
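	# Rewrite the field names in a raw BigQuery schema back to their
	# user-friendly names, in place.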
| def translate_schema(self, response): |
| for field in response["schema"]["fields"]: |
| field["name"] = reverse_mappings[self.tableName].get(field["name"], field["name"]) |
| |
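	# Run a query and flatten the BigQuery response into a list of dicts,
	# one per row, keyed by the user-friendly column names.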
| def run_query(self, query): |
| if not BIGQUERY_AVAILABLE: |
| print >> sys.stderr, "bigquery_analytics: bigquery flow is not available. returning empty result." |
| return [] |
| |
| response = self.run_query_raw(query) |
| |
| fieldNames = [] |
| for field in response["schema"]["fields"]: |
| fieldNames.append(field["name"]) |
| |
| result = [] |
| if "rows" in response: |
| for row in response["rows"]: |
| this_result = {} |
| for (i,column) in enumerate(row["f"]): |
| this_result[reverse_mappings[self.tableName].get(fieldNames[i],fieldNames[i])] = column["v"] |
| result.append(this_result) |
| |
| return result |
| |
| """ Filter_results, groupby_results, do_computed_fields, and postprocess_results |
| are all used for postprocessing queries. The idea is to do one query that |
| includes the ungrouped and unfiltered data, and cache it for multiple |
| consumers who will filter and group it as necessary. |
| |
	TODO: Find a more general mechanism for these sorts of operations. Perhaps
| put the results in SQLite and then run SQL queries against it. |
| """ |
| |
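	# A hypothetical usage sketch (not taken from an existing caller; the filter
	# value below is illustrative only): run one broad query, then let each
	# consumer filter and aggregate the cached rows.
	#
	#   bq = BigQueryAnalytics()
	#   rows = bq.run_query("select %hostname, %bytes_sent from [vicci.demoevents]")
	#   totals = bq.postprocess_results(rows, filter={"hostname": "node1"},
	#                                   groupBy=["hostname"], sum=["bytes_sent"])
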
| def filter_results(self, rows, name, value): |
| result = [row for row in rows if row.get(name)==value] |
| return result |
| |
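	# Group rows by the fields in groupBy and aggregate the remaining fields,
	# producing sum_<field>, avg_<field>, max_<field> and count_<field> (count
	# of distinct values) columns in the returned rows.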
| def groupby_results(self, rows, groupBy=[], sum=[], count=[], avg=[], maxi=[]): |
| new_rows = {} |
| for row in rows: |
| groupby_key = [row.get(k, None) for k in groupBy] |
| |
| if str(groupby_key) not in new_rows: |
| new_row = {} |
| for k in groupBy: |
| new_row[k] = row.get(k, None) |
| |
| new_rows[str(groupby_key)] = new_row |
| else: |
| new_row = new_rows[str(groupby_key)] |
| |
| for k in sum: |
| new_row["sum_" + k] = new_row.get("sum_" + k, 0) + to_number(row.get(k,0)) |
| |
| for k in avg: |
| new_row["avg_" + k] = new_row.get("avg_" + k, 0) + to_number(row.get(k,0)) |
| new_row["avg_base_" + k] = new_row.get("avg_base_"+k,0) + 1 |
| |
| for k in maxi: |
| new_row["max_" + k] = max(new_row.get("max_" + k, 0), to_number(row.get(k,0))) |
| |
| for k in count: |
| v = row.get(k,None) |
| dl = new_row["distinct_" + k] = new_row.get("distinct_" + k, []) |
| if (v not in dl): |
| dl.append(v) |
| |
| #new_row["count_" + k] = new_row.get("count_" + k, 0) + 1 |
| |
| for row in new_rows.values(): |
| for k in avg: |
| row["avg_" + k] = float(row["avg_" + k]) / row["avg_base_" + k] |
| del row["avg_base_" + k] |
| |
			for k in count:
				row["count_" + k] = len(row.get("distinct_" + k, []))
| |
| return new_rows.values() |
| |
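	# For each "numerator/denominator" entry in computed, add a
	# computed_<numerator>_div_<denominator> field to every row; returns the
	# list of generated field names along with the rows.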
| def do_computed_fields(self, rows, computed=[]): |
| computedFieldNames=[] |
| for row in rows: |
| for k in computed: |
| if "/" in k: |
| parts = k.split("/") |
| computedFieldName = "computed_" + parts[0].replace("%","")+"_div_"+parts[1].replace("%","") |
| try: |
						row[computedFieldName] = float(to_number(row[parts[0]])) / to_number(row[parts[1]])
| except: |
| pass |
| |
| if computedFieldName not in computedFieldNames: |
| computedFieldNames.append(computedFieldName) |
| return (computedFieldNames, rows) |
| |
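	# Full post-processing pipeline: strip the leading "%" from field names,
	# apply equality filters, optionally keep only rows within maxDeltaTime of
	# the newest "time" value, add computed fields, and finally group/aggregate.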
| def postprocess_results(self, rows, filter={}, groupBy=[], sum=[], count=[], avg=[], computed=[], maxi=[], maxDeltaTime=None): |
| sum = [x.replace("%","") for x in sum] |
| count = [x.replace("%","") for x in count] |
| avg = [x.replace("%","") for x in avg] |
| computed = [x.replace("%","") for x in computed] |
| maxi = [x.replace("%","") for x in maxi] |
| groupBy = [x.replace("%","") for x in groupBy] |
| |
| for (k,v) in filter.items(): |
| rows = self.filter_results(rows, k, v) |
| |
| if rows: |
| if maxDeltaTime is not None: |
| maxTime = max([float(row["time"]) for row in rows]) |
| rows = [row for row in rows if float(row["time"])>=maxTime-maxDeltaTime] |
| |
| (computedFieldNames, rows) = self.do_computed_fields(rows, computed) |
| sum = sum + computedFieldNames |
| if groupBy: |
| rows = self.groupby_results(rows, groupBy, sum, count, avg, maxi) |
| return rows |
| |
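	# re.sub callback: translate a "%token" from a query into the corresponding
	# BigQuery field name, raising MappingException if no mapping is loaded or
	# the token is unknown.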
| def remap(self, match): |
		if self.tableName not in mappings:
| raise MappingException("no mapping for table %s" % self.tableName) |
| |
| mapping = mappings[self.tableName] |
| |
| token = match.group()[1:] |
| if token in mapping: |
| return mapping[token] |
| else: |
| raise MappingException('unknown token %s' % token) |
| |
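	# Pretty-print rows as a column-aligned table on stdout; keys defaults to
	# the keys of the first row.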
| def dump_table(self, rows, keys=None): |
| if not keys: |
| keys = rows[0].keys() |
| |
| lens = {} |
| for key in keys: |
| lens[key] = len(key) |
| |
| for row in rows: |
| for key in keys: |
| thislen = len(str(row.get(key,""))) |
| lens[key] = max(lens.get(key,0), thislen) |
| |
| for key in keys: |
| print "%*s" % (lens[key], key), |
| print |
| |
| for row in rows: |
| for key in keys: |
| print "%*s" % (lens[key], str(row.get(key,""))), |
| print |
| |
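	# Convert a BigQuery schema into a list of chart-style column descriptors
	# ({"type", "id", "label"}), translating field names back to their
	# user-friendly labels.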
| def schema_to_cols(self, schema): |
| fields = schema["fields"] |
| |
| colTypes = {"STRING": "string", "INTEGER": "number", "FLOAT": "number", "TIMESTAMP": "date"} |
| |
		cols = []
		for i, field in enumerate(fields):
			col = {"type": colTypes[field["type"]],
			       "id": "Col%d" % i,
			       "label": reverse_mappings[self.tableName].get(field["name"], field["name"])}
			cols.append(col)
| |
| return cols |
| |
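# Simple self-test: run a small aggregate query against the demo table and
# dump the result as a formatted table. Requires valid BigQuery credentials.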
| def main(): |
| bq = BigQueryAnalytics() |
| |
| rows = bq.run_query("select %hostname,SUM(%bytes_sent) from [vicci.demoevents] group by %hostname") |
| |
| bq.dump_table(rows) |
| |
| if __name__ == "__main__": |
| main() |