Blame - planetstack/util/xml.py - xos

blob: d3aefb1e379686509363834f3d8807523f174669 [file] [log] [blame]

S.Çağlar Onur	3e92b4d	2015-02-09 13:34:11 -0500	[diff] [blame]	1	#!/usr/bin/env python
Tony Mack	6393571	2013-03-26 13:23:12 -0400	[diff] [blame]	2	from types import StringTypes
				3	from lxml import etree
				4	from StringIO import StringIO
				5
				6	# helper functions to help build xpaths
				7	class XpathFilter:
				8	@staticmethod
				9
				10	def filter_value(key, value):
				11	xpath = ""
				12	if isinstance(value, str):
				13	if '*' in value:
				14	value = value.replace('*', '')
				15	xpath = 'contains(%s, "%s")' % (key, value)
				16	else:
				17	xpath = '%s="%s"' % (key, value)
				18	return xpath
				19
				20	@staticmethod
				21	def xpath(filter={}):
				22	xpath = ""
				23	if filter:
				24	filter_list = []
				25	for (key, value) in filter.items():
				26	if key == 'text':
				27	key = 'text()'
				28	else:
				29	key = '@'+key
				30	if isinstance(value, str):
				31	filter_list.append(XpathFilter.filter_value(key, value))
				32	elif isinstance(value, list):
				33	stmt = ' or '.join([XpathFilter.filter_value(key, str(val)) for val in value])
				34	filter_list.append(stmt)
				35	if filter_list:
				36	xpath = ' and '.join(filter_list)
				37	xpath = '[' + xpath + ']'
				38	return xpath
				39
				40	# a wrapper class around lxml.etree._Element
				41	# the reason why we need this one is because of the limitations
				42	# we've found in xpath to address documents with multiple namespaces defined
				43	# in a nutshell, we deal with xml documents that have
				44	# a default namespace defined (xmlns="http://default.com/") and specific prefixes defined
				45	# (xmlns:foo="http://foo.com")
				46	# according to the documentation instead of writing
				47	# element.xpath ( "//node/foo:subnode" )
				48	# we'd then need to write xpaths like
				49	# element.xpath ( "//{http://default.com/}node/{http://foo.com}subnode" )
				50	# which is a real pain..
				51	# So just so we can keep some reasonable programming style we need to manage the
				52	# namespace map that goes with the _Element (its internal .nsmap being immutable)
				53	class XmlElement:
				54	def __init__(self, element, namespaces):
				55	self.element = element
				56	self.namespaces = namespaces
				57
				58	# redefine as few methods as possible
				59	def xpath(self, xpath, namespaces=None):
				60	if not namespaces:
				61	namespaces = self.namespaces
				62	elems = self.element.xpath(xpath, namespaces=namespaces)
				63	return [XmlElement(elem, namespaces) for elem in elems]
				64
				65	def add_element(self, tagname, **kwds):
				66	element = etree.SubElement(self.element, tagname, **kwds)
				67	return XmlElement(element, self.namespaces)
				68
				69	def append(self, elem):
				70	if isinstance(elem, XmlElement):
				71	self.element.append(elem.element)
				72	else:
				73	self.element.append(elem)
				74
				75	def getparent(self):
				76	return XmlElement(self.element.getparent(), self.namespaces)
				77
				78	def get_instance(self, instance_class=None, fields=[]):
				79	"""
				80	Returns an instance (dict) of this xml element. The instance
				81	holds a reference to this xml element.
				82	"""
				83	if not instance_class:
				84	instance_class = Object
				85	if not fields and hasattr(instance_class, 'fields'):
				86	fields = instance_class.fields
				87
				88	if not fields:
				89	instance = instance_class(self.attrib, self)
				90	else:
				91	instance = instance_class({}, self)
				92	for field in fields:
				93	if field in self.attrib:
				94	instance[field] = self.attrib[field]
				95	return instance
				96
				97	def add_instance(self, name, instance, fields=[]):
				98	"""
				99	Adds the specifed instance(s) as a child element of this xml
				100	element.
				101	"""
				102	if not fields and hasattr(instance, 'keys'):
				103	fields = instance.keys()
				104	elem = self.add_element(name)
				105	for field in fields:
				106	if field in instance and instance[field]:
				107	elem.set(field, unicode(instance[field]))
				108	return elem
				109
				110	def remove_elements(self, name):
				111	"""
				112	Removes all occurences of an element from the tree. Start at
				113	specified root_node if specified, otherwise start at tree's root.
				114	"""
				115
				116	if not element_name.startswith('//'):
				117	element_name = '//' + element_name
				118	elements = self.element.xpath('%s ' % name, namespaces=self.namespaces)
				119	for element in elements:
				120	parent = element.getparent()
				121	parent.remove(element)
				122
				123	def delete(self):
				124	parent = self.getparent()
				125	parent.remove(self)
				126
				127	def remove(self, element):
				128	if isinstance(element, XmlElement):
				129	self.element.remove(element.element)
				130	else:
				131	self.element.remove(element)
				132
				133	def set_text(self, text):
				134	self.element.text = text
				135
				136	# Element does not have unset ?!?
				137	def unset(self, key):
				138	del self.element.attrib[key]
				139
				140	def toxml(self):
				141	return etree.tostring(self.element, encoding='UTF-8', pretty_print=True)
				142
				143	def __str__(self):
				144	return self.toxml()
				145
				146	# are redirected on self.element
				147	def __getattr__ (self, name):
				148	if not hasattr(self.element, name):
				149	raise AttributeError, name
				150	return getattr(self.element, name)
				151
				152	class Xml:
				153
				154	def __init__(self, xml=None, namespaces=None):
				155	self.root = None
				156	self.namespaces = namespaces
				157	self.default_namespace = None
				158	self.schema = None
				159	if isinstance(xml, basestring):
				160	self.parse_xml(xml)
				161	if isinstance(xml, XmlElement):
				162	self.root = xml
				163	self.namespaces = xml.namespaces
				164	elif isinstance(xml, etree._ElementTree) or isinstance(xml, etree._Element):
				165	self.parse_xml(etree.tostring(xml))
				166
				167	def parse_xml(self, xml):
				168	"""
				169	parse rspec into etree
				170	"""
				171	parser = etree.XMLParser(remove_blank_text=True)
				172	try:
				173	tree = etree.parse(xml, parser)
				174	except IOError:
				175	# 'rspec' file doesnt exist. 'rspec' is proably an xml string
				176	try:
				177	tree = etree.parse(StringIO(xml), parser)
				178	except Exception, e:
				179	raise Exception, str(e)
				180	root = tree.getroot()
				181	self.namespaces = dict(root.nsmap)
				182	# set namespaces map
				183	if 'default' not in self.namespaces and None in self.namespaces:
				184	# If the 'None' exist, then it's pointing to the default namespace. This makes
				185	# it hard for us to write xpath queries for the default naemspace because lxml
				186	# wont understand a None prefix. We will just associate the default namespeace
				187	# with a key named 'default'.
				188	self.namespaces['default'] = self.namespaces.pop(None)
				189
				190	else:
				191	self.namespaces['default'] = 'default'
				192
				193	self.root = XmlElement(root, self.namespaces)
				194	# set schema
				195	for key in self.root.attrib.keys():
				196	if key.endswith('schemaLocation'):
				197	# schemaLocation should be at the end of the list.
				198	# Use list comprehension to filter out empty strings
				199	schema_parts = [x for x in self.root.attrib[key].split(' ') if x]
				200	self.schema = schema_parts[1]
				201	namespace, schema = schema_parts[0], schema_parts[1]
				202	break
				203
				204	def parse_dict(self, d, root_tag_name='xml', element = None):
				205	if element is None:
				206	if self.root is None:
				207	self.parse_xml('<%s/>' % root_tag_name)
				208	element = self.root.element
				209
				210	if 'text' in d:
				211	text = d.pop('text')
				212	element.text = text
				213
				214	# handle repeating fields
				215	for (key, value) in d.items():
				216	if isinstance(value, list):
				217	value = d.pop(key)
				218	for val in value:
				219	if isinstance(val, dict):
				220	child_element = etree.SubElement(element, key)
				221	self.parse_dict(val, key, child_element)
				222	elif isinstance(val, basestring):
				223	child_element = etree.SubElement(element, key).text = val
				224
				225	elif isinstance(value, int):
				226	d[key] = unicode(d[key])
				227	elif value is None:
				228	d.pop(key)
				229
				230	# element.attrib.update will explode if DateTimes are in the
				231	# dcitionary.
				232	d=d.copy()
				233	# looks like iteritems won't stand side-effects
				234	for k in d.keys():
				235	if not isinstance(d[k],StringTypes):
				236	del d[k]
				237
				238	element.attrib.update(d)
				239
				240	def validate(self, schema):
				241	"""
				242	Validate against rng schema
				243	"""
				244	relaxng_doc = etree.parse(schema)
				245	relaxng = etree.RelaxNG(relaxng_doc)
				246	if not relaxng(self.root):
				247	error = relaxng.error_log.last_error
				248	message = "%s (line %s)" % (error.message, error.line)
				249	raise Exception, message
				250	return True
				251
				252	def xpath(self, xpath, namespaces=None):
				253	if not namespaces:
				254	namespaces = self.namespaces
				255	return self.root.xpath(xpath, namespaces=namespaces)
				256
				257	def set(self, key, value):
				258	return self.root.set(key, value)
				259
				260	def remove_attribute(self, name, element=None):
				261	if not element:
				262	element = self.root
				263	element.remove_attribute(name)
				264
				265	def add_element(self, args, *kwds):
				266	"""
				267	Wrapper around etree.SubElement(). Adds an element to
				268	specified parent node. Adds element to root node is parent is
				269	not specified.
				270	"""
				271	return self.root.add_element(args, *kwds)
				272
				273	def remove_elements(self, name, element = None):
				274	"""
				275	Removes all occurences of an element from the tree. Start at
				276	specified root_node if specified, otherwise start at tree's root.
				277	"""
				278	if not element:
				279	element = self.root
				280
				281	element.remove_elements(name)
				282
				283	def add_instance(self, args, *kwds):
				284	return self.root.add_instance(args, *kwds)
				285
				286	def get_instance(self, args, *kwds):
				287	return self.root.get_instnace(args, *kwds)
				288
				289	def get_element_attributes(self, elem=None, depth=0):
				290	if elem == None:
				291	elem = self.root
				292	if not hasattr(elem, 'attrib'):
				293	# this is probably not an element node with attribute. could be just and an
				294	# attribute, return it
				295	return elem
				296	attrs = dict(elem.attrib)
				297	attrs['text'] = str(elem.text).strip()
				298	attrs['parent'] = elem.getparent()
				299	if isinstance(depth, int) and depth > 0:
				300	for child_elem in list(elem):
				301	key = str(child_elem.tag)
				302	if key not in attrs:
				303	attrs[key] = [self.get_element_attributes(child_elem, depth-1)]
				304	else:
				305	attrs[key].append(self.get_element_attributes(child_elem, depth-1))
				306	else:
				307	attrs['child_nodes'] = list(elem)
				308	return attrs
				309
				310	def append(self, elem):
				311	return self.root.append(elem)
				312
				313	def iterchildren(self):
				314	return self.root.iterchildren()
				315
				316	def merge(self, in_xml):
				317	pass
				318
				319	def __str__(self):
				320	return self.toxml()
				321
				322	def toxml(self):
				323	return etree.tostring(self.root.element, encoding='UTF-8', pretty_print=True)
				324
				325	# XXX smbaker, for record.load_from_string
				326	def todict(self, elem=None):
				327	if elem is None:
				328	elem = self.root
				329	d = {}
				330	d.update(elem.attrib)
				331	d['text'] = elem.text
				332	for child in elem.iterchildren():
				333	if child.tag not in d:
				334	d[child.tag] = []
				335	d[child.tag].append(self.todict(child))
				336
				337	if len(d)==1 and ("text" in d):
				338	d = d["text"]
				339
				340	return d
				341
				342	def save(self, filename):
				343	f = open(filename, 'w')
				344	f.write(self.toxml())
				345	f.close()
				346
				347