1 | # -*- coding: utf-8 -*- |
---|
2 | # |
---|
3 | # Copyright (C) 2005-2007 Christopher Lenz <cmlenz@gmx.de> |
---|
4 | # Copyright (C) 2007-2010 Edgewall Software |
---|
5 | # All rights reserved. |
---|
6 | # |
---|
7 | # This software is licensed as described in the file COPYING, which |
---|
8 | # you should have received as part of this distribution. The terms |
---|
9 | # are also available at http://bitten.edgewall.org/wiki/License. |
---|
10 | |
---|
11 | """Utility code for easy input and output of XML. |
---|
12 | |
---|
13 | The current implementation uses ``xml.dom.minidom`` under the hood for parsing. |
---|
14 | """ |
---|
15 | |
---|
16 | import os |
---|
17 | try: |
---|
18 | from cStringIO import StringIO |
---|
19 | except ImportError: |
---|
20 | from StringIO import StringIO |
---|
21 | from UserDict import DictMixin |
---|
22 | |
---|
23 | import cgi |
---|
24 | import string |
---|
25 | |
---|
# Public names of this module.  `ParseError` is exported as well because
# `parse()` raises it, so callers need the name to handle malformed input.
__all__ = ['Fragment', 'Element', 'ParsedElement', 'ParseError', 'parse']
# Markup flavour used by the docstrings in this module.
__docformat__ = 'restructuredtext en'
---|
28 | |
---|
29 | def _from_utf8(text): |
---|
30 | """Convert utf-8 string to unicode. All other input returned as-is.""" |
---|
31 | if isinstance(text, str): |
---|
32 | return text.decode('utf-8') |
---|
33 | else: |
---|
34 | return text |
---|
35 | |
---|
36 | def _to_utf8(text): |
---|
37 | """Convert any input to utf-8 byte string.""" |
---|
38 | if isinstance(text, str): |
---|
39 | return text # presumes utf-8 |
---|
40 | elif not isinstance(text, unicode): |
---|
41 | text = unicode(text) |
---|
42 | return text.encode('utf-8') |
---|
43 | |
---|
44 | __trans = string.maketrans('', '') |
---|
45 | # http://www.w3.org/TR/xml11/#charsets (partial) |
---|
46 | __todel = ('\x01\x02\x03\x04\x05\x06\x07\x08\x0b\x0c\x0e\x0f\x10\x12\x13\x14' |
---|
47 | '\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f\x7f\x80\x81\x82\x83' |
---|
48 | '\x84\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94' |
---|
49 | '\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f') |
---|
50 | __uni_trans = dict([(ord(c), None) for c in __todel]) |
---|
51 | |
---|
def _escape_text(text):
    """Escape special characters in the provided text so that it can be safely
    included in XML text nodes.

    Characters listed in `__todel` are removed, then ``&``, ``<`` and ``>``
    are replaced by their entities via `cgi.escape`.

    :param text: a UTF-8 byte string, a unicode string, or any other object
    :return: the escaped text, of the same string type as the input; objects
             that are not strings are returned unchanged
    """
    if isinstance(text, str):
        try:
            decoded = text.decode('utf-8')
        except UnicodeDecodeError:
            # Not valid UTF-8, so the bytes cannot be interpreted as
            # characters: fall back to stripping byte by byte.
            return cgi.escape(text.translate(__trans, __todel))
        # Strip on characters rather than bytes: the bytes 0x80-0x9f also
        # occur as continuation bytes of multi-byte UTF-8 sequences (for
        # example u'\xc0' is '\xc3\x80'), and deleting them byte-wise would
        # corrupt such characters.
        return cgi.escape(decoded.translate(__uni_trans)).encode('utf-8')
    elif isinstance(text, unicode):
        return cgi.escape(text.translate(__uni_trans))
    return text
---|
61 | |
---|
def _escape_attr(attr):
    """Escape special characters in the provided text so that it can be safely
    included in XML attribute values.

    On top of what `_escape_text` does, double quotes are replaced by
    ``&quot;`` so that the result can be placed between double quotes.

    :param attr: the attribute value; only strings are escaped
    :return: the escaped string, or `attr` itself if it is not a string
    """
    if isinstance(attr, basestring):
        # `_escape_text` runs first so that the ampersand of the entity
        # inserted here is not escaped a second time.
        return _escape_text(attr).replace('"', '&quot;')
    else:
        return attr
---|
70 | |
---|
71 | |
---|
class Fragment(object):
    """A collection of XML elements."""
    __slots__ = ['children']

    def __init__(self):
        """Create an empty XML fragment."""
        self.children = []

    def __getitem__(self, nodes):
        """Add nodes to the fragment.

        :param nodes: a single node, or a list or tuple of nodes; each one is
                      passed to `append()`
        :return: the fragment itself, so that calls can be chained
        """
        if not isinstance(nodes, (list, tuple)):
            nodes = [nodes]
        for node in nodes:
            self.append(node)
        return self

    def __str__(self):
        """Return a string representation of the XML fragment."""
        buf = StringIO()
        self.write(buf)
        return buf.getvalue()

    def append(self, node):
        """Append an element or fragment as child.

        Elements are added as a single child, whereas the children of a plain
        fragment are merged into this one.  ``None`` and the empty string are
        ignored.  Any other value becomes a text node stored as unicode: byte
        strings are decoded as UTF-8, other objects are converted with
        ``unicode()``.
        """
        # `Element` must be tested first: it is a `Fragment` subclass but has
        # to be kept as a node of its own instead of being flattened.
        if isinstance(node, Element):
            self.children.append(node)
        elif isinstance(node, Fragment):
            self.children += node.children
        elif node is not None and node != '':
            if isinstance(node, basestring):
                self.children.append(_from_utf8(node))
            else:
                self.children.append(unicode(node))

    def write(self, out, newlines=False):
        """Serializes the fragment and writes the XML to the given output
        stream.

        :param out: a file-like object with a ``write()`` method; it receives
                    UTF-8 encoded byte strings
        :param newlines: passed on to the ``write()`` method of child
                         elements; text nodes are not affected by it
        """
        for child in self.children:
            if isinstance(child, (Element, ParsedElement)):
                child.write(out, newlines=newlines)
            elif child.startswith('<'):
                # Text that looks like markup goes into a CDATA section.  Such
                # a section ends at the first "]]>", so any occurrence inside
                # the text is split over two adjacent sections to keep the
                # output well-formed.
                cdata = child.replace(']]>', ']]]]><![CDATA[>')
                out.write('<![CDATA[' + _to_utf8(cdata) + ']]>')
            else:
                out.write(_to_utf8(_escape_text(child)))
---|
118 | |
---|
119 | |
---|
class Element(Fragment):
    """Simple XML output generator based on the builder pattern.

    Construct XML elements by passing the tag name to the constructor:

    >>> print Element('foo')
    <foo/>

    Attributes can be specified using keyword arguments. The values of the
    arguments will be converted to strings and any special XML characters
    escaped:

    >>> print Element('foo', bar=42)
    <foo bar="42"/>
    >>> print Element('foo', bar='1 < 2')
    <foo bar="1 &lt; 2"/>
    >>> print Element('foo', bar='"baz"')
    <foo bar="&quot;baz&quot;"/>

    The order in which attributes are rendered is undefined.

    Elements can be nested using item access notation:

    >>> print Element('foo')[Element('bar'), Element('baz')]
    <foo><bar/><baz/></foo>

    Text nodes can be nested in an element by using strings instead of elements
    in item access. Any special characters in the strings are escaped
    automatically:

    >>> print Element('foo')['Hello world']
    <foo>Hello world</foo>
    >>> print Element('foo')[42]
    <foo>42</foo>
    >>> print Element('foo')['1 < 2']
    <foo>1 &lt; 2</foo>

    This technique also allows mixed content:

    >>> print Element('foo')['Hello ', Element('b')['world']]
    <foo>Hello <b>world</b></foo>

    Finally, text starting with an opening angle bracket is treated specially:
    under the assumption that the text actually contains XML itself, the whole
    thing is wrapped in a CDATA block instead of escaping all special characters
    individually:

    >>> print Element('foo')['<bar a="3" b="4"><baz/></bar>']
    <foo><![CDATA[<bar a="3" b="4"><baz/></bar>]]></foo>

    Valid input are utf-8 or unicode strings, or any type easily converted
    to unicode such as integers. Output is always utf-8.
    """
    __slots__ = ['name', 'attr']

    def __init__(self, name_, **attr):
        """Create an XML element using the specified tag name.

        The tag name must be supplied as the first positional argument. All
        keyword arguments following it are handled as attributes of the element.
        Attributes whose value is ``None`` are left out.
        """
        Fragment.__init__(self)
        self.name = _from_utf8(name_)
        # Names and values given as byte strings are decoded from UTF-8;
        # values of other types are stored unchanged.
        self.attr = dict([(_from_utf8(name), _from_utf8(value))
                          for name, value in attr.items()
                          if value is not None])

    def write(self, out, newlines=False):
        """Serializes the element and writes the XML to the given output
        stream.

        :param out: a file-like object with a ``write()`` method; it receives
                    UTF-8 encoded byte strings
        :param newlines: if true, ``os.linesep`` is written after the element
                         (and, recursively, after every child element)
        """
        out.write('<')
        out.write(_to_utf8(self.name))
        for name, value in self.attr.items():
            escaped = _escape_attr(value)
            if isinstance(escaped, basestring):
                # The value is delimited by double quotes below, so make sure
                # none is left in it.  This has no effect on a value in which
                # quotes have already been turned into entities.
                escaped = escaped.replace('"', '&quot;')
            out.write(_to_utf8(' %s="%s"' % (name, escaped)))
        if self.children:
            out.write('>')
            Fragment.write(self, out, newlines)
            out.write('</' + _to_utf8(self.name) + '>')
        else:
            # No content: use the short empty-element form.
            out.write('/>')
        if newlines:
            out.write(os.linesep)
---|
203 | |
---|
204 | |
---|
class ParseError(Exception):
    """Raised when an XML document cannot be parsed."""
---|
207 | |
---|
208 | |
---|
209 | def parse(text_or_file): |
---|
210 | """Parse an XML document provided as string or file-like object. |
---|
211 | |
---|
212 | Returns an instance of `ParsedElement` that can be used to traverse the |
---|
213 | parsed document. |
---|
214 | """ |
---|
215 | from xml.dom import minidom |
---|
216 | from xml.parsers import expat |
---|
217 | try: |
---|
218 | if isinstance(text_or_file, basestring): |
---|
219 | dom = minidom.parseString(_to_utf8(text_or_file)) |
---|
220 | else: |
---|
221 | dom = minidom.parse(text_or_file) |
---|
222 | return ParsedElement(dom.documentElement) |
---|
223 | except expat.error, e: |
---|
224 | raise ParseError(e) |
---|
225 | |
---|
226 | |
---|
class ParsedElement(object):
    """Representation of an XML element that was parsed from a string or
    file.

    This class should not be used directly. Rather, XML text parsed using
    `xmlio.parse()` will return an instance of this class.

    >>> xml = parse('<root/>')
    >>> print xml.name
    root

    Parsed elements can be serialized to a string using the `write()` method:

    >>> import sys
    >>> parse('<root></root>').write(sys.stdout)
    <root/>

    For convenience, this is also done when coercing the object to a string
    using the builtin ``str()`` function, which is used when printing an
    object:

    >>> print parse('<root></root>')
    <root/>

    (Note that serializing the element will produce a normalized representation
    that may not exactly match the input string.)

    Attributes are accessed via the `attr` member:

    >>> print parse('<root foo="bar"/>').attr['foo']
    bar

    Attributes can also be updated, added or removed:

    >>> xml = parse('<root foo="bar"/>')
    >>> xml.attr['foo'] = 'baz'
    >>> print xml
    <root foo="baz"/>

    >>> del xml.attr['foo']
    >>> print xml
    <root/>

    >>> xml.attr['foo'] = 'bar'
    >>> print xml
    <root foo="bar"/>

    CDATA sections are included in the text content of the element returned by
    `gettext()`:

    >>> xml = parse('<root>foo<![CDATA[ <bar> ]]>baz</root>')
    >>> xml.gettext()
    'foo <bar> baz'

    Valid input are utf-8 or unicode strings, or any type easily converted
    to unicode such as integers. Output is always utf-8.
    """
    __slots__ = ['_node', 'attr']

    class _Attrs(DictMixin):
        """Simple wrapper around the element attributes to provide a dictionary
        interface.

        Keys and values are returned as UTF-8 byte strings.
        """
        def __init__(self, node):
            self._node = node

        def __getitem__(self, name):
            attr = self._node.getAttributeNode(name)
            if not attr:
                raise KeyError(name)
            return _to_utf8(attr.value)

        def __setitem__(self, name, value):
            # Store the value as unicode text: the node is later serialized
            # with an encoding, which requires text, so UTF-8 byte strings are
            # decoded and other objects (e.g. integers) are converted.
            if isinstance(value, basestring):
                value = _from_utf8(value)
            else:
                value = unicode(value)
            self._node.setAttribute(name, value)

        def __delitem__(self, name):
            self._node.removeAttribute(name)

        def keys(self):
            return [_to_utf8(key) for key in self._node.attributes.keys()]

    def __init__(self, node):
        """Wrap the given DOM element node."""
        self._node = node
        self.attr = ParsedElement._Attrs(node)

    name = property(fget=lambda self: self._node.localName,
                    doc='Local name of the element')
    namespace = property(fget=lambda self: self._node.namespaceURI,
                         doc='Namespace URI of the element')

    def children(self, name=None):
        """Iterate over the child elements of this element.

        If the parameter `name` is provided, only include elements with a
        matching local name. Otherwise, include all elements.

        For elements that carry a namespace prefix, the full tag name
        (``prefix:local``) is accepted as well.
        """
        # Node type 1 is an element node.  The list is built up front so that
        # the iteration works on a snapshot of the child nodes.
        elements = [c for c in self._node.childNodes if c.nodeType == 1]
        for child in elements:
            if name in (None, child.localName, child.tagName):
                yield ParsedElement(child)

    def __iter__(self):
        """Iterate over all child elements."""
        return self.children()

    def gettext(self):
        """Return the text content of this element.

        This concatenates the values of all text and CDATA nodes that are
        immediate children of this element.
        """
        # Node types 3 and 4 are text and CDATA section nodes respectively.
        return ''.join([_to_utf8(c.nodeValue)
                        for c in self._node.childNodes
                        if c.nodeType in (3, 4)])

    def write(self, out, newlines=False):
        """Serializes the element and writes the XML to the given output
        stream.

        :param out: a file-like object with a ``write()`` method; it receives
                    the UTF-8 encoded serialization
        :param newlines: if true, the output is pretty-printed with newlines
                         and tab indentation; otherwise no whitespace is added
        """
        if newlines:
            newl, indent = '\n', '\t'
        else:
            newl = indent = ''
        out.write(self._node.toprettyxml(newl=newl, indent=indent,
                                         encoding='utf-8'))

    def __str__(self):
        """Return a string representation of the XML element."""
        buf = StringIO()
        self.write(buf)
        return buf.getvalue()
---|
347 | |
---|
348 | |
---|
if __name__ == '__main__':
    # When executed as a script, run the doctests embedded in the docstrings
    # of this module.
    import doctest
    doctest.testmod()
---|