| 1 | # -*- coding: utf-8 -*- |
---|
| 2 | # |
---|
| 3 | # Copyright (C) 2005-2007 Christopher Lenz <cmlenz@gmx.de> |
---|
| 4 | # Copyright (C) 2007-2010 Edgewall Software |
---|
| 5 | # All rights reserved. |
---|
| 6 | # |
---|
| 7 | # This software is licensed as described in the file COPYING, which |
---|
| 8 | # you should have received as part of this distribution. The terms |
---|
| 9 | # are also available at http://bitten.edgewall.org/wiki/License. |
---|
| 10 | |
---|
| 11 | """Utility code for easy input and output of XML. |
---|
| 12 | |
---|
| 13 | The current implementation uses ``xml.dom.minidom`` under the hood for parsing. |
---|
| 14 | """ |
---|
| 15 | |
---|
0 | 16 | import os |
---|
0 | 17 | try: |
---|
0 | 18 | from cStringIO import StringIO |
---|
0 | 19 | except ImportError: |
---|
0 | 20 | from StringIO import StringIO |
---|
0 | 21 | from UserDict import DictMixin |
---|
| 22 | |
---|
0 | 23 | import cgi |
---|
0 | 24 | import string |
---|
| 25 | |
---|
# Public API of this module.  ``ParseError`` is exported alongside ``parse``
# so that code using ``from ... import *`` can catch the exception that
# ``parse()`` raises.
__all__ = ['Fragment', 'Element', 'ParsedElement', 'ParseError', 'parse']
__docformat__ = 'restructuredtext en'
---|
| 28 | |
---|
0 | 29 | def _from_utf8(text): |
---|
| 30 | """Convert utf-8 string to unicode. All other input returned as-is.""" |
---|
3351 | 31 | if isinstance(text, str): |
---|
3017 | 32 | return text.decode('utf-8') |
---|
3017 | 33 | else: |
---|
334 | 34 | return text |
---|
| 35 | |
---|
0 | 36 | def _to_utf8(text): |
---|
| 37 | """Convert any input to utf-8 byte string.""" |
---|
2047 | 38 | if isinstance(text, str): |
---|
42 | 39 | return text # presumes utf-8 |
---|
2005 | 40 | elif not isinstance(text, unicode): |
---|
0 | 41 | text = unicode(text) |
---|
2005 | 42 | return text.encode('utf-8') |
---|
| 43 | |
---|
0 | 44 | __trans = string.maketrans('', '') |
---|
| 45 | # http://www.w3.org/TR/xml11/#charsets (partial) |
---|
0 | 46 | __todel = ('\x01\x02\x03\x04\x05\x06\x07\x08\x0b\x0c\x0e\x0f\x10\x12\x13\x14' |
---|
0 | 47 | '\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f\x7f\x80\x81\x82\x83' |
---|
0 | 48 | '\x84\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94' |
---|
0 | 49 | '\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f') |
---|
0 | 50 | __uni_trans = dict([(ord(c), None) for c in __todel]) |
---|
| 51 | |
---|
def _escape_text(text):
    """Escape special characters in the provided text so that it can be safely
    included in XML text nodes.

    Control characters listed in ``__todel`` are removed and ``&``, ``<`` and
    ``>`` are replaced by the corresponding entities.

    :param text: a utf-8 byte string, a unicode string, or any other object
    :return: the escaped text, of the same string type as the input; objects
             that are not strings are returned unchanged
    """
    if isinstance(text, str):
        try:
            decoded = text.decode('utf-8')
        except UnicodeDecodeError:
            # Not utf-8 after all: best effort, strip on the raw bytes.
            return cgi.escape(text.translate(__trans, __todel))
        # Strip code points rather than bytes: the byte values 0x80-0x9f also
        # occur as utf-8 continuation bytes, so deleting them from the encoded
        # form would corrupt multi-byte characters.
        return cgi.escape(decoded.translate(__uni_trans)).encode('utf-8')
    if isinstance(text, unicode):
        return cgi.escape(text.translate(__uni_trans))
    return text
---|
| 61 | |
---|
def _escape_attr(attr):
    """Escape special characters in the provided text so that it can be safely
    included in XML attribute values.

    In addition to the escaping done for text nodes, double quotes are
    replaced by ``&quot;`` because attribute values are written between
    double quotes.

    :param attr: the attribute value
    :return: the escaped value if `attr` is a string, otherwise `attr` itself
    """
    if not isinstance(attr, basestring):
        return attr
    return _escape_text(attr).replace('"', '&quot;')
---|
| 70 | |
---|
| 71 | |
---|
class Fragment(object):
    """A collection of XML elements."""
    __slots__ = ['children']

    def __init__(self):
        """Create an XML fragment."""
        self.children = []

    def __getitem__(self, nodes):
        """Add nodes to the fragment.

        :param nodes: a single node, or a list or tuple of nodes; each one is
                      passed to `append()`
        :return: the fragment itself, so that calls can be chained
        """
        if not isinstance(nodes, (list, tuple)):
            nodes = [nodes]
        for node in nodes:
            self.append(node)
        return self

    def __str__(self):
        """Return a string representation of the XML fragment."""
        buf = StringIO()
        self.write(buf)
        return buf.getvalue()

    def append(self, node):
        """Append an element or fragment as child.

        `Element` instances are stored as they are, the children of a plain
        `Fragment` are merged into this one, ``None`` and the empty string are
        ignored, and everything else is stored as unicode text.
        """
        if isinstance(node, Element):
            self.children.append(node)
        elif isinstance(node, Fragment):
            self.children += node.children
        elif node is not None and node != '':
            if isinstance(node, basestring):
                self.children.append(_from_utf8(node))
            else:
                # NOTE(review): a ParsedElement also ends up here and is
                # stored as text, although write() is prepared to handle
                # ParsedElement children -- confirm which one is intended.
                self.children.append(unicode(node))

    def write(self, out, newlines=False):
        """Serializes the element and writes the XML to the given output
        stream.

        :param out: a file-like object with a ``write()`` method
        :param newlines: passed on to the ``write()`` method of child elements
        """
        for child in self.children:
            if isinstance(child, (Element, ParsedElement)):
                child.write(out, newlines=newlines)
            elif child.startswith('<'):
                # Text that looks like markup is wrapped in a CDATA section
                # instead of being escaped.  A literal ']]>' inside it would
                # end the section early, so it is split across two sections.
                data = _to_utf8(child).replace(']]>', ']]]]><![CDATA[>')
                out.write('<![CDATA[' + data + ']]>')
            else:
                out.write(_to_utf8(_escape_text(child)))
---|
| 118 | |
---|
| 119 | |
---|
class Element(Fragment):
    """Simple XML output generator based on the builder pattern.

    Construct XML elements by passing the tag name to the constructor:

    >>> print Element('foo')
    <foo/>

    Attributes can be specified using keyword arguments. The values of the
    arguments will be converted to strings and any special XML characters
    escaped:

    >>> print Element('foo', bar=42)
    <foo bar="42"/>
    >>> print Element('foo', bar='1 < 2')
    <foo bar="1 &lt; 2"/>
    >>> print Element('foo', bar='"baz"')
    <foo bar="&quot;baz&quot;"/>

    The order in which attributes are rendered is undefined.

    Elements can be nested using item access notation:

    >>> print Element('foo')[Element('bar'), Element('baz')]
    <foo><bar/><baz/></foo>

    Text nodes can be nested in an element by using strings instead of elements
    in item access. Any special characters in the strings are escaped
    automatically:

    >>> print Element('foo')['Hello world']
    <foo>Hello world</foo>
    >>> print Element('foo')[42]
    <foo>42</foo>
    >>> print Element('foo')['1 < 2']
    <foo>1 &lt; 2</foo>

    This technique also allows mixed content:

    >>> print Element('foo')['Hello ', Element('b')['world']]
    <foo>Hello <b>world</b></foo>

    Finally, text starting with an opening angle bracket is treated specially:
    under the assumption that the text actually contains XML itself, the whole
    thing is wrapped in a CDATA block instead of escaping all special characters
    individually:

    >>> print Element('foo')['<bar a="3" b="4"><baz/></bar>']
    <foo><![CDATA[<bar a="3" b="4"><baz/></bar>]]></foo>

    Valid input are utf-8 or unicode strings, or any type easily converted
    to unicode such as integers. Output is always utf-8.
    """
    __slots__ = ['name', 'attr']

    def __init__(self, name_, **attr):
        """Create an XML element using the specified tag name.

        The tag name must be supplied as the first positional argument. All
        keyword arguments following it are handled as attributes of the element.
        Attributes whose value is ``None`` are left out.
        """
        Fragment.__init__(self)
        self.name = _from_utf8(name_)
        self.attr = dict([(_from_utf8(name), _from_utf8(value))
                          for name, value in attr.items()
                          if value is not None])

    def write(self, out, newlines=False):
        """Serializes the element and writes the XML to the given output
        stream.

        :param out: a file-like object with a ``write()`` method
        :param newlines: whether ``os.linesep`` should be written after the
                         element; also passed on to child elements
        """
        out.write('<')
        out.write(_to_utf8(self.name))
        for name, value in self.attr.items():
            out.write(_to_utf8(' %s="%s"' % (name, _escape_attr(value))))
        if self.children:
            out.write('>')
            Fragment.write(self, out, newlines)
            out.write('</' + _to_utf8(self.name) + '>')
        else:
            # No content: use the empty-element form.
            out.write('/>')
        if newlines:
            out.write(os.linesep)
---|
| 203 | |
---|
| 204 | |
---|
class ParseError(Exception):
    """Raised by `parse()` when the XML parser reports an error."""
---|
| 207 | |
---|
| 208 | |
---|
0 | 209 | def parse(text_or_file): |
---|
| 210 | """Parse an XML document provided as string or file-like object. |
---|
| 211 | |
---|
| 212 | Returns an instance of `ParsedElement` that can be used to traverse the |
---|
| 213 | parsed document. |
---|
| 214 | """ |
---|
69 | 215 | from xml.dom import minidom |
---|
69 | 216 | from xml.parsers import expat |
---|
69 | 217 | try: |
---|
69 | 218 | if isinstance(text_or_file, basestring): |
---|
48 | 219 | dom = minidom.parseString(_to_utf8(text_or_file)) |
---|
48 | 220 | else: |
---|
21 | 221 | dom = minidom.parse(text_or_file) |
---|
66 | 222 | return ParsedElement(dom.documentElement) |
---|
3 | 223 | except expat.error, e: |
---|
3 | 224 | raise ParseError(e) |
---|
| 225 | |
---|
| 226 | |
---|
class ParsedElement(object):
    """Representation of an XML element that was parsed from a string or
    file.

    This class should not be used directly. Rather, XML text parsed using
    `xmlio.parse()` will return an instance of this class.

    >>> xml = parse('<root/>')
    >>> print xml.name
    root

    Parsed elements can be serialized to a string using the `write()` method:

    >>> import sys
    >>> parse('<root></root>').write(sys.stdout)
    <root/>

    For convenience, this is also done when coercing the object to a string
    using the builtin ``str()`` function, which is used when printing an
    object:

    >>> print parse('<root></root>')
    <root/>

    (Note that serializing the element will produce a normalized representation
    that may not exactly match the input string.)

    Attributes are accessed via the `attr` member:

    >>> print parse('<root foo="bar"/>').attr['foo']
    bar

    Attributes can also be updated, added or removed:

    >>> xml = parse('<root foo="bar"/>')
    >>> xml.attr['foo'] = 'baz'
    >>> print xml
    <root foo="baz"/>

    >>> del xml.attr['foo']
    >>> print xml
    <root/>

    >>> xml.attr['foo'] = 'bar'
    >>> print xml
    <root foo="bar"/>

    CDATA sections are included in the text content of the element returned by
    `gettext()`:

    >>> xml = parse('<root>foo<![CDATA[ <bar> ]]>baz</root>')
    >>> xml.gettext()
    'foo <bar> baz'

    Valid input are utf-8 or unicode strings, or any type easily converted
    to unicode such as integers. Output is always utf-8.
    """
    __slots__ = ['_node', 'attr']

    class _Attrs(DictMixin):
        """Simple wrapper around the element attributes to provide a dictionary
        interface."""

        def __init__(self, node):
            self._node = node

        def __getitem__(self, name):
            """Return the attribute value as a utf-8 string; raise `KeyError`
            if the element has no such attribute."""
            attr = self._node.getAttributeNode(name)
            if attr is None:
                raise KeyError(name)
            return _to_utf8(attr.value)

        def __setitem__(self, name, value):
            """Set an attribute.

            The value is stored as unicode: utf-8 byte strings are decoded and
            non-string values such as integers are converted, so that the
            element can still be serialized afterwards.
            """
            if not isinstance(value, basestring):
                value = unicode(value)
            self._node.setAttribute(_from_utf8(name), _from_utf8(value))

        def __delitem__(self, name):
            self._node.removeAttribute(name)

        def keys(self):
            """Return the attribute names as utf-8 strings."""
            return [_to_utf8(key) for key in self._node.attributes.keys()]

    def __init__(self, node):
        self._node = node
        self.attr = ParsedElement._Attrs(node)

    name = property(fget=lambda self: self._node.localName,
                    doc='Local name of the element')
    namespace = property(fget=lambda self: self._node.namespaceURI,
                         doc='Namespace URI of the element')

    def children(self, name=None):
        """Iterate over the child elements of this element.

        If the parameter `name` is provided, only include elements with a
        matching local name. Otherwise, include all elements.  For backwards
        compatibility a qualified tag name (``prefix:name``) is accepted as
        well.
        """
        # nodeType 1 is an element node; text, comments etc. are skipped.
        elements = [c for c in self._node.childNodes if c.nodeType == 1]
        for child in elements:
            if name is None or name in (child.tagName, child.localName):
                yield ParsedElement(child)

    def __iter__(self):
        return self.children()

    def gettext(self):
        """Return the text content of this element.

        This concatenates the values of all text and CDATA nodes that are
        immediate children of this element.
        """
        # nodeType 3 is a text node, 4 a CDATA section.
        return ''.join([_to_utf8(c.nodeValue)
                        for c in self._node.childNodes
                        if c.nodeType in (3, 4)])

    def write(self, out, newlines=False):
        """Serializes the element and writes the XML to the given output
        stream.

        :param out: a file-like object with a ``write()`` method
        :param newlines: if true, the output is pretty-printed with one node
                         per line and tab indentation
        """
        out.write(self._node.toprettyxml(newl=newlines and '\n' or '',
                                         indent=newlines and '\t' or '',
                                         encoding='utf-8'))

    def __str__(self):
        """Return a string representation of the XML element."""
        buf = StringIO()
        self.write(buf)
        return buf.getvalue()
---|
| 347 | |
---|
| 348 | |
---|
if __name__ == '__main__':
    # Executed as a script: run the doctests embedded in the docstrings.
    from doctest import testmod
    testmod()
---|