1   
  2   
  3   
  4   
  5   
  6   
  7   
  8   
  9   
 10   
 11   
 12   
 13   
 14   
 15   
 16   
 17   
 18   
 19   
 20   
 21   
 22   
 23   
 24   
 25   
 26   
 27   
 28   
 29   
 30   
 31   
 32   
 33   
 34   
 35   
 36   
 37   
 38   
 39   
 40   
 41  """ 
 42  Provides general XML-related functionality. 
 43   
 44  What I'm trying to do here is abstract much of the functionality that directly 
 45  accesses the DOM tree.  This is not so much to "protect" the other code from 
 46  the DOM, but to standardize the way it's used.  It will also help extension 
 47  authors write code that easily looks more like the rest of Cedar Backup. 
 48   
 49  @sort: createInputDom, createOutputDom, serializeDom, isElement, readChildren, 
 50         readFirstChild, readStringList, readString, readInteger, readBoolean, 
 51         addContainerNode, addStringNode, addIntegerNode, addBooleanNode, 
 52         TRUE_BOOLEAN_VALUES, FALSE_BOOLEAN_VALUES, VALID_BOOLEAN_VALUES 
 53   
 54  @var TRUE_BOOLEAN_VALUES: List of boolean values in XML representing C{True}. 
 55  @var FALSE_BOOLEAN_VALUES: List of boolean values in XML representing C{False}. 
 56  @var VALID_BOOLEAN_VALUES: List of valid boolean values in XML. 
 57   
 58  @author: Kenneth J. Pronovici <pronovic@ieee.org> 
 59  """ 
 60   
 61   
 62   
 63   
 64   
 65   
 66   
 67  import sys 
 68  import re 
 69  import logging 
 70  import codecs 
 71  from types import UnicodeType 
 72  from StringIO import StringIO 
 73   
 74   
 75  from xml.parsers.expat import ExpatError 
 76  from xml.dom.minidom import Node 
 77  from xml.dom.minidom import getDOMImplementation 
 78  from xml.dom.minidom import parseString 
 79   
 80   
 81   
 82   
 83   
 84   
 85  logger = logging.getLogger("CedarBackup2.log.xml") 
 86   
 87  TRUE_BOOLEAN_VALUES   = [ "Y", "y", ] 
 88  FALSE_BOOLEAN_VALUES  = [ "N", "n", ] 
 89  VALID_BOOLEAN_VALUES  = TRUE_BOOLEAN_VALUES + FALSE_BOOLEAN_VALUES 
 90   
 91   
 92   
 93   
 94   
 95   
109   
111     """ 
112     Creates a DOM tree used for writing an XML document. 
113     @param name: Base name of the document (root node name). 
114     @return: Tuple (xmlDom, parentNode) for the new document 
115     """ 
116     impl = getDOMImplementation() 
117     xmlDom = impl.createDocument(None, name, None) 
118     return (xmlDom, xmlDom.documentElement) 
 119   
120   
121   
122   
123   
124   
126     """ 
127     Returns True or False depending on whether the XML node is an element node. 
128     """ 
129     return node.nodeType == Node.ELEMENT_NODE 
 130   
132     """ 
133     Returns a list of nodes with a given name immediately beneath the 
134     parent. 
135   
136     By "immediately beneath" the parent, we mean from among nodes that are 
137     direct children of the passed-in parent node. 
138   
139     Underneath, we use the Python C{getElementsByTagName} method, which is 
140     pretty cool, but which (surprisingly?) returns a list of all children 
141     with a given name below the parent, at any level.  We just prune that 
142     list to include only children whose C{parentNode} matches the passed-in 
143     parent. 
144   
145     @param parent: Parent node to search beneath. 
146     @param name: Name of nodes to search for. 
147   
148     @return: List of child nodes with correct parent, or an empty list if 
149     no matching nodes are found. 
150     """ 
151     lst = [] 
152     if parent is not None: 
153        result = parent.getElementsByTagName(name) 
154        for entry in result: 
155           if entry.parentNode is parent: 
156              lst.append(entry) 
157     return lst 
 158   
160     """ 
161     Returns the first child with a given name immediately beneath the parent. 
162   
163     By "immediately beneath" the parent, we mean from among nodes that are 
164     direct children of the passed-in parent node. 
165   
166     @param parent: Parent node to search beneath. 
167     @param name: Name of node to search for. 
168   
169     @return: First properly-named child of parent, or C{None} if no matching nodes are found. 
170     """ 
171     result = readChildren(parent, name) 
172     if result is None or result == []: 
173        return None 
174     return result[0] 
 175   
177     """ 
178     Returns a list of the string contents associated with nodes with a given 
179     name immediately beneath the parent. 
180   
181     By "immediately beneath" the parent, we mean from among nodes that are 
182     direct children of the passed-in parent node. 
183   
184     First, we find all of the nodes using L{readChildren}, and then we 
185     retrieve the "string contents" of each of those nodes.  The returned list 
186     has one entry per matching node.  We assume that string contents of a 
187     given node belong to the first C{TEXT_NODE} child of that node.  Nodes 
188     which have no C{TEXT_NODE} children are not represented in the returned 
189     list. 
190   
191     @param parent: Parent node to search beneath. 
192     @param name: Name of node to search for. 
193   
194     @return: List of strings as described above, or C{None} if no matching nodes are found. 
195     """ 
196     lst = [] 
197     result = readChildren(parent, name) 
198     for entry in result: 
199        if entry.hasChildNodes(): 
200           for child in entry.childNodes: 
201              if child.nodeType == Node.TEXT_NODE: 
202                 lst.append(child.nodeValue) 
203                 break 
204     if lst == []: 
205        lst = None 
206     return lst 
 207   
209     """ 
210     Returns string contents of the first child with a given name immediately 
211     beneath the parent. 
212   
213     By "immediately beneath" the parent, we mean from among nodes that are 
214     direct children of the passed-in parent node.  We assume that string 
215     contents of a given node belong to the first C{TEXT_NODE} child of that 
216     node. 
217   
218     @param parent: Parent node to search beneath. 
219     @param name: Name of node to search for. 
220   
221     @return: String contents of node or C{None} if no matching nodes are found. 
222     """ 
223     result = readStringList(parent, name) 
224     if result is None: 
225        return None 
226     return result[0] 
 227   
229     """ 
230     Returns integer contents of the first child with a given name immediately 
231     beneath the parent. 
232   
233     By "immediately beneath" the parent, we mean from among nodes that are 
234     direct children of the passed-in parent node. 
235   
236     @param parent: Parent node to search beneath. 
237     @param name: Name of node to search for. 
238   
239     @return: Integer contents of node or C{None} if no matching nodes are found. 
240     @raise ValueError: If the string at the location can't be converted to an integer. 
241     """ 
242     result = readString(parent, name) 
243     if result is None: 
244        return None 
245     else: 
246        return int(result) 
 247   
249     """ 
250     Returns long integer contents of the first child with a given name immediately 
251     beneath the parent. 
252   
253     By "immediately beneath" the parent, we mean from among nodes that are 
254     direct children of the passed-in parent node. 
255   
256     @param parent: Parent node to search beneath. 
257     @param name: Name of node to search for. 
258   
259     @return: Long integer contents of node or C{None} if no matching nodes are found. 
260     @raise ValueError: If the string at the location can't be converted to an integer. 
261     """ 
262     result = readString(parent, name) 
263     if result is None: 
264        return None 
265     else: 
266        return long(result) 
 267   
269     """ 
270     Returns float contents of the first child with a given name immediately 
271     beneath the parent. 
272   
273     By "immediately beneath" the parent, we mean from among nodes that are 
274     direct children of the passed-in parent node. 
275   
276     @param parent: Parent node to search beneath. 
277     @param name: Name of node to search for. 
278   
279     @return: Float contents of node or C{None} if no matching nodes are found. 
280     @raise ValueError: If the string at the location can't be converted to a 
281     float value. 
282     """ 
283     result = readString(parent, name) 
284     if result is None: 
285        return None 
286     else: 
287        return float(result) 
 288   
290     """ 
291     Returns boolean contents of the first child with a given name immediately 
292     beneath the parent. 
293   
294     By "immediately beneath" the parent, we mean from among nodes that are 
295     direct children of the passed-in parent node. 
296   
297     The string value of the node must be one of the values in L{VALID_BOOLEAN_VALUES}. 
298   
299     @param parent: Parent node to search beneath. 
300     @param name: Name of node to search for. 
301   
302     @return: Boolean contents of node or C{None} if no matching nodes are found. 
303     @raise ValueError: If the string at the location can't be converted to a boolean. 
304     """ 
305     result = readString(parent, name) 
306     if result is None: 
307        return None 
308     else: 
309        if result in TRUE_BOOLEAN_VALUES: 
310           return True 
311        elif result in FALSE_BOOLEAN_VALUES: 
312           return False 
313        else: 
314           raise ValueError("Boolean values must be one of %s." % VALID_BOOLEAN_VALUES) 
 315   
316   
317   
318   
319   
320   
322     """ 
323     Adds a container node as the next child of a parent node. 
324   
325     @param xmlDom: DOM tree as from C{impl.createDocument()}. 
326     @param parentNode: Parent node to create child for. 
327     @param nodeName: Name of the new container node. 
328   
329     @return: Reference to the newly-created node. 
330     """ 
331     containerNode = xmlDom.createElement(nodeName) 
332     parentNode.appendChild(containerNode) 
333     return containerNode 
 334   
336     """ 
337     Adds a text node as the next child of a parent, to contain a string. 
338   
339     If the C{nodeValue} is None, then the node will be created, but will be 
340     empty (i.e. will contain no text node child). 
341   
342     @param xmlDom: DOM tree as from C{impl.createDocument()}. 
343     @param parentNode: Parent node to create child for. 
344     @param nodeName: Name of the new container node. 
345     @param nodeValue: The value to put into the node. 
346   
347     @return: Reference to the newly-created node. 
348     """ 
349     containerNode = addContainerNode(xmlDom, parentNode, nodeName) 
350     if nodeValue is not None: 
351        textNode = xmlDom.createTextNode(nodeValue) 
352        containerNode.appendChild(textNode) 
353     return containerNode 
 354   
356     """ 
357     Adds a text node as the next child of a parent, to contain an integer. 
358   
359     If the C{nodeValue} is None, then the node will be created, but will be 
360     empty (i.e. will contain no text node child). 
361   
362     The integer will be converted to a string using "%d".  The result will be 
363     added to the document via L{addStringNode}. 
364   
365     @param xmlDom: DOM tree as from C{impl.createDocument()}. 
366     @param parentNode: Parent node to create child for. 
367     @param nodeName: Name of the new container node. 
368     @param nodeValue: The value to put into the node. 
369   
370     @return: Reference to the newly-created node. 
371     """ 
372     if nodeValue is None: 
373        return addStringNode(xmlDom, parentNode, nodeName, None) 
374     else: 
375        return addStringNode(xmlDom, parentNode, nodeName, "%d" % nodeValue)  
 376   
def addLongNode(xmlDom, parentNode, nodeName, nodeValue):
   """
   Adds a child element holding a long integer rendered as text.

   A C{nodeValue} of C{None} still creates the element, but leaves it empty
   (i.e. it gets no text node child).  Any other value is formatted with
   "%d" and the resulting string is handed to L{addStringNode}.

   @param xmlDom: DOM tree as from C{impl.createDocument()}.
   @param parentNode: Parent node to create child for.
   @param nodeName: Name of the new container node.
   @param nodeValue: The value to put into the node.

   @return: Reference to the newly-created node.
   """
   # None is passed straight through so that addStringNode creates an empty element.
   text = None
   if nodeValue is not None:
      text = "%d" % nodeValue
   return addStringNode(xmlDom, parentNode, nodeName, text)
 398   
400     """ 
401     Adds a text node as the next child of a parent, to contain a boolean. 
402   
403     If the C{nodeValue} is None, then the node will be created, but will be 
404     empty (i.e. will contain no text node child). 
405   
406     Boolean C{True}, or anything else interpreted as C{True} by Python, will 
407     be converted to a string "Y".  Anything else will be converted to a 
408     string "N".  The result is added to the document via L{addStringNode}. 
409   
410     @param xmlDom: DOM tree as from C{impl.createDocument()}. 
411     @param parentNode: Parent node to create child for. 
412     @param nodeName: Name of the new container node. 
413     @param nodeValue: The value to put into the node. 
414   
415     @return: Reference to the newly-created node. 
416     """ 
417     if nodeValue is None: 
418        return addStringNode(xmlDom, parentNode, nodeName, None) 
419     else: 
420        if nodeValue: 
421           return addStringNode(xmlDom, parentNode, nodeName, "Y") 
422        else: 
423           return addStringNode(xmlDom, parentNode, nodeName, "N") 
 424   
425   
426   
427   
428   
429   
431     """ 
432     Serializes a DOM tree and returns the result in a string. 
433     @param xmlDom: XML DOM tree to serialize 
434     @param indent: Number of spaces to indent, as an integer 
435     @return: String form of DOM tree, pretty-printed. 
436     """ 
437     xmlBuffer = StringIO() 
438     serializer = Serializer(xmlBuffer, "UTF-8", indent=indent) 
439     serializer.serialize(xmlDom) 
440     xmlData = xmlBuffer.getvalue() 
441     xmlBuffer.close() 
442     return xmlData 
 443   
445   
446     """ 
447     XML serializer class. 
448   
449     This is a customized serializer that I hacked together based on what I found 
450     in the PyXML distribution.  Basically, around release 2.7.0, the only reason 
451     I still had around a dependency on PyXML was for the PrettyPrint 
452     functionality, and that seemed pointless.  So, I stripped the PrettyPrint 
453     code out of PyXML and hacked bits of it off until it did just what I needed 
454     and no more. 
455   
456     This code started out being called PrintVisitor, but I decided it makes more 
457     sense just calling it a serializer.  I've made nearly all of the methods 
458     private, and I've added a new high-level serialize() method rather than 
459     having clients call C{visit()}. 
460   
461     Anyway, as a consequence of my hacking with it, this can't quite be called a 
462     complete XML serializer any more.  I ripped out support for HTML and XHTML, 
463     and there is also no longer any support for namespaces (which I took out 
464     because this dragged along a lot of extra code, and Cedar Backup doesn't use 
465     namespaces).  However, everything else should pretty much work as expected. 
466   
467     @copyright: This code, prior to customization, was part of the PyXML 
468     codebase, and before that was part of the 4DOM suite developed by 
469     Fourthought, Inc.  In its original form, it was Copyright (c) 2000 
470     Fourthought Inc, USA; All Rights Reserved. 
471     """ 
472   
473 -   def __init__(self, stream=sys.stdout, encoding="UTF-8", indent=3): 
 474        """ 
475        Initialize a serializer. 
476        @param stream: Stream to write output to. 
477        @param encoding: Output encoding. 
478        @param indent: Number of spaces to indent, as an integer 
479        """ 
480        self.stream = stream 
481        self.encoding = encoding 
482        self._indent = indent * " " 
483        self._depth = 0 
484        self._inText = 0 
 485   
487        """ 
488        Serialize the passed-in XML document. 
489        @param xmlDom: XML DOM tree to serialize 
490        @raise ValueError: If there's an unknown node type in the document. 
491        """ 
492        self._visit(xmlDom) 
493        self.stream.write("\n") 
 494   
499   
501        if not self._inText and self._indent: 
502           self._write('\n' + self._indent*self._depth) 
503        return 
 504   
506        """ 
507        @raise ValueError: If there's an unknown node type in the document. 
508        """ 
509        if node.nodeType == Node.ELEMENT_NODE: 
510           return self._visitElement(node) 
511   
512        elif node.nodeType == Node.ATTRIBUTE_NODE: 
513           return self._visitAttr(node) 
514   
515        elif node.nodeType == Node.TEXT_NODE: 
516           return self._visitText(node) 
517   
518        elif node.nodeType == Node.CDATA_SECTION_NODE: 
519           return self._visitCDATASection(node) 
520   
521        elif node.nodeType == Node.ENTITY_REFERENCE_NODE: 
522           return self._visitEntityReference(node) 
523   
524        elif node.nodeType == Node.ENTITY_NODE: 
525           return self._visitEntity(node) 
526   
527        elif node.nodeType == Node.PROCESSING_INSTRUCTION_NODE: 
528           return self._visitProcessingInstruction(node) 
529   
530        elif node.nodeType == Node.COMMENT_NODE: 
531           return self._visitComment(node) 
532   
533        elif node.nodeType == Node.DOCUMENT_NODE: 
534           return self._visitDocument(node) 
535   
536        elif node.nodeType == Node.DOCUMENT_TYPE_NODE: 
537           return self._visitDocumentType(node) 
538   
539        elif node.nodeType == Node.DOCUMENT_FRAGMENT_NODE: 
540           return self._visitDocumentFragment(node) 
541   
542        elif node.nodeType == Node.NOTATION_NODE: 
543           return self._visitNotation(node) 
544   
545         
546        raise ValueError("Unknown node type: %s" % repr(node)) 
 547   
549        for curr in node: 
550           curr is not exclude and self._visit(curr) 
551        return 
 552   
554        for item in node.values(): 
555           self._visit(item) 
556        return 
 557   
565   
567        self._write("<?xml version='1.0' encoding='%s'?>" % (self.encoding or 'utf-8')) 
568        self._inText = 0 
569        return 
 570   
576   
580   
582        self._tryIndent() 
583        self._write('<%s' % node.tagName) 
584        for attr in node.attributes.values(): 
585           self._visitAttr(attr) 
586        if len(node.childNodes): 
587           self._write('>') 
588           self._depth = self._depth + 1 
589           self._visitNodeList(node.childNodes) 
590           self._depth = self._depth - 1 
591           not (self._inText) and self._tryIndent() 
592           self._write('</%s>' % node.tagName) 
593        else: 
594           self._write('/>') 
595        self._inText = 0 
596        return 
 597   
598 -   def _visitText(self, node): 
 599        text = node.data 
600        if self._indent: 
601           text.strip() 
602        if text: 
603           text = _translateCDATA(text, self.encoding) 
604           self.stream.write(text) 
605           self._inText = 1 
606        return 
 607   
609        if not doctype.systemId and not doctype.publicId: return 
610        self._tryIndent() 
611        self._write('<!DOCTYPE %s' % doctype.name) 
612        if doctype.systemId and '"' in doctype.systemId: 
613           system = "'%s'" % doctype.systemId 
614        else: 
615           system = '"%s"' % doctype.systemId 
616        if doctype.publicId and '"' in doctype.publicId: 
617            
618            
619            
620           public = "'%s'" % doctype.publicId 
621        else: 
622           public = '"%s"' % doctype.publicId 
623        if doctype.publicId and doctype.systemId: 
624           self._write(' PUBLIC %s %s' % (public, system)) 
625        elif doctype.systemId: 
626           self._write(' SYSTEM %s' % system) 
627        if doctype.entities or doctype.notations: 
628           self._write(' [') 
629           self._depth = self._depth + 1 
630           self._visitNamedNodeMap(doctype.entities) 
631           self._visitNamedNodeMap(doctype.notations) 
632           self._depth = self._depth - 1 
633           self._tryIndent() 
634           self._write(']>') 
635        else: 
636           self._write('>') 
637        self._inText = 0 
638        return 
 639   
641        """Visited from a NamedNodeMap in DocumentType""" 
642        self._tryIndent() 
643        self._write('<!ENTITY %s' % (node.nodeName)) 
644        node.publicId and self._write(' PUBLIC %s' % node.publicId) 
645        node.systemId and self._write(' SYSTEM %s' % node.systemId) 
646        node.notationName and self._write(' NDATA %s' % node.notationName) 
647        self._write('>') 
648        return 
 649   
651        """Visited from a NamedNodeMap in DocumentType""" 
652        self._tryIndent() 
653        self._write('<!NOTATION %s' % node.nodeName) 
654        node.publicId and self._write(' PUBLIC %s' % node.publicId) 
655        node.systemId and self._write(' SYSTEM %s' % node.systemId) 
656        self._write('>') 
657        return 
 658   
660        self._tryIndent() 
661        self._write('<![CDATA[%s]]>' % (node.data)) 
662        self._inText = 0 
663        return 
 664   
670   
672        self._write('&%s;' % node.nodeName) 
673        self._inText = 1 
674        return 
 675   
677        self._tryIndent() 
678        self._write('<?%s %s?>' % (node.target, node.data)) 
679        self._inText = 0 
680        return 
  681   
def _encodeText(text, encoding):
   """
   Encodes text into the named output encoding.

   Input that is not already a unicode object is first decoded as UTF-8;
   the unicode text is then run through the encoder for C{encoding}.

   @param text: Text to encode.
   @param encoding: Name of the codec to encode with.

   @return: The encoded text.

   @copyright: This code, prior to customization, was part of the PyXML
   codebase, and before that was part of the 4DOM suite developed by
   Fourthought, Inc.  In its original form, it was attributed to Martin v.
   Loewis and was Copyright (c) 2000 Fourthought Inc, USA; All Rights Reserved.
   """
   encode = codecs.getencoder(encoding)
   if isinstance(text, UnicodeType):
      decoded = text
   else:
      decoded = unicode(text, "utf-8")
   # The encoder returns an (output, length consumed) pair; only the output is wanted.
   encoded = encode(decoded)
   return encoded[0]
 693   
695     """ 
696     Handles normalization and some intelligence about quoting. 
697   
698     @copyright: This code, prior to customization, was part of the PyXML 
699     codebase, and before that was part of the 4DOM suite developed by 
700     Fourthought, Inc.  In its original form, it was Copyright (c) 2000 
701     Fourthought Inc, USA; All Rights Reserved. 
702     """ 
703     if not characters: 
704        return '', "'" 
705     if "'" in characters: 
706        delimiter = '"' 
707        new_chars = re.sub('"', '"', characters) 
708     else: 
709        delimiter = "'" 
710        new_chars = re.sub("'", ''', characters) 
711      
712      
713      
714     if "\n" in characters: 
715        new_chars = re.sub('\n', '
', new_chars) 
716     return new_chars, delimiter 
 717   
718   
def _translateCDATA(characters, encoding='UTF-8', prev_chars='', markupSafe=0):
   """
   Escapes character data for inclusion in XML content and encodes it.

   Unless C{markupSafe} is set, C{&}, C{<} and the sequence C{]]>} are
   replaced with entity references so they cannot be read as markup.  A
   leading C{>} is also escaped when C{prev_chars} ends in C{]]}, since the
   two together would form C{]]>}.  Anything matched by the illegal-character
   pattern is then replaced with a numeric character reference, and the
   result is passed through L{_encodeText}.

   @param characters: Character data to translate.
   @param encoding: Encoding handed to L{_encodeText}.
   @param prev_chars: Text preceding C{characters}; only its last two
   characters are examined.
   @param markupSafe: If true, markup characters are left unescaped.

   @return: Translated and encoded text, or an empty string if
   C{characters} is empty.

   @copyright: This code, prior to customization, was part of the PyXML
   codebase, and before that was part of the 4DOM suite developed by
   Fourthought, Inc.  In its original form, it was Copyright (c) 2000
   Fourthought Inc, USA; All Rights Reserved.
   """
   CDATA_CHAR_PATTERN = re.compile('[&<]|]]>')
   CHAR_TO_ENTITY = { '&': '&amp;', '<': '&lt;', ']]>': ']]&gt;', }
   ILLEGAL_LOW_CHARS = '[\x01-\x08\x0B-\x0C\x0E-\x1F]'
   ILLEGAL_HIGH_CHARS = '\xEF\xBF[\xBE\xBF]'
   XML_ILLEGAL_CHAR_PATTERN = re.compile('%s|%s' % (ILLEGAL_LOW_CHARS, ILLEGAL_HIGH_CHARS))

   def _toCharacterReference(match):
      matched = match.group()
      if len(matched) == 1:
         return '&#%i;' % ord(matched)
      # ILLEGAL_HIGH_CHARS matches three characters, which ord() cannot take.
      # EF BF BE / EF BF BF is the UTF-8 form of U+FFFE / U+FFFF; the final
      # byte tells the two apart.
      # NOTE(review): on unicode input this alternative also matches the three
      # characters U+00EF U+00BF U+00BE/U+00BF -- confirm whether callers can
      # ever pass such text.
      if ord(matched[-1]) == 0xBE:
         return '&#%i;' % 0xFFFE
      return '&#%i;' % 0xFFFF

   if not characters:
      return ''
   if markupSafe:
      new_string = characters
   else:
      new_string = CDATA_CHAR_PATTERN.sub(lambda m: CHAR_TO_ENTITY[m.group()], characters)
      # A leading '>' completes a "]]>" begun by the preceding text, so it
      # must be escaped as well.
      if prev_chars[-2:] == ']]' and characters[0] == '>':
         new_string = '&gt;' + new_string[1:]
   new_string = XML_ILLEGAL_CHAR_PATTERN.sub(_toCharacterReference, new_string)
   return _encodeText(new_string, encoding)
 749