1 | # -*- coding: iso-8859-1 -*-
|
---|
2 | """ A SAX2 driver for libxml2, on top of its XmlReader API
|
---|
3 |
|
---|
4 | USAGE
|
---|
5 | # put this file (drv_libxml2.py) in PYTHONPATH
|
---|
6 | import xml.sax
|
---|
7 | reader = xml.sax.make_parser(["drv_libxml2"])
|
---|
8 | # ...and the rest is standard python sax.
|
---|
9 |
|
---|
10 | CAVEATS
|
---|
11 | - Lexical handlers are supported, except for start/endEntity
|
---|
12 | (waiting for XmlReader.ResolveEntity) and start/endDTD
|
---|
13 | - Error callbacks are not exactly synchronous, they tend
|
---|
14 | to be invoked before the corresponding content callback,
|
---|
15 | because the underlying reader interface parses
|
---|
16 | data by chunks of 512 bytes
|
---|
17 |
|
---|
18 | TODO
|
---|
19 | - search for TODO
|
---|
20 | - some ErrorHandler events (warning)
|
---|
21 | - some ContentHandler events (setDocumentLocator, skippedEntity)
|
---|
22 | - EntityResolver (using libxml2.?)
|
---|
23 | - DTDHandler (if/when libxml2 exposes such node types)
|
---|
24 | - DeclHandler (if/when libxml2 exposes such node types)
|
---|
25 | - property_xml_string?
|
---|
26 | - feature_string_interning?
|
---|
27 | - Incremental parser
|
---|
28 | - additional performance tuning:
|
---|
29 | - one might cache callbacks to avoid some name lookups
|
---|
30 | - one might implement a smarter way to pass attributes to startElement
|
---|
31 | (some kind of lazy evaluation?)
|
---|
32 | - there might be room for improvement in start/endPrefixMapping
|
---|
33 | - other?
|
---|
34 |
|
---|
35 | """
|
---|
36 |
|
---|
# Module metadata: author contact and version string of this driver.
__author__ = u"Stéphane Bidoul <sbi@skynet.be>"
__version__ = "0.3"
39 |
|
---|
40 | import codecs
|
---|
# Tuple of the types that parse() treats as "a file name / URL string".
# Python 2 exposes both string types through the ``types`` module; those
# names are gone on Python 3, where ``str`` is the only text string type.
try:
    from types import StringType, UnicodeType
    StringTypes = (StringType, UnicodeType)
except ImportError:
    StringTypes = (str,)
43 |
|
---|
44 | from xml.sax._exceptions import *
|
---|
45 | from xml.sax import xmlreader, saxutils
|
---|
46 | from xml.sax.handler import \
|
---|
47 | feature_namespaces, \
|
---|
48 | feature_namespace_prefixes, \
|
---|
49 | feature_string_interning, \
|
---|
50 | feature_validation, \
|
---|
51 | feature_external_ges, \
|
---|
52 | feature_external_pes, \
|
---|
53 | property_lexical_handler, \
|
---|
54 | property_declaration_handler, \
|
---|
55 | property_dom_node, \
|
---|
56 | property_xml_string
|
---|
57 |
|
---|
58 | # libxml2 returns strings as UTF8
|
---|
59 | _decoder = codecs.lookup("utf8")[1]
|
---|
60 | def _d(s):
|
---|
61 | if s is None:
|
---|
62 | return s
|
---|
63 | else:
|
---|
64 | return _decoder(s)[0]
|
---|
65 |
|
---|
# libxml2 is a third-party binding that may not be installed; turn a failed
# import into the SAX exception meaning "this driver cannot be used".
# ``except ... as`` is accepted by Python 2.6+ and Python 3, unlike the
# older ``except ImportError, e`` spelling.
try:
    import libxml2
except ImportError as e:
    raise SAXReaderNotAvailable("libxml2 not available: "
                                "import error was: %s" % e)
71 |
|
---|
class Locator(xmlreader.Locator):
    """Present a libxml2.xmlTextReaderLocator through the SAX Locator API."""

    def __init__(self, locator):
        # Wrapped libxml2 locator; line number and system id are read from it.
        self.__locator = locator

    def getColumnNumber(self):
        """Column at which the current event ends; always -1 here."""
        return -1

    def getLineNumber(self):
        """Line at which the current event ends, as given by LineNumber()."""
        wrapped = self.__locator
        return wrapped.LineNumber()

    def getPublicId(self):
        """Public identifier for the current event; always None here."""
        return None

    def getSystemId(self):
        """System identifier for the current event, taken from BaseURI()."""
        wrapped = self.__locator
        return wrapped.BaseURI()
93 |
|
---|
class LibXml2Reader(xmlreader.XMLReader):
    """SAX2 XMLReader that pulls nodes from a libxml2 text reader.

    parse() walks the document with the reader's Read() method and turns
    every node into the matching ContentHandler / lexical-handler call.
    Errors signalled by libxml2 are queued by _errorHandler() and forwarded
    to the SAX error handler after each Read().
    """

    def __init__(self):
        xmlreader.XMLReader.__init__(self)
        # features
        self.__ns = 0
        self.__nspfx = 0
        self.__validate = 0
        self.__extparams = 1
        # parsing flag
        self.__parsing = 0
        # additional handlers
        self.__lex_handler = None
        self.__decl_handler = None
        # error messages accumulator (None when nothing is pending)
        self.__errors = None

    def _errorHandler(self, arg, msg, severity, locator):
        """Error callback registered on the libxml2 reader.

        Only records the error as a (severity, SAXParseException) pair;
        the SAX error handler is invoked later by _reportErrors().
        """
        if self.__errors is None:
            self.__errors = []
        self.__errors.append((severity,
                              SAXParseException(msg, None,
                                                Locator(locator))))

    def _reportErrors(self, fatal):
        """Forward all queued errors to the SAX error handler.

        Warnings go to warning(); everything else goes to error(), except
        that when *fatal* is true the last queued error is reported through
        fatalError().
        """
        # Detach the queue before dispatching: the handler callbacks may
        # raise (the stock xml.sax ErrorHandler does for error() and
        # fatalError()), and stale entries must not survive into a later
        # parse() on the same reader.
        errors, self.__errors = self.__errors, None
        for severity, exception in errors:
            if severity in (libxml2.PARSER_SEVERITY_VALIDITY_WARNING,
                            libxml2.PARSER_SEVERITY_WARNING):
                self._err_handler.warning(exception)
            # when fatal is set, the parse will stop;
            # we consider that the last error reported
            # is the fatal one.
            elif fatal and exception is errors[-1][1]:
                self._err_handler.fatalError(exception)
            else:
                self._err_handler.error(exception)

    def parse(self, source):
        """Parse *source* and emit SAX events to the registered handlers.

        *source* is either a string, handed to
        libxml2.newTextReaderFilename(), or any object accepted by
        saxutils.prepare_input_source().  endDocument() is only emitted
        when the reader reaches the end of the input without a fatal
        error.  The libxml2 reader is closed and the parsing flag reset
        even when a handler raises.
        """
        self.__parsing = 1
        # forget anything left over from a previous, aborted parse
        self.__errors = None
        reader = None
        try:
            # prepare source and create reader
            if isinstance(source, StringTypes):
                reader = libxml2.newTextReaderFilename(source)
            else:
                source = saxutils.prepare_input_source(source)
                inputBuffer = libxml2.inputBuffer(source.getByteStream())
                reader = inputBuffer.newTextReader(source.getSystemId())
            reader.SetErrorHandler(self._errorHandler, None)
            # configure reader
            if self.__extparams:
                reader.SetParserProp(libxml2.PARSER_LOADDTD, 1)
                reader.SetParserProp(libxml2.PARSER_DEFAULTATTRS, 1)
                reader.SetParserProp(libxml2.PARSER_SUBST_ENTITIES, 1)
                reader.SetParserProp(libxml2.PARSER_VALIDATE, self.__validate)
            else:
                reader.SetParserProp(libxml2.PARSER_LOADDTD, 0)
            # we reuse attribute maps (for a slight performance gain)
            if self.__ns:
                attributesNSImpl = xmlreader.AttributesNSImpl({}, {})
            else:
                attributesImpl = xmlreader.AttributesImpl({})
            # prefixes to pop (for endPrefixMapping)
            prefixes = []
            # start loop
            self._cont_handler.startDocument()
            while 1:
                r = reader.Read()
                # check for errors
                if r == 1:
                    if self.__errors is not None:
                        self._reportErrors(0)
                elif r == 0:
                    if self.__errors is not None:
                        self._reportErrors(0)
                    break  # end of parse
                else:
                    if self.__errors is not None:
                        self._reportErrors(1)
                    else:
                        self._err_handler.fatalError(
                            SAXException("Read failed (no details available)"))
                    break  # fatal parse error
                # get node type
                nodeType = reader.NodeType()
                # Element
                if nodeType == 1:
                    if self.__ns:
                        eltName = (_d(reader.NamespaceUri()),
                                   _d(reader.LocalName()))
                        eltQName = _d(reader.Name())
                        attributesNSImpl._attrs = attrs = {}
                        attributesNSImpl._qnames = qnames = {}
                        newPrefixes = []
                        while reader.MoveToNextAttribute():
                            qname = _d(reader.Name())
                            value = _d(reader.Value())
                            # a namespace declaration is exactly "xmlns"
                            # or "xmlns:<prefix>", not any name that merely
                            # begins with those five letters
                            if qname == "xmlns" or qname.startswith("xmlns:"):
                                if len(qname) > 5:
                                    newPrefix = qname[6:]
                                else:
                                    newPrefix = None
                                newPrefixes.append(newPrefix)
                                self._cont_handler.startPrefixMapping(
                                    newPrefix, value)
                                if not self.__nspfx:
                                    continue  # don't report xmlns attribute
                            attName = (_d(reader.NamespaceUri()),
                                       _d(reader.LocalName()))
                            qnames[attName] = qname
                            attrs[attName] = value
                        reader.MoveToElement()
                        self._cont_handler.startElementNS(
                            eltName, eltQName, attributesNSImpl)
                        if reader.IsEmptyElement():
                            # no EndElement node will follow: close it now
                            self._cont_handler.endElementNS(eltName, eltQName)
                            for newPrefix in newPrefixes:
                                self._cont_handler.endPrefixMapping(newPrefix)
                        else:
                            prefixes.append(newPrefixes)
                    else:
                        eltName = _d(reader.Name())
                        attributesImpl._attrs = attrs = {}
                        while reader.MoveToNextAttribute():
                            attName = _d(reader.Name())
                            attrs[attName] = _d(reader.Value())
                        reader.MoveToElement()
                        self._cont_handler.startElement(
                            eltName, attributesImpl)
                        if reader.IsEmptyElement():
                            self._cont_handler.endElement(eltName)
                # EndElement
                elif nodeType == 15:
                    if self.__ns:
                        self._cont_handler.endElementNS(
                            (_d(reader.NamespaceUri()),
                             _d(reader.LocalName())),
                            _d(reader.Name()))
                        for prefix in prefixes.pop():
                            self._cont_handler.endPrefixMapping(prefix)
                    else:
                        self._cont_handler.endElement(_d(reader.Name()))
                # Text
                elif nodeType == 3:
                    self._cont_handler.characters(_d(reader.Value()))
                # Whitespace
                elif nodeType == 13:
                    self._cont_handler.ignorableWhitespace(_d(reader.Value()))
                # SignificantWhitespace
                elif nodeType == 14:
                    self._cont_handler.characters(_d(reader.Value()))
                # CDATA
                elif nodeType == 4:
                    if self.__lex_handler is not None:
                        self.__lex_handler.startCDATA()
                    self._cont_handler.characters(_d(reader.Value()))
                    if self.__lex_handler is not None:
                        self.__lex_handler.endCDATA()
                # EntityReference
                elif nodeType == 5:
                    # start/endEntity are lexical-handler events; the
                    # reader itself has no such methods
                    if self.__lex_handler is not None:
                        self.__lex_handler.startEntity(_d(reader.Name()))
                    # NOTE(review): the module docstring says entity events
                    # are waiting for XmlReader.ResolveEntity -- confirm
                    # that the installed binding provides this method.
                    reader.ResolveEntity()
                # EndEntity
                elif nodeType == 16:
                    if self.__lex_handler is not None:
                        self.__lex_handler.endEntity(_d(reader.Name()))
                # ProcessingInstruction
                elif nodeType == 7:
                    self._cont_handler.processingInstruction(
                        _d(reader.Name()), _d(reader.Value()))
                # Comment
                elif nodeType == 8:
                    if self.__lex_handler is not None:
                        self.__lex_handler.comment(_d(reader.Value()))
                # DocumentType
                elif nodeType == 10:
                    #if not self.__lex_handler is None:
                    #    self.__lex_handler.startDTD()
                    pass  # TODO (how to detect endDTD? on first non-dtd event?)
                # XmlDeclaration
                elif nodeType == 17:
                    pass  # TODO
                # Entity
                elif nodeType == 6:
                    pass  # TODO (entity decl)
                # Notation (decl)
                elif nodeType == 12:
                    pass  # TODO
                # Attribute (never in this loop)
                #elif nodeType == 2:
                #    pass
                # Document (not exposed)
                #elif nodeType == 9:
                #    pass
                # DocumentFragment (never returned by XmlReader)
                #elif nodeType == 11:
                #    pass
                # None
                #elif nodeType == 0:
                #    pass
                # -
                else:
                    raise SAXException("Unexpected node type %d" % nodeType)
            if r == 0:
                self._cont_handler.endDocument()
        finally:
            self.__parsing = 0
            # close the reader on every exit path, including the case
            # where a handler raised; reader is None if creation failed
            if reader is not None:
                reader.Close()

    def setDTDHandler(self, handler):
        """Not supported: always raises SAXNotSupportedException."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("DTDHandler not supported")

    def setEntityResolver(self, resolver):
        """Not supported: always raises SAXNotSupportedException."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("EntityResolver not supported")

    def getFeature(self, name):
        """Return the current state of the SAX feature *name*.

        Raises SAXNotRecognizedException for a feature this driver does
        not know.
        """
        if name == feature_namespaces:
            return self.__ns
        elif name == feature_namespace_prefixes:
            return self.__nspfx
        elif name == feature_validation:
            return self.__validate
        elif name == feature_external_ges:
            return 1  # TODO (does that relate to PARSER_LOADDTD)?
        elif name == feature_external_pes:
            return self.__extparams
        else:
            raise SAXNotRecognizedException("Feature '%s' not recognized" %
                                            name)

    def setFeature(self, name, state):
        """Set the SAX feature *name* to *state*.

        Raises SAXNotSupportedException while a parse is in progress or
        when asked to switch off external general entities, and
        SAXNotRecognizedException for an unknown feature.
        """
        if self.__parsing:
            raise SAXNotSupportedException("Cannot set feature %s "
                                           "while parsing" % name)
        if name == feature_namespaces:
            self.__ns = state
        elif name == feature_namespace_prefixes:
            self.__nspfx = state
        elif name == feature_validation:
            self.__validate = state
        elif name == feature_external_ges:
            if state == 0:
                # TODO (does that relate to PARSER_LOADDTD)?
                raise SAXNotSupportedException("Feature '%s' not supported" %
                                               name)
        elif name == feature_external_pes:
            self.__extparams = state
        else:
            raise SAXNotRecognizedException("Feature '%s' not recognized" %
                                            name)

    def getProperty(self, name):
        """Return the lexical or declaration handler property *name*.

        Raises SAXNotRecognizedException for any other property.
        """
        if name == property_lexical_handler:
            return self.__lex_handler
        elif name == property_declaration_handler:
            return self.__decl_handler
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" %
                                            name)

    def setProperty(self, name, value):
        """Set the SAX property *name* to *value*.

        Only the lexical handler can be set.  The declaration handler is
        recognized but raises SAXNotSupportedException; any other name
        raises SAXNotRecognizedException.
        """
        if name == property_lexical_handler:
            self.__lex_handler = value
        elif name == property_declaration_handler:
            # TODO: store value in self.__decl_handler instead of raising
            # if/when libxml2 supports dtd events
            raise SAXNotSupportedException("Property '%s' not supported" %
                                           name)
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" %
                                            name)
368 |
|
---|
def create_parser():
    """Factory entry point of this driver: return a new LibXml2Reader.

    The module docstring shows the intended use, where the module name is
    given to ``xml.sax.make_parser``.
    """
    parser = LibXml2Reader()
    return parser
371 |
|
---|