1
"""Radically simple xml parsing
5
<this type="xml">text <b>in</b> xml</this>
10
("b", None, ["in"], None),
22
I.e., an XML tag translates to a tuple:
23
(name, dictofattributes, contentlist, miscellaneousinfo)
25
where miscellaneousinfo can be anything, (but defaults to None)
26
(with the intention of adding, eg, line number information)
28
special cases: name of "" means "top level, no containing tag".
29
Top level parse always looks like this
31
("", None, list, None)
33
contained text of None means <simple_tag/>
35
In order to support stuff like
37
<this></this><one></one>
39
AT THE MOMENT &amp; ETCETERA ARE IGNORED. THEY MUST BE PROCESSED
40
IN A POST-PROCESSING STEP.
42
PROLOGUES ARE NOT UNDERSTOOD. OTHER STUFF IS PROBABLY MISSING.
45
RequirePyRXP = 0 # set this to 1 to disable the nonvalidating fallback parser.
49
#raise ImportError, "dummy error"
52
if pyRXP.version>='0.5':
55
pyRXP_parser = pyRXP.Parser(
56
ErrorOnValidityErrors=1,
58
ExpandCharacterEntities=0,
59
ExpandGeneralEntities=0,
61
srcName='string input')
62
def parsexml(xmlText, oneOutermostTag=0,eoCB=None,entityReplacer=None):
    """Parse xmlText with the shared module-level pyRXP_parser.

    eoCB is stored on the shared parser before parsing (a sibling
    definition in this file describes it as the entity open callback).
    entityReplacer is accepted for signature compatibility and is not
    used here.

    Returns the parsed tuple itself when oneOutermostTag is true;
    otherwise the parsed tuple is wrapped as the single content item of
    an unnamed top-level tuple ('', None, [tree], None).
    """
    pyRXP_parser.eoCB = eoCB
    tree = pyRXP_parser.parse(xmlText)
    # Bare tree only when the caller asked for it and the tree is truthy;
    # every other case gets the unnamed wrapper.
    if oneOutermostTag and tree:
        return tree
    return ('', None, [tree], None)
67
def parsexml(xmlText, oneOutermostTag=0,eoCB=None,entityReplacer=None):
68
'''eoCB is the entity open callback'''
71
flags = 0x0157e1ff | pyRXP.PARSER_FLAGS['ErrorOnValidityErrors']
72
for k in ('ExpandCharacterEntities','ExpandGeneralEntities'):
73
flags = flags & (~pyRXP.PARSER_FLAGS[k])
74
p = pyRXP.parse(xmlText,srcName='string input',flags=flags,warnCB=warnCB,eoCB=eoCB)
75
return oneOutermostTag and p or ('',None,[p],None)
82
# Literal text that opens a CDATA section, its length, and the text that
# closes the section.
CDATAMARKER = "<![CDATA["
LENCDATAMARKER = len(CDATAMARKER)
CDATAENDMARKER = "]]>"
# (escaped entity, literal character) pairs applied in order when unescaping
# text content.  "&amp;" must be handled last: otherwise "&amp;lt;" would
# first become "&lt;" and then be wrongly unescaped a second time to "<".
replacelist = [("&lt;", "<"), ("&gt;", ">"), ("&amp;", "&")] # amp must be last
87
def unEscapeContentList(contentList):
89
from string import replace
92
for (old, new) in replacelist:
93
e = replace(e, old, new)
97
def parsexmlSimple(xmltext, oneOutermostTag=0,eoCB=None,entityReplacer=unEscapeContentList):
98
"""official interface: discard unused cursor info"""
100
raise ImportError, "pyRXP not found, fallback parser disabled"
101
(result, cursor) = parsexml0(xmltext,entityReplacer=entityReplacer)
108
parsexml = parsexmlSimple
110
def parseFile(filename):
111
raw = open(filename, 'r').read()
116
def skip_prologue(text, cursor):
117
"""skip any prologue found after cursor, return index of rest of text"""
118
### NOT AT ALL COMPLETE!!! definitely can be confused!!!
119
from string import find
120
prologue_elements = ("!DOCTYPE", "?xml", "!--")
123
#print "trying to skip:", repr(text[cursor:cursor+20])
124
openbracket = find(text, "<", cursor)
125
if openbracket<0: break
128
for e in prologue_elements:
130
if text[past:past+le]==e:
132
cursor = find(text, ">", past)
134
raise ValueError, "can't close prologue %s" % `e`
138
#print "done skipping"
141
def parsexml0(xmltext, startingat=0, toplevel=1,
142
# snarf in some globals
143
strip=string.strip, split=string.split, find=string.find, entityReplacer=unEscapeContentList,
145
#LENCDATAMARKER=LENCDATAMARKER, CDATAMARKER=CDATAMARKER
147
"""simple recursive descent xml parser...
148
return (dictionary, endcharacter)
149
special case: comment returns (None, endcharacter)"""
150
#from string import strip, split, find
151
#print "parsexml0", `xmltext[startingat: startingat+10]`
154
ContentList = AttDict = ExtraStuff = None
155
if toplevel is not None:
156
#if verbose: print "at top level"
158
# raise ValueError, "have to start at 0 for top level!"
159
xmltext = strip(xmltext)
161
#look for interesting starting points
162
firstbracket = find(xmltext, "<", cursor)
163
afterbracket2char = xmltext[firstbracket+1:firstbracket+3]
164
#print "a", `afterbracket2char`
165
#firstampersand = find(xmltext, "&", cursor)
166
#if firstampersand>0 and firstampersand<firstbracket:
167
# raise ValueError, "I don't handle ampersands yet!!!"
171
#if verbose: print "no tags"
172
if toplevel is not None:
173
#D = {NAMEKEY: NONAME, CONTENTSKEY: [xmltext[cursor:]]}
174
ContentList = [xmltext[cursor:]]
175
if entityReplacer: ContentList = entityReplacer(ContentList)
176
return (NameString, AttDict, ContentList, ExtraStuff), len(xmltext)
178
raise ValueError, "no tags at non-toplevel %s" % `xmltext[cursor:cursor+20]`
182
# NEED to force always outer level is unnamed!!!
183
#if toplevel and firstbracket>0:
184
#afterbracket2char = xmltext[firstbracket:firstbracket+2]
185
if toplevel is not None:
186
#print "toplevel with no outer tag"
187
NameString = name = NONAME
188
cursor = skip_prologue(xmltext, cursor)
191
raise ValueError, "non top level entry should be at start tag: %s" % repr(xmltext[:10])
192
# special case: CDATA
193
elif afterbracket2char=="![" and xmltext[firstbracket:firstbracket+9]=="<![CDATA[":
194
#print "in CDATA", cursor
195
# skip straight to the close marker
196
startcdata = firstbracket+9
197
endcdata = find(xmltext, CDATAENDMARKER, startcdata)
199
raise ValueError, "unclosed CDATA %s" % repr(xmltext[cursor:cursor+20])
200
NameString = CDATAMARKER
201
ContentList = [xmltext[startcdata: endcdata]]
202
cursor = endcdata+len(CDATAENDMARKER)
204
# special case COMMENT
205
elif afterbracket2char=="!-" and xmltext[firstbracket:firstbracket+4]=="<!--":
207
endcommentdashes = find(xmltext, "--", firstbracket+4)
208
if endcommentdashes<firstbracket:
209
raise ValueError, "unterminated comment %s" % repr(xmltext[cursor:cursor+20])
210
endcomment = endcommentdashes+2
211
if xmltext[endcomment]!=">":
212
raise ValueError, "invalid comment: contains double dashes %s" % repr(xmltext[cursor:cursor+20])
213
return (None, endcomment+1) # shortcut exit
215
# get the rest of the tag
216
#if verbose: print "parsing start tag"
217
# make sure the tag isn't in doublequote pairs
218
closebracket = find(xmltext, ">", firstbracket)
219
noclose = closebracket<0
220
startsearch = closebracket+1
221
pastfirstbracket = firstbracket+1
222
tagcontent = xmltext[pastfirstbracket:closebracket]
223
# shortcut, no equal means nothing but name in the tag content
224
if '=' not in tagcontent:
225
if tagcontent[-1]=="/":
227
#print "simple case", tagcontent
228
tagcontent = tagcontent[:-1]
230
name = strip(tagcontent)
234
if '"' in tagcontent:
235
# check double quotes
237
# not inside double quotes! (the split should have odd length)
238
if noclose or len(split(tagcontent+".", '"'))% 2:
241
closebracket = find(xmltext, ">", startsearch)
242
startsearch = closebracket+1
243
noclose = closebracket<0
244
tagcontent = xmltext[pastfirstbracket:closebracket]
245
# not inside double quotes! (the split should have odd length)
246
if noclose or len(split(tagcontent+".", '"'))% 2:
249
raise ValueError, "unclosed start tag %s" % repr(xmltext[firstbracket:firstbracket+20])
251
#cursor = closebracket+1
252
# handle simple tag /> syntax
253
if xmltext[closebracket-1]=="/":
254
#if verbose: print "it's a simple tag"
255
closebracket = closebracket-1
256
tagcontent = tagcontent[:-1]
258
#tagcontent = xmltext[firstbracket+1:closebracket]
259
tagcontent = strip(tagcontent)
260
taglist = split(tagcontent, "=")
262
# raise ValueError, "tag with no name %s" % repr(xmltext[firstbracket:firstbracket+20])
263
taglist0 = taglist[0]
264
taglist0list = split(taglist0)
265
#if len(taglist0list)>2:
266
# raise ValueError, "bad tag head %s" % repr(taglist0)
267
name = taglist0list[0]
268
#print "tag name is", name
270
# now parse the attributes
271
attributename = taglist0list[-1]
272
# put a fake att name at end of last taglist entry for consistent parsing
273
taglist[-1] = taglist[-1]+" f"
276
lasttaglistindex = len(taglist)
277
#for attentry in taglist[1:]:
278
while taglistindex<lasttaglistindex:
279
#print "looking for attribute named", attributename
280
attentry = taglist[taglistindex]
281
taglistindex = taglistindex+1
282
attentry = strip(attentry)
284
raise ValueError, "attribute value must start with double quotes" + repr(attentry)
285
while '"' not in attentry[1:]:
286
# must have an = inside the attribute value...
287
if taglistindex>lasttaglistindex:
288
raise ValueError, "unclosed value " + repr(attentry)
289
nextattentry = taglist[taglistindex]
290
taglistindex = taglistindex+1
291
attentry = "%s=%s" % (attentry, nextattentry)
292
attentry = strip(attentry) # only needed for while loop...
293
attlist = split(attentry)
294
nextattname = attlist[-1]
295
attvalue = attentry[:-len(nextattname)]
296
attvalue = strip(attvalue)
298
first = attvalue[0]; last=attvalue[-1]
300
raise ValueError, "attvalue,attentry,attlist="+repr((attvalue, attentry,attlist))
301
if first==last=='"' or first==last=="'":
302
attvalue = attvalue[1:-1]
303
#print attributename, "=", attvalue
304
D[attributename] = attvalue
305
attributename = nextattname
306
# pass over other tags and content looking for end tag
307
if docontents is not None:
308
#print "now looking for end tag"
310
while docontents is not None:
311
nextopenbracket = find(xmltext, "<", cursor)
312
if nextopenbracket<cursor:
313
#if verbose: print "no next open bracket found"
315
#print "no more tags for noname", repr(xmltext[cursor:cursor+10])
316
docontents=None # done
317
remainder = xmltext[cursor:]
318
cursor = len(xmltext)
322
raise ValueError, "no close bracket for %s found after %s" % (name,repr(xmltext[cursor: cursor+20]))
323
# is it a close bracket?
324
elif xmltext[nextopenbracket+1]=="/":
325
#print "found close bracket", repr(xmltext[nextopenbracket:nextopenbracket+20])
326
nextclosebracket = find(xmltext, ">", nextopenbracket)
327
if nextclosebracket<nextopenbracket:
328
raise ValueError, "unclosed close tag %s" % repr(xmltext[nextopenbracket: nextopenbracket+20])
329
closetagcontents = xmltext[nextopenbracket+2: nextclosebracket]
330
closetaglist = split(closetagcontents)
331
#if len(closetaglist)!=1:
332
#print closetagcontents
333
#raise ValueError, "bad close tag format %s" % repr(xmltext[nextopenbracket: nextopenbracket+20])
335
closename = closetaglist[0]
336
#if verbose: print "closetag name is", closename
338
prefix = xmltext[:cursor]
339
endlinenum = len(split(prefix, "\n"))
340
prefix = xmltext[:startingat]
341
linenum = len(split(prefix, "\n"))
343
"at lines %s...%s close tag name doesn't match %s...%s %s" %(
344
linenum, endlinenum, `name`, `closename`, repr(xmltext[cursor: cursor+100]))
345
remainder = xmltext[cursor:nextopenbracket]
347
#if verbose: print "remainder", repr(remainder)
349
cursor = nextclosebracket+1
350
#print "for", name, "found close tag"
351
docontents = None # done
352
# otherwise we are looking at a new tag, recursively parse it...
353
# first record any intervening content
355
remainder = xmltext[cursor:nextopenbracket]
359
# #print "skipping", repr(remainder)
360
# #print "--- recursively parsing starting at", xmltext[nextopenbracket:nextopenbracket+20]
361
(parsetree, cursor) = parsexml0(xmltext, startingat=nextopenbracket, toplevel=None, entityReplacer=entityReplacer)
364
# maybe should check for trailing garbage?
366
# remainder = strip(xmltext[cursor:])
368
# raise ValueError, "trailing garbage at top level %s" % repr(remainder[:20])
370
if entityReplacer: ContentList = entityReplacer(ContentList)
371
t = (NameString, AttDict, ContentList, ExtraStuff)
375
def pprettyprint(parsedxml):
376
"""pretty printer mainly for testing"""
377
st = types.StringType
378
if type(parsedxml) is st:
380
(name, attdict, textlist, extra) = parsedxml
381
if not attdict: attdict={}
384
for k in attdict.keys():
386
attlist.append("%s=%s" % (k, `v`))
387
attributes = join(attlist, " ")
388
if not name and attributes:
389
raise ValueError, "name missing with attributes???"
390
if textlist is not None:
392
textlistpprint = map(pprettyprint, textlist)
393
textpprint = join(textlistpprint, "\n")
395
return textpprint # no outer tag
397
nllist = string.split(textpprint, "\n")
398
textpprint = " "+join(nllist, "\n ")
399
return "<%s %s>\n%s\n</%s>" % (name, attributes, textpprint, name)
400
# otherwise must be a simple tag
401
return "<%s %s/>" % (name, attributes)
405
from time import time
406
from pprint import pprint
408
D = parsexmlSimple(s)
409
print "DONE", time()-now
414
print "============== reformatting"
419
testparse("""<this type="xml">text <><b>in</b> <funnytag foo="bar"/> xml</this>
422
<this type="xml">text <b>in</b> xml</this> ]]>
423
<tag with="<brackets in values>">just testing brackets feature</tag>
426
filenames = [ #"../../reportlab/demos/pythonpoint/pythonpoint.xml",
427
"samples/hamlet.xml"]
429
#filenames = ["moa.xml"]
432
if __name__=="__main__":
434
from time import time
440
print "elapsed", time()-now