3
class simpleHandler (xml.sax.ContentHandler):
    """A simple handler that provides us with indices of marked up content.

    While a document is parsed, the character data is accumulated in
    ``content`` (markup stripped) and every element that gets closed is
    recorded in ``elements`` as a ``((start, end), name, attrs)`` tuple, where
    ``start`` and ``end`` are offsets into ``content``.

    The outermost ``foobar`` wrapper element is required by the caller and is
    never recorded.
    """

    def __init__ (self):
        xml.sax.ContentHandler.__init__(self)
        self.elements = [] #this will contain a list of elements and their start/end indices
        self.open_elements = [] #this holds info on open elements while we wait for their close
        self.content = "" #the character data seen so far, without any markup

    def startElement (self, name, attrs):
        """Remember where element *name* starts until its end tag arrives."""
        if name == 'foobar': return # we require an outer wrapper, which we promptly ignore.
        self.open_elements.append({'name': name,
                                   # The parser may reuse the attrs object for
                                   # later elements, so keep our own copy.
                                   'attrs': attrs.copy(),
                                   'start': len(self.content),
                                   })

    def endElement (self, name):
        """Record the span of the innermost open element called *name*.

        An end tag with no matching open element is ignored.
        """
        if name == 'foobar': return # we require an outer wrapper, which we promptly ignore.
        # Search newest-first: with nested elements of the same name the end
        # tag always belongs to the most recently opened one.
        for i in reversed(range(len(self.open_elements))):
            e = self.open_elements[i]
            if e['name'] == name:
                # append a (start,end), name, attrs
                self.elements.append(((e['start'],        #start position
                                       len(self.content)),# current (end) position
                                      e['name'],
                                      e['attrs']))
                del self.open_elements[i]
                return

    def characters (self, chunk):
        """Append a chunk of character data to ``content``."""
        self.content += chunk
33
class MarkupString (str):
    """A simple class for dealing with marked up strings. When we are sliced, we return
    valid marked up strings, preserving markup.

    The string value itself is the marked-up text; ``raw`` is the same text
    with the markup stripped.  Indices and slice bounds refer to positions in
    ``raw``, not to positions in the marked-up text.
    """

    def __init__ (self, string):
        """Parse *string* (an XML fragment) and remember where its elements sit.

        Raises whatever ``xml.sax.parseString`` raises when *string* is not
        well-formed markup.
        """
        # The value was already set by str.__new__; object.__init__ rejects
        # extra arguments on Python 3, so none are passed on.
        str.__init__(self)
        self.handler = simpleHandler()
        try:
            # A fragment may have several top-level nodes, so wrap it in a
            # single root element that the handler ignores.
            xml.sax.parseString("<foobar>%s</foobar>"%str(string),self.handler)
        except Exception:
            print('Unable to parse "%s"'%string)
            # NOTE(review): re-raising is assumed here; continuing would leave
            # ``raw`` holding only the text parsed before the error.
            raise
        self.raw=self.handler.content

    def __getitem__ (self, n):
        """Return the marked-up character at index *n*, or a marked-up slice.

        Raises IndexError for an out-of-range integer index and ValueError
        for a slice with a step other than 1.
        """
        if isinstance(n, slice):
            if n.step not in (None, 1):
                raise ValueError("MarkupString does not support extended slices")
            return self.__getslice__(n.start, n.stop)
        length = len(self.raw)
        if n < 0:
            n += length
        if not 0 <= n < length:
            raise IndexError("MarkupString index out of range")
        return self.__getslice__(n,n+1)

    def __getslice__ (self, s, e):
        """Return ``raw[s:e]`` as a MarkupString, re-wrapped in the markup that covers it.

        *s* and *e* follow normal slice rules relative to ``raw``: ``None``
        means "from the start"/"to the end", negative values count from the
        end, and out-of-range values are clamped.
        """
        # Imported here so we do not depend on xml.sax.saxutils having been
        # loaded as a side effect of an earlier parse.
        from xml.sax.saxutils import escape, quoteattr

        raw = self.raw
        s, e, _ = slice(s, e).indices(len(raw))

        # cycle through elements that effect our slice and keep track of
        # where their start and end tags should go.
        starts = {}
        ends = {}
        for (spos, epos), name, attrs in self.handler.elements:
            # only include relevant elements, clipped to the slice
            spos = max(spos, s)
            epos = min(epos, e)
            if spos >= epos: continue # we don't care about tags that don't markup any text
            # write our start tag <stag att="val"...>
            stag = "<%s%s>"%(name,
                             "".join(" %s=%s"%(k,quoteattr(v))
                                     for k,v in attrs.items()))
            etag = "</%s>"%name # simple end tag
            starts.setdefault(spos, []).append(stag)
            ends.setdefault(epos, []).append(etag)

        outbuf = [] # pieces of our actual output string
        for pos in range(s,e): # we move through positions
            # end tags first, so an element closing here is closed before
            # the next one opens
            outbuf.extend(ends.get(pos, ()))
            # elements are recorded innermost-first; reverse these so the
            # order works out, e.g. <i><b><u></u></b></i>
            outbuf.extend(reversed(starts.get(pos, ())))
            # raw holds unescaped text; escape it so the result parses again
            outbuf.append(escape(raw[pos]))
        outbuf.extend(ends.get(e, ()))
        return MarkupString(str("".join(outbuf))) # the str call is necessary to avoid unicode messiness