11
11
structure. An ill-formed XML/HTML document yields a correspondingly
12
12
ill-formed data structure. If your document is only locally
13
13
well-formed, you can use this library to find and process the
14
well-formed part of it. The BeautifulSoup class
14
well-formed part of it.
16
16
Beautiful Soup works with Python 2.2 and up. It has no external
17
17
dependencies, but you'll have more success at converting data to UTF-8
24
24
http://cjkpython.i18n.org/
26
26
Beautiful Soup defines classes for two main parsing strategies:
28
28
* BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
29
29
language that kind of looks like XML.
41
41
http://www.crummy.com/software/BeautifulSoup/documentation.html
43
Here, have some legalese:
45
Copyright (c) 2004-2007, Leonard Richardson
49
Redistribution and use in source and binary forms, with or without
50
modification, are permitted provided that the following conditions are
53
* Redistributions of source code must retain the above copyright
54
notice, this list of conditions and the following disclaimer.
56
* Redistributions in binary form must reproduce the above
57
copyright notice, this list of conditions and the following
58
disclaimer in the documentation and/or other materials provided
59
with the distribution.
61
* Neither the name of the Beautiful Soup Consortium and All
62
Night Kosher Bakery nor the names of its contributors may be
63
used to endorse or promote products derived from this software
64
without specific prior written permission.
66
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
67
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
68
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
69
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
70
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
71
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
72
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
73
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
74
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
75
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
76
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
44
79
from __future__ import generators
46
81
__author__ = "Leonard Richardson (leonardr@segfault.org)"
48
83
__copyright__ = "Copyright (c) 2004-2007 Leonard Richardson"
84
__license__ = "New-style BSD"
51
86
from sgmllib import SGMLParser, SGMLParseError
81
116
self.previousSibling = self.parent.contents[-1]
82
117
self.previousSibling.nextSibling = self
84
def replaceWith(self, replaceWith):
119
def replaceWith(self, replaceWith):
85
120
oldParent = self.parent
86
121
myIndex = self.parent.contents.index(self)
87
122
if hasattr(replaceWith, 'parent') and replaceWith.parent == self.parent:
92
127
# means that when we extract it, the index of this
93
128
# element will change.
94
129
myIndex = myIndex - 1
96
131
oldParent.insert(myIndex, replaceWith)
98
133
def extract(self):
99
"""Destructively rips this element out of the tree."""
134
"""Destructively rips this element out of the tree."""
102
137
self.parent.contents.remove(self)
116
151
self.previous = None
117
152
lastChild.next = None
120
155
if self.previousSibling:
121
156
self.previousSibling.nextSibling = self.nextSibling
122
157
if self.nextSibling:
123
158
self.nextSibling.previousSibling = self.previousSibling
124
self.previousSibling = self.nextSibling = None
159
self.previousSibling = self.nextSibling = None
126
161
def _lastRecursiveChild(self):
127
162
"Finds the last element beneath this object to be parsed."
134
169
if (isinstance(newChild, basestring)
135
170
or isinstance(newChild, unicode)) \
136
171
and not isinstance(newChild, NavigableString):
137
newChild = NavigableString(newChild)
172
newChild = NavigableString(newChild)
139
174
position = min(position, len(self.contents))
140
175
if hasattr(newChild, 'parent') and newChild.parent != None:
141
176
# We're 'inserting' an element that's already one
142
# of this object's children.
177
# of this object's children.
143
178
if newChild.parent == self:
144
179
index = self.find(newChild)
145
180
if index and index < position:
161
196
newChild.previousSibling.nextSibling = newChild
162
197
newChild.previous = previousChild._lastRecursiveChild()
163
198
if newChild.previous:
164
newChild.previous.next = newChild
199
newChild.previous.next = newChild
166
201
newChildsLastElement = newChild._lastRecursiveChild()
168
203
if position >= len(self.contents):
169
204
newChild.nextSibling = None
172
207
parentsNextSibling = None
173
208
while not parentsNextSibling:
181
216
newChildsLastElement.next = None
183
nextChild = self.contents[position]
184
newChild.nextSibling = nextChild
218
nextChild = self.contents[position]
219
newChild.nextSibling = nextChild
185
220
if newChild.nextSibling:
186
221
newChild.nextSibling.previousSibling = newChild
187
222
newChildsLastElement.next = nextChild
190
225
newChildsLastElement.next.previous = newChildsLastElement
191
226
self.contents.insert(position, newChild)
228
def append(self, tag):
229
"""Appends the given tag to the contents of this tag."""
230
self.insert(len(self.contents), tag)
193
232
def findNext(self, name=None, attrs={}, text=None, **kwargs):
194
233
"""Returns the first item that matches the given criteria and
195
234
appears after this Tag in the document."""
328
367
# Utility methods
329
368
def substituteEncoding(self, str, encoding=None):
330
369
encoding = encoding or "utf-8"
331
return str.replace("%SOUP-ENCODING%", encoding)
370
return str.replace("%SOUP-ENCODING%", encoding)
333
372
def toEncoding(self, s, encoding=None):
334
373
"""Encodes an object to a string in some encoding, or to Unicode.
351
390
class NavigableString(unicode, PageElement):
392
def __getnewargs__(self):
393
return (NavigableString.__str__(self),)
353
395
def __getattr__(self, attr):
354
396
"""text.string gives you text. This is for backwards
355
397
compatibility for Navigable*String, but for CData* it lets you
360
402
raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
362
404
def __unicode__(self):
363
return self.__str__(None)
405
return unicode(str(self))
365
407
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
367
409
return self.encode(encoding)
371
413
class CData(NavigableString):
373
415
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
383
425
class Comment(NavigableString):
384
426
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
385
return "<!--%s-->" % NavigableString.__str__(self, encoding)
427
return "<!--%s-->" % NavigableString.__str__(self, encoding)
387
429
class Declaration(NavigableString):
388
430
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
389
return "<!%s>" % NavigableString.__str__(self, encoding)
431
return "<!%s>" % NavigableString.__str__(self, encoding)
391
433
class Tag(PageElement):
393
435
"""Represents a found HTML tag with its attributes and contents."""
395
XML_SPECIAL_CHARS_TO_ENTITIES = { "'" : "squot",
438
"Cheap function to invert a hash."
440
for k,v in h.items():
444
XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'",
450
XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)
452
def _convertEntities(self, match):
453
"""Used in a call to re.sub to replace HTML, XML, and numeric
454
entities with the appropriate Unicode characters. If HTML
455
entities are being converted, any unrecognized entities are
458
if self.convertHTMLEntities and x in name2codepoint:
459
return unichr(name2codepoint[x])
460
elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS:
461
if self.convertXMLEntities:
462
return self.XML_ENTITIES_TO_SPECIAL_CHARS[x]
465
elif len(x) > 0 and x[0] == '#':
466
# Handle numeric entities
467
if len(x) > 1 and x[1] == 'x':
468
return unichr(int(x[2:], 16))
470
return unichr(int(x[1:]))
472
elif self.escapeUnrecognizedEntities:
473
return u'&%s;' % x
401
477
def __init__(self, parser, name, attrs=None, parent=None,
414
490
self.setup(parent, previous)
415
491
self.hidden = False
416
492
self.containsSubstitutions = False
493
self.convertHTMLEntities = parser.convertHTMLEntities
494
self.convertXMLEntities = parser.convertXMLEntities
495
self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities
497
# Convert any HTML, XML, or numeric entities in the attribute values.
498
convert = lambda(k, val): (k,
499
re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",
500
self._convertEntities,
502
self.attrs = map(convert, self.attrs)
418
504
def get(self, key, default=None):
419
505
"""Returns the value of the 'key' attribute for the tag, or
420
506
the value given for 'default' if it doesn't have that
422
return self._getAttrMap().get(key, default)
508
return self._getAttrMap().get(key, default)
424
510
def has_key(self, key):
425
511
return self._getAttrMap().has_key(key)
444
530
"A tag is non-None even if it has no contents."
447
def __setitem__(self, key, value):
533
def __setitem__(self, key, value):
448
534
"""Setting tag[key] sets the value of the 'key' attribute for the
450
536
self._getAttrMap()
481
567
return self.find(tag[:-3])
482
568
elif tag.find('__') != 0:
483
569
return self.find(tag)
570
raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag)
485
572
def __eq__(self, other):
486
573
"""Returns true iff this tag has the same name, the same attributes,
507
594
def __unicode__(self):
508
595
return self.__str__(None)
597
BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
598
+ "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
601
def _sub_entity(self, x):
602
"""Used with a regular expression to substitute the
603
appropriate XML entity for an XML special character."""
604
return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"
510
606
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
511
607
prettyPrint=False, indentLevel=0):
512
608
"""Returns a string or Unicode representation of this tag and
543
639
# embedded single quotes to XML entities.
546
# This can't happen naturally, but it can happen
547
# if you modify an attribute value after parsing.
643
# TODO: replace with apos when
549
645
val = val.replace("'", "&squot;")
551
647
# Now we're okay w/r/t quotes. But the attribute
552
648
# value might also contain angle brackets, or
553
649
# ampersands that aren't part of entities. We need
554
650
# to escape those to XML entities too.
555
val = re.sub("([<>]|&(?![^\s]+;))",
556
lambda x: "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";",
651
val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)
559
653
attrs.append(fmt % (self.toEncoding(key, encoding),
560
654
self.toEncoding(val, encoding)))
651
745
# Pre-3.x compatibility methods
655
749
def fetchText(self, text=None, recursive=True, limit=None):
656
750
return self.findAll(text=text, recursive=recursive, limit=limit)
658
752
def firstText(self, text=None, recursive=True):
659
753
return self.find(text=text, recursive=recursive)
663
def append(self, tag):
664
"""Appends the given tag to the contents of this tag."""
665
self.contents.append(tag)
762
850
#print 'looking for %s in %s' % (self, markup)
764
852
# If given a list of items, scan it for a text element that
766
854
if isList(markup) and not isinstance(markup, Tag):
767
855
for element in markup:
768
856
if isinstance(element, NavigableString) \
783
871
raise Exception, "I don't know how to match against a %s" \
784
872
% markup.__class__
787
def _matches(self, markup, matchAgainst):
875
def _matches(self, markup, matchAgainst):
788
876
#print "Matching %s against %s" % (markup, matchAgainst)
790
878
if matchAgainst == True and type(matchAgainst) == types.BooleanType:
835
923
"""Convenience method that works with all 2.x versions of Python
836
924
to determine whether or not something is stringlike."""
838
return isinstance(s, unicode) or isintance(s, basestring)
926
return isinstance(s, unicode) or isinstance(s, basestring)
839
927
except NameError:
840
928
return isinstance(s, str)
865
953
"""This class contains the basic parser and search code. It defines
866
954
a parser that knows nothing about tag behavior except for the
869
957
You can't close a tag without closing all the tags it encloses.
870
958
That is, "<foo><bar></foo>" actually means
871
959
"<foo><bar></bar></foo>".
898
982
HTML_ENTITIES = "html"
899
983
XML_ENTITIES = "xml"
984
XHTML_ENTITIES = "xhtml"
985
# TODO: This only exists for backwards-compatibility
986
ALL_ENTITIES = XHTML_ENTITIES
988
# Used when determining whether a text node is all whitespace and
989
# can be replaced with a single space. A text node that contains
990
# fancy Unicode spaces (usually non-breaking) should be left
992
STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }
901
994
def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
902
995
markupMassage=True, smartQuotesTo=XML_ENTITIES,
903
996
convertEntities=None, selfClosingTags=None):
904
997
"""The Soup object is initialized as the 'root tag', and the
905
998
provided markup (which can be a string or a file-like object)
906
is fed into the underlying parser.
999
is fed into the underlying parser.
908
1001
sgmllib will process most bad HTML, and the BeautifulSoup
909
1002
class has some tricks for dealing with some HTML that kills
930
1023
self.fromEncoding = fromEncoding
931
1024
self.smartQuotesTo = smartQuotesTo
932
1025
self.convertEntities = convertEntities
1026
# Set the rules for how we'll deal with the entities we
933
1028
if self.convertEntities:
934
1029
# It doesn't make sense to convert encoded characters to
935
1030
# entities even while you're converting entities to Unicode.
936
1031
# Just convert it all to Unicode.
937
1032
self.smartQuotesTo = None
1033
if convertEntities == self.HTML_ENTITIES:
1034
self.convertXMLEntities = False
1035
self.convertHTMLEntities = True
1036
self.escapeUnrecognizedEntities = True
1037
elif convertEntities == self.XHTML_ENTITIES:
1038
self.convertXMLEntities = True
1039
self.convertHTMLEntities = True
1040
self.escapeUnrecognizedEntities = False
1041
elif convertEntities == self.XML_ENTITIES:
1042
self.convertXMLEntities = True
1043
self.convertHTMLEntities = False
1044
self.escapeUnrecognizedEntities = False
1046
self.convertXMLEntities = False
1047
self.convertHTMLEntities = False
1048
self.escapeUnrecognizedEntities = False
938
1050
self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
939
1051
SGMLParser.__init__(self)
941
1053
if hasattr(markup, 'read'): # It's a file-type object.
942
1054
markup = markup.read()
943
1055
self.markup = markup
947
1059
except StopParsing:
949
1061
self.markup = None # The markup can now be GCed
1063
def convert_charref(self, name):
1064
"""This method fixes a bug in Python's SGMLParser."""
1069
if not 0 <= n <= 127 : # ASCII ends at 127, not 255
1071
return self.convert_codepoint(n)
951
1073
def _feed(self, inDocumentEncoding=None):
952
1074
# Convert the document to Unicode.
953
1075
markup = self.markup
964
1086
if self.markupMassage:
965
1087
if not isList(self.markupMassage):
966
self.markupMassage = self.MARKUP_MASSAGE
1088
self.markupMassage = self.MARKUP_MASSAGE
967
1089
for fix, m in self.markupMassage:
968
1090
markup = fix.sub(m, markup)
1091
# TODO: We get rid of markupMassage so that the
1092
# soup object can be deepcopied later on. Some
1093
# Python installations can't copy regexes. If anyone
1094
# was relying on the existence of markupMassage, this
1095
# might cause problems.
1096
del(self.markupMassage)
971
1099
SGMLParser.feed(self, markup)
1020
1148
def pushTag(self, tag):
1021
1149
#print "Push", tag.name
1022
1150
if self.currentTag:
1023
self.currentTag.append(tag)
1151
self.currentTag.contents.append(tag)
1024
1152
self.tagStack.append(tag)
1025
1153
self.currentTag = self.tagStack[-1]
1027
1155
def endData(self, containerClass=NavigableString):
1028
1156
if self.currentData:
1029
1157
currentData = ''.join(self.currentData)
1030
if not currentData.strip():
1158
if not currentData.translate(self.STRIP_ASCII_SPACES):
1031
1159
if '\n' in currentData:
1032
1160
currentData = '\n'
1076
1204
comes between this tag and the previous tag of this type.
1079
<p>Foo<b>Bar<p> should pop to 'p', not 'b'.
1080
<p>Foo<table>Bar<p> should pop to 'table', not 'p'.
1081
<p>Foo<table><tr>Bar<p> should pop to 'tr', not 'p'.
1082
<p>Foo<b>Bar<p> should pop to 'p', not 'b'.
1207
<p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
1208
<p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.
1209
<p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.
1084
1211
<li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
1085
1212
<tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
1102
1229
and p.name in nestingResetTriggers) \
1103
1230
or (nestingResetTriggers == None and isResetNesting
1104
1231
and self.RESET_NESTING_TAGS.has_key(p.name)):
1106
1233
#If we encounter one of the nesting reset triggers
1107
1234
#peculiar to this tag, or we encounter another tag
1108
1235
#that causes nesting to reset, pop up to but not
1121
1248
#print "<%s> is not real!" % name
1122
1249
attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs))
1123
1250
self.handle_data('<%s%s>' % (name, attrs))
1127
1254
if not self.isSelfClosingTag(name) and not selfClosing:
1172
1299
object, possibly one with a %SOUP-ENCODING% slot into which an
1173
1300
encoding will be plugged later."""
1174
1301
if text[:3] == "xml":
1175
text = "xml version='1.0' encoding='%SOUP-ENCODING%'"
1302
text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
1176
1303
self._toStringSubclass(text, ProcessingInstruction)
1178
1305
def handle_comment(self, text):
1191
1317
def handle_entityref(self, ref):
1192
1318
"""Handle entity references as data, possibly converting known
1193
HTML entity references to the corresponding Unicode
1319
HTML and/or XML entity references to the corresponding Unicode
1196
if self.convertEntities == self.HTML_ENTITIES or \
1197
(self.convertEntities == self.XML_ENTITIES and \
1198
self.XML_ENTITY_LIST.get(ref)):
1322
if self.convertHTMLEntities:
1200
1324
data = unichr(name2codepoint[ref])
1201
1325
except KeyError:
1328
if not data and self.convertXMLEntities:
1329
data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)
1331
if not data and self.convertHTMLEntities and \
1332
not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
1333
# TODO: We've got a problem here. We're told this is
1334
# an entity reference, but it's not an XML entity
1335
# reference or an HTML entity reference. Nonetheless,
1336
# the logical thing to do is to pass it through as an
1337
# unrecognized entity reference.
1339
# Except: when the input is "&carol;" this function
1340
# will be called with input "carol". When the input is
1341
# "AT&T", this function will be called with input
1342
# "T". We have no way of knowing whether a semicolon
1343
# was present originally, so we don't know whether
1344
# this is an unknown entity or just a misplaced
1347
# The more common case is a misplaced ampersand, so I
1348
# escape the ampersand and omit the trailing semicolon.
1349
data = "&%s" % ref
1351
# This case is different from the one above, because we
1352
# haven't already gone through a supposedly comprehensive
1353
# mapping of entities to Unicode characters. We might not
1354
# have gone through any mapping at all. So the chances are
1355
# very high that this is a real entity, and not a
1356
# misplaced ampersand.
1205
1358
self.handle_data(data)
1207
1360
def handle_decl(self, data):
1208
1361
"Handle DOCTYPEs and the like as Declaration objects."
1209
1362
self._toStringSubclass(data, Declaration)
1285
1438
['br' , 'hr', 'input', 'img', 'meta',
1286
1439
'spacer', 'link', 'frame', 'base'])
1288
QUOTE_TAGS = {'script': None}
1441
QUOTE_TAGS = {'script' : None, 'textarea' : None}
1290
1443
#According to the HTML standard, each of these inline tags can
1291
1444
#contain another tag of the same type. Furthermore, it's common
1292
1445
#to actually use these tags this way.
1298
1451
#to actually use these tags this way.
1299
1452
NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del']
1301
#Lists can contain other lists, but there are restrictions.
1454
#Lists can contain other lists, but there are restrictions.
1302
1455
NESTABLE_LIST_TAGS = { 'ol' : [],
1304
1457
'li' : ['ul', 'ol'],
1307
1460
'dt' : ['dl'] }
1309
#Tables can contain other tables, but there are restrictions.
1310
NESTABLE_TABLE_TAGS = {'table' : [],
1462
#Tables can contain other tables, but there are restrictions.
1463
NESTABLE_TABLE_TAGS = {'table' : [],
1311
1464
'tr' : ['table', 'tbody', 'tfoot', 'thead'],
1453
1606
parent = self.tagStack[-2]
1454
1607
parent._getAttrMap()
1455
1608
if (isinstance(tag, Tag) and len(tag.contents) == 1 and
1456
isinstance(tag.contents[0], NavigableString) and
1609
isinstance(tag.contents[0], NavigableString) and
1457
1610
not parent.attrMap.has_key(tag.name)):
1458
1611
parent[tag.name] = tag.contents[0]
1459
1612
BeautifulStoneSoup.popTag(self)
1463
1616
#and "unprofessional" for use in enterprise screen-scraping. We feel
1464
1617
#your pain! For such-minded folk, the Beautiful Soup Consortium And
1465
1618
#All-Night Kosher Bakery recommends renaming this file to
1466
#"RobustParser.py" (or, in cases of extreme enterprisitude,
1619
#"RobustParser.py" (or, in cases of extreme enterprisiness,
1467
1620
#"RobustParserBeanInterface.class") and using the following
1468
1621
#enterprise-friendly class aliases:
1469
1622
class RobustXMLParser(BeautifulStoneSoup):
1530
1682
self.triedEncodings = []
1531
1683
if markup == '' or isinstance(markup, unicode):
1532
1684
self.originalEncoding = None
1533
self.unicode = unicode(markup)
1685
self.unicode = unicode(markup)
1537
1689
for proposedEncoding in overrideEncodings:
1538
1690
u = self._convertFrom(proposedEncoding)
1541
1693
for proposedEncoding in (documentEncoding, sniffedEncoding):
1542
1694
u = self._convertFrom(proposedEncoding)
1545
1697
# If no luck and we have auto-detection library, try that:
1546
1698
if not u and chardet and not isinstance(self.markup, unicode):
1547
1699
u = self._convertFrom(chardet.detect(self.markup)['encoding'])
1551
1703
for proposed_encoding in ("utf-8", "windows-1252"):
1552
1704
u = self._convertFrom(proposed_encoding)
1707
self.originalEncoding = None
1554
1709
self.unicode = u
1555
if not u: self.originalEncoding = None
1557
1711
def _subMSChar(self, orig):
1558
1712
"""Changes a MS smart quote character to an XML or HTML
1585
1739
# print "Trying to convert document to %s" % proposed
1586
1740
u = self._toUnicode(markup, proposed)
1588
1742
self.originalEncoding = proposed
1589
1743
except Exception, e:
1590
1744
# print "That didn't work!"
1593
1747
#print "Correct encoding: %s" % proposed
1594
1748
return self.markup
1617
1771
data = data[4:]
1618
1772
newdata = unicode(data, encoding)
1621
1775
def _detectEncoding(self, xml_data):
1622
1776
"""Given a document, tries to detect its XML encoding."""
1623
1777
xml_encoding = sniffed_xml_encoding = None