3
"The Screen-Scraper's Friend"
4
http://www.crummy.com/software/BeautifulSoup/
6
Beautiful Soup parses a (possibly invalid) XML or HTML document into a
7
tree representation. It provides methods and Pythonic idioms that make
8
it easy to navigate, search, and modify the tree.
10
A well-formed XML/HTML document yields a well-formed data
11
structure. An ill-formed XML/HTML document yields a correspondingly
12
ill-formed data structure. If your document is only locally
13
well-formed, you can use this library to find and process the
14
well-formed part of it. The BeautifulSoup class
16
Beautiful Soup works with Python 2.2 and up. It has no external
17
dependencies, but you'll have more success at converting data to UTF-8
18
if you also install these three packages:
20
* chardet, for auto-detecting character encodings
21
http://chardet.feedparser.org/
22
* cjkcodecs and iconv_codec, which add more encodings to the ones supported
24
http://cjkpython.i18n.org/
26
Beautiful Soup defines classes for two main parsing strategies:
28
* BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
29
language that kind of looks like XML.
31
* BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
32
or invalid. This class has web browser-like heuristics for
33
obtaining a sensible parse tree in the face of common HTML errors.
35
Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
36
the encoding of an HTML or XML document, and converting it to
37
Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
39
For more than you ever wanted to know about Beautiful Soup, see the
41
http://www.crummy.com/software/BeautifulSoup/documentation.html
44
from __future__ import generators

__author__ = "Leonard Richardson (leonardr@segfault.org)"
__copyright__ = "Copyright (c) 2004-2007 Leonard Richardson"
import re
import sgmllib
import types

from sgmllib import SGMLParser, SGMLParseError
from htmlentitydefs import name2codepoint
#This hack makes Beautiful Soup able to parse XML with namespaces
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')

DEFAULT_OUTPUT_ENCODING = "utf-8"

# First, the classes that represent markup elements.
class PageElement:
    """Contains the navigational information for some part of the page
    (either a tag or a piece of text)"""
def setup(self, parent=None, previous=None):
    """Sets up the initial relations between this element and
    other elements."""
    self.parent = parent
    self.previous = previous
    self.next = None
    self.previousSibling = None
    self.nextSibling = None
    if self.parent and self.parent.contents:
        self.previousSibling = self.parent.contents[-1]
        self.previousSibling.nextSibling = self
def replaceWith(self, replaceWith):
    """Replaces this element in the tree with the given element."""
    oldParent = self.parent
    myIndex = self.parent.contents.index(self)
    if hasattr(replaceWith, 'parent') and replaceWith.parent == self.parent:
        # We're replacing this element with one of its siblings.
        index = self.parent.contents.index(replaceWith)
        if index and index < myIndex:
            # Furthermore, it comes before this element. That
            # means that when we extract it, the index of this
            # element will change.
            myIndex = myIndex - 1
    self.extract()
    oldParent.insert(myIndex, replaceWith)
def extract(self):
    """Destructively rips this element out of the tree."""
    if self.parent:
        self.parent.contents.remove(self)

    #Find the two elements that would be next to each other if
    #this element (and any children) hadn't been parsed. Connect
    #the two.
    lastChild = self._lastRecursiveChild()
    nextElement = lastChild.next

    if self.previous:
        self.previous.next = nextElement
    if nextElement:
        nextElement.previous = self.previous
    self.previous = None
    lastChild.next = None

    self.parent = None
    if self.previousSibling:
        self.previousSibling.nextSibling = self.nextSibling
    if self.nextSibling:
        self.nextSibling.previousSibling = self.previousSibling
    self.previousSibling = self.nextSibling = None
def _lastRecursiveChild(self):
    "Finds the last element beneath this object to be parsed."
    lastChild = self
    while hasattr(lastChild, 'contents') and lastChild.contents:
        lastChild = lastChild.contents[-1]
    return lastChild
def insert(self, position, newChild):
    """Inserts the given child at the given position, fixing up all
    next/previous/sibling links in the parse tree.
    NOTE(review): several lines of this method were lost to file
    corruption and have been restored from context — verify against
    upstream history."""
    if (isinstance(newChild, basestring)
        or isinstance(newChild, unicode)) \
        and not isinstance(newChild, NavigableString):
        newChild = NavigableString(newChild)

    position = min(position, len(self.contents))
    if hasattr(newChild, 'parent') and newChild.parent != None:
        # We're 'inserting' an element that's already one
        # of this object's children.
        if newChild.parent == self:
            index = self.find(newChild)
            if index and index < position:
                # Furthermore we're moving it further down the
                # list of this object's children. That means that
                # when we extract this element, our target index
                # will jump down one.
                position = position - 1
        newChild.extract()

    newChild.parent = self
    previousChild = None
    if position == 0:
        newChild.previousSibling = None
        newChild.previous = self
    else:
        previousChild = self.contents[position-1]
        newChild.previousSibling = previousChild
        newChild.previousSibling.nextSibling = newChild
        newChild.previous = previousChild._lastRecursiveChild()
    if newChild.previous:
        newChild.previous.next = newChild

    newChildsLastElement = newChild._lastRecursiveChild()

    if position >= len(self.contents):
        newChild.nextSibling = None

        parent = self
        parentsNextSibling = None
        while not parentsNextSibling:
            parentsNextSibling = parent.nextSibling
            parent = parent.parent
            if not parent: # This is the last element in the document.
                break
        if parentsNextSibling:
            newChildsLastElement.next = parentsNextSibling
        else:
            newChildsLastElement.next = None
    else:
        nextChild = self.contents[position]
        newChild.nextSibling = nextChild
        if newChild.nextSibling:
            newChild.nextSibling.previousSibling = newChild
        newChildsLastElement.next = nextChild

    if newChildsLastElement.next:
        newChildsLastElement.next.previous = newChildsLastElement
    self.contents.insert(position, newChild)
def findNext(self, name=None, attrs={}, text=None, **kwargs):
    """Returns the first item that matches the given criteria and
    appears after this Tag in the document."""
    return self._findOne(self.findAllNext, name, attrs, text, **kwargs)
def findAllNext(self, name=None, attrs={}, text=None, limit=None,
                **kwargs):
    """Returns all items that match the given criteria and appear
    after this Tag in the document."""
    return self._findAll(name, attrs, text, limit, self.nextGenerator)
fetchNext = findAllNext # Compatibility with pre-3.x
def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
    """Returns the closest sibling to this Tag that matches the
    given criteria and appears after this Tag in the document."""
    return self._findOne(self.findNextSiblings, name, attrs, text,
                         **kwargs)
def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
                     **kwargs):
    """Returns the siblings of this Tag that match the given
    criteria and appear after this Tag in the document."""
    return self._findAll(name, attrs, text, limit,
                         self.nextSiblingGenerator, **kwargs)
fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x
def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
    """Returns the first item that matches the given criteria and
    appears before this Tag in the document."""
    return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)
def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
                    **kwargs):
    """Returns all items that match the given criteria and appear
    before this Tag in the document."""
    return self._findAll(name, attrs, text, limit, self.previousGenerator,
                         **kwargs)
fetchPrevious = findAllPrevious # Compatibility with pre-3.x
def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
    """Returns the closest sibling to this Tag that matches the
    given criteria and appears before this Tag in the document."""
    return self._findOne(self.findPreviousSiblings, name, attrs, text,
                         **kwargs)
def findPreviousSiblings(self, name=None, attrs={}, text=None,
                         limit=None, **kwargs):
    """Returns the siblings of this Tag that match the given
    criteria and appear before this Tag in the document."""
    return self._findAll(name, attrs, text, limit,
                         self.previousSiblingGenerator, **kwargs)
fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x
def findParent(self, name=None, attrs={}, **kwargs):
    """Returns the closest parent of this Tag that matches the given
    criteria."""
    # NOTE: We can't use _findOne because findParents takes a different
    # set of arguments.
    r = None
    l = self.findParents(name, attrs, 1)
    if l:
        r = l[0]
    return r
def findParents(self, name=None, attrs={}, limit=None, **kwargs):
    """Returns the parents of this Tag that match the given
    criteria."""
    return self._findAll(name, attrs, None, limit, self.parentGenerator,
                         **kwargs)
fetchParents = findParents # Compatibility with pre-3.x
264
#These methods do the real heavy lifting.
def _findOne(self, method, name, attrs, text, **kwargs):
    # Runs the given findAll-style method with limit=1 and unwraps
    # the single result (or returns None when nothing matched).
    r = None
    l = method(name, attrs, text, 1, **kwargs)
    if l:
        r = l[0]
    return r
def _findAll(self, name, attrs, text, limit, generator, **kwargs):
    "Iterates over a generator looking for things that match."
    if isinstance(name, SoupStrainer):
        strainer = name
    else:
        # Build a SoupStrainer
        strainer = SoupStrainer(name, attrs, text, **kwargs)
    results = ResultSet(strainer)
    g = generator()
    while True:
        try:
            i = g.next()
        except StopIteration:
            break
        if i:
            found = strainer.search(i)
            if found:
                results.append(found)
                # Stop early once the caller's limit is reached.
                if limit and len(results) >= limit:
                    break
    return results
#These Generators can be used to navigate starting from both
#NavigableStrings and Tags.
def nextGenerator(self):
    i = self
    while i:
        i = i.next
        yield i

def nextSiblingGenerator(self):
    i = self
    while i:
        i = i.nextSibling
        yield i

def previousGenerator(self):
    i = self
    while i:
        i = i.previous
        yield i

def previousSiblingGenerator(self):
    i = self
    while i:
        i = i.previousSibling
        yield i

def parentGenerator(self):
    i = self
    while i:
        i = i.parent
        yield i
def substituteEncoding(self, str, encoding=None):
    # Replaces the %SOUP-ENCODING% placeholder with the actual
    # output encoding (defaulting to UTF-8).
    encoding = encoding or "utf-8"
    return str.replace("%SOUP-ENCODING%", encoding)
def toEncoding(self, s, encoding=None):
    """Encodes an object to a string in some encoding, or to Unicode.
    ."""
    if isinstance(s, unicode):
        if encoding:
            s = s.encode(encoding)
    elif isinstance(s, str):
        if encoding:
            s = s.encode(encoding)
        else:
            s = unicode(s)
    else:
        # Not a string at all: stringify first, then recurse.
        if encoding:
            s = self.toEncoding(str(s), encoding)
        else:
            s = unicode(s)
    return s
351
class NavigableString(unicode, PageElement):
353
def __getattr__(self, attr):
354
"""text.string gives you text. This is for backwards
355
compatibility for Navigable*String, but for CData* it lets you
356
get the string without the CData wrapper."""
360
raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
def __unicode__(self):
    # Passing encoding=None asks __str__ for the raw Unicode form.
    return self.__str__(None)
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
    # With no encoding, return the string itself (a unicode object).
    if encoding:
        return self.encode(encoding)
    else:
        return self
class CData(NavigableString):
    """A CDATA section; rendered wrapped in <![CDATA[...]]>."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        return "<![CDATA[%s]]>" % NavigableString.__str__(self, encoding)
class ProcessingInstruction(NavigableString):
    """A processing instruction; rendered wrapped in <?...?>, with the
    %SOUP-ENCODING% placeholder substituted at render time."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        output = self
        if "%SOUP-ENCODING%" in output:
            output = self.substituteEncoding(output, encoding)
        return "<?%s?>" % self.toEncoding(output, encoding)
class Comment(NavigableString):
    """An HTML/XML comment; rendered wrapped in <!--...-->."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        return "<!--%s-->" % NavigableString.__str__(self, encoding)
class Declaration(NavigableString):
    """A declaration (e.g. DOCTYPE); rendered wrapped in <!...>."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        return "<!%s>" % NavigableString.__str__(self, encoding)
class Tag(PageElement):

    """Represents a found HTML tag with its attributes and contents."""

    # Entity names used when escaping attribute values.
    # NOTE(review): entries after the first were restored from context —
    # verify against upstream history.
    XML_SPECIAL_CHARS_TO_ENTITIES = { "'" : "squot",
                                      '"' : "quote",
                                      "&" : "amp",
                                      "<" : "lt",
                                      ">" : "gt" }
def __init__(self, parser, name, attrs=None, parent=None,
             previous=None):
    "Basic constructor."

    # We don't actually store the parser object: that lets extracted
    # chunks be garbage-collected
    self.parserClass = parser.__class__
    self.isSelfClosing = parser.isSelfClosingTag(name)
    self.name = name
    if attrs == None:
        attrs = []
    self.attrs = attrs
    self.contents = []
    self.setup(parent, previous)
    self.hidden = False
    self.containsSubstitutions = False
def get(self, key, default=None):
    """Returns the value of the 'key' attribute for the tag, or
    the value given for 'default' if it doesn't have that
    attribute."""
    return self._getAttrMap().get(key, default)
def has_key(self, key):
    # Dict-style membership test against the tag's attributes.
    return self._getAttrMap().has_key(key)
def __getitem__(self, key):
    """tag[key] returns the value of the 'key' attribute for the tag,
    and throws an exception if it's not there."""
    return self._getAttrMap()[key]
def __iter__(self):
    "Iterating over a tag iterates over its contents."
    return iter(self.contents)
def __len__(self):
    "The length of a tag is the length of its list of contents."
    return len(self.contents)
def __contains__(self, x):
    # Membership is membership in the tag's direct contents.
    return x in self.contents
def __nonzero__(self):
    "A tag is non-None even if it has no contents."
    return True
def __setitem__(self, key, value):
    """Setting tag[key] sets the value of the 'key' attribute for the
    tag."""
    self._getAttrMap()
    self.attrMap[key] = value
    found = False
    # Update an existing (key, value) pair in the ordered attr list...
    for i in range(0, len(self.attrs)):
        if self.attrs[i][0] == key:
            self.attrs[i] = (key, value)
            found = True
    # ...or append a new one.
    if not found:
        self.attrs.append((key, value))
    self._getAttrMap()[key] = value
def __delitem__(self, key):
    "Deleting tag[key] deletes all 'key' attributes for the tag."
    # Iterate over a copy: removing items from a list while iterating
    # it skips the element after each removal.
    for item in self.attrs[:]:
        if item[0] == key:
            self.attrs.remove(item)
            #We don't break because bad HTML can define the same
            #attribute multiple times.
    self._getAttrMap()
    if self.attrMap.has_key(key):
        del self.attrMap[key]
def __call__(self, *args, **kwargs):
    """Calling a tag like a function is the same as calling its
    findAll() method. Eg. tag('a') returns a list of all the A tags
    found within this tag."""
    # Direct call instead of the deprecated apply() builtin;
    # behavior is identical.
    return self.findAll(*args, **kwargs)
478
def __getattr__(self, tag):
479
#print "Getattr %s.%s" % (self.__class__, tag)
480
if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
481
return self.find(tag[:-3])
482
elif tag.find('__') != 0:
483
return self.find(tag)
def __eq__(self, other):
    """Returns true iff this tag has the same name, the same attributes,
    and the same contents (recursively) as the given tag.

    NOTE: right now this will return false if two tags have the
    same attributes in a different order. Should this be fixed?"""
    if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
        return False
    for i in range(0, len(self.contents)):
        if self.contents[i] != other.contents[i]:
            return False
    return True
def __ne__(self, other):
    """Returns true iff this tag is not identical to the other tag,
    as defined in __eq__."""
    return not self == other
def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
    """Renders this tag as a string."""
    return self.__str__(encoding)
def __unicode__(self):
    # encoding=None asks __str__ for the Unicode form.
    return self.__str__(None)
510
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
511
prettyPrint=False, indentLevel=0):
512
"""Returns a string or Unicode representation of this tag and
513
its contents. To get Unicode, pass None for encoding.
515
NOTE: since Python's HTML parser consumes whitespace, this
516
method is not certain to reproduce the whitespace present in
517
the original string."""
519
encodedName = self.toEncoding(self.name, encoding)
523
for key, val in self.attrs:
526
if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
527
val = self.substituteEncoding(val, encoding)
529
# The attribute value either:
531
# * Contains no embedded double quotes or single quotes.
532
# No problem: we enclose it in double quotes.
533
# * Contains embedded single quotes. No problem:
534
# double quotes work here too.
535
# * Contains embedded double quotes. No problem:
536
# we enclose it in single quotes.
537
# * Embeds both single _and_ double quotes. This
538
# can't happen naturally, but it can happen if
539
# you modify an attribute value after parsing
540
# the document. Now we have a bit of a
541
# problem. We solve it by enclosing the
542
# attribute in single quotes, and escaping any
543
# embedded single quotes to XML entities.
546
# This can't happen naturally, but it can happen
547
# if you modify an attribute value after parsing.
549
val = val.replace("'", "&squot;")
551
# Now we're okay w/r/t quotes. But the attribute
552
# value might also contain angle brackets, or
553
# ampersands that aren't part of entities. We need
554
# to escape those to XML entities too.
555
val = re.sub("([<>]|&(?![^\s]+;))",
556
lambda x: "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";",
559
attrs.append(fmt % (self.toEncoding(key, encoding),
560
self.toEncoding(val, encoding)))
563
if self.isSelfClosing:
566
closeTag = '</%s>' % encodedName
568
indentTag, indentContents = 0, 0
570
indentTag = indentLevel
571
space = (' ' * (indentTag-1))
572
indentContents = indentTag + 1
573
contents = self.renderContents(encoding, prettyPrint, indentContents)
580
attributeString = ' ' + ' '.join(attrs)
583
s.append('<%s%s%s>' % (encodedName, attributeString, close))
587
if prettyPrint and contents and contents[-1] != "\n":
589
if prettyPrint and closeTag:
592
if prettyPrint and closeTag and self.nextSibling:
def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
    # Renders the tag with prettyPrint=True (one element per line,
    # indented).
    return self.__str__(encoding, True)
def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                   prettyPrint=False, indentLevel=0):
    """Renders the contents of this tag as a string in the given
    encoding. If encoding is None, returns a Unicode string..
    NOTE(review): several lines were lost to file corruption and have
    been restored from context — verify against upstream history."""
    s = []
    for c in self.contents:
        text = None
        if isinstance(c, NavigableString):
            text = c.__str__(encoding)
        elif isinstance(c, Tag):
            s.append(c.__str__(encoding, prettyPrint, indentLevel))
        if text and prettyPrint:
            text = text.strip()
        if text:
            if prettyPrint:
                s.append(" " * (indentLevel-1))
            s.append(text)
            if prettyPrint:
                s.append("\n")
    return ''.join(s)
def find(self, name=None, attrs={}, recursive=True, text=None,
         **kwargs):
    """Return only the first child of this Tag matching the given
    criteria."""
    r = None
    l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
    if l:
        r = l[0]
    return r
def findAll(self, name=None, attrs={}, recursive=True, text=None,
            limit=None, **kwargs):
    """Extracts a list of Tag objects that match the given
    criteria. You can specify the name of the Tag and any
    attributes you want the Tag to have.

    The value of a key-value pair in the 'attrs' map can be a
    string, a list of strings, a regular expression object, or a
    callable that takes a string and returns whether or not the
    string matches for some custom definition of 'matches'. The
    same is true of the tag name."""
    generator = self.recursiveChildGenerator
    if not recursive:
        generator = self.childGenerator
    return self._findAll(name, attrs, text, limit, generator, **kwargs)
findChildren = findAll
651
# Pre-3.x compatibility methods
def fetchText(self, text=None, recursive=True, limit=None):
    # Pre-3.x compatibility alias for a text-only findAll().
    return self.findAll(text=text, recursive=recursive, limit=limit)
def firstText(self, text=None, recursive=True):
    # Pre-3.x compatibility alias for a text-only find().
    return self.find(text=text, recursive=recursive)
663
def append(self, tag):
664
"""Appends the given tag to the contents of this tag."""
665
self.contents.append(tag)
669
def _getAttrMap(self):
670
"""Initializes a map representation of this tag's attributes,
671
if not already initialized."""
672
if not getattr(self, 'attrMap'):
674
for (key, value) in self.attrs:
675
self.attrMap[key] = value
679
def childGenerator(self):
680
for i in range(0, len(self.contents)):
681
yield self.contents[i]
684
def recursiveChildGenerator(self):
687
tag, start = stack.pop()
688
if isinstance(tag, Tag):
689
for i in range(start, len(tag.contents)):
692
if isinstance(a, Tag) and tag.contents:
693
if i < len(tag.contents) - 1:
694
stack.append((tag, i+1))
699
# Next, a couple classes to represent queries and their results.
class SoupStrainer:
    """Encapsulates a number of ways of matching a markup element (tag or
    text)."""
def __init__(self, name=None, attrs={}, text=None, **kwargs):
    # NOTE(review): most of this constructor was lost to file
    # corruption and has been restored from context — verify against
    # upstream history.
    self.name = name
    # A bare string for 'attrs' is shorthand for a CSS class match.
    if isString(attrs):
        kwargs['class'] = attrs
        attrs = None
    if kwargs:
        if attrs:
            attrs = attrs.copy()
            attrs.update(kwargs)
        else:
            attrs = kwargs
    self.attrs = attrs
    self.text = text
def __str__(self):
    # A text strainer prints its text; a tag strainer prints
    # name|attrs.
    if self.text:
        return self.text
    else:
        return "%s|%s" % (self.name, self.attrs)
def searchTag(self, markupName=None, markupAttrs={}):
    """Matches a tag (or a prospective tag given as name + attrs)
    against this strainer's name and attribute criteria.
    NOTE(review): many lines were lost to file corruption and have
    been restored from context — verify against upstream history."""
    found = None
    markup = None
    if isinstance(markupName, Tag):
        markup = markupName
        markupAttrs = markup
    callFunctionWithTagData = callable(self.name) \
                            and not isinstance(markupName, Tag)

    if (not self.name) \
           or callFunctionWithTagData \
           or (markup and self._matches(markup, self.name)) \
           or (not markup and self._matches(markupName, self.name)):
        if callFunctionWithTagData:
            match = self.name(markupName, markupAttrs)
        else:
            match = True
            markupAttrMap = None
            for attr, matchAgainst in self.attrs.items():
                if not markupAttrMap:
                    if hasattr(markupAttrs, 'get'):
                        markupAttrMap = markupAttrs
                    else:
                        markupAttrMap = {}
                        for k,v in markupAttrs:
                            markupAttrMap[k] = v
                attrValue = markupAttrMap.get(attr)
                if not self._matches(attrValue, matchAgainst):
                    match = False
                    break
        if match:
            if markup:
                found = markup
            else:
                found = markupName
    return found
761
def search(self, markup):
762
#print 'looking for %s in %s' % (self, markup)
764
# If given a list of items, scan it for a text element that
766
if isList(markup) and not isinstance(markup, Tag):
767
for element in markup:
768
if isinstance(element, NavigableString) \
769
and self.search(element):
772
# If it's a Tag, make sure its name or attributes match.
773
# Don't bother with Tags if we're searching for text.
774
elif isinstance(markup, Tag):
776
found = self.searchTag(markup)
777
# If it's text, make sure the text matches.
778
elif isinstance(markup, NavigableString) or \
780
if self._matches(markup, self.text):
783
raise Exception, "I don't know how to match against a %s" \
def _matches(self, markup, matchAgainst):
    #print "Matching %s against %s" % (markup, matchAgainst)
    result = False
    if matchAgainst == True and type(matchAgainst) == types.BooleanType:
        # matchAgainst is the literal True: match anything non-None.
        result = markup != None
    elif callable(matchAgainst):
        result = matchAgainst(markup)
    else:
        #Custom match methods take the tag as an argument, but all
        #other ways of matching match the tag name as a string.
        if isinstance(markup, Tag):
            markup = markup.name
        if markup and not isString(markup):
            markup = unicode(markup)
        #Now we know that chunk is either a string, or None.
        if hasattr(matchAgainst, 'match'):
            # It's a regexp object.
            result = markup and matchAgainst.search(markup)
        elif isList(matchAgainst):
            result = markup in matchAgainst
        elif hasattr(matchAgainst, 'items'):
            result = markup.has_key(matchAgainst)
        elif matchAgainst and isString(markup):
            # Normalize the comparison value to the markup's string
            # type before the equality check below.
            if isinstance(markup, unicode):
                matchAgainst = unicode(matchAgainst)
            else:
                matchAgainst = str(matchAgainst)
        if not result:
            result = matchAgainst == markup
    return result
class ResultSet(list):
    """A ResultSet is just a list that keeps track of the SoupStrainer
    that created it."""
    def __init__(self, source):
        list.__init__([])
        self.source = source
826
# Now, some helper functions.
def isList(l):
    """Convenience method that works with all 2.x versions of Python
    to determine whether or not something is listlike."""
    return hasattr(l, '__iter__') \
           or (type(l) in (types.ListType, types.TupleType))
def isString(s):
    """Convenience method that works with all 2.x versions of Python
    to determine whether or not something is stringlike."""
    try:
        # Fixed typo: the corrupted source read "isintance".
        return isinstance(s, unicode) or isinstance(s, basestring)
    except NameError:
        return isinstance(s, str)
def buildTagMap(default, *args):
    """Turns a list of maps, lists, or scalars into a single map.
    Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
    NESTING_RESET_TAGS maps out of lists and partial maps."""
    built = {}
    for portion in args:
        if hasattr(portion, 'items'):
            #It's a map. Merge it.
            for k,v in portion.items():
                built[k] = v
        elif isList(portion):
            #It's a list. Map each item to the default.
            for k in portion:
                built[k] = default
        else:
            #It's a scalar. Map it to the default.
            built[portion] = default
    return built
861
# Now, the parser classes.
863
class BeautifulStoneSoup(Tag, SGMLParser):
865
"""This class contains the basic parser and search code. It defines
866
a parser that knows nothing about tag behavior except for the
869
You can't close a tag without closing all the tags it encloses.
870
That is, "<foo><bar></foo>" actually means
871
"<foo><bar></bar></foo>".
873
[Another possible explanation is "<foo><bar /></foo>", but since
874
this class defines no SELF_CLOSING_TAGS, it will never use that
877
This class is useful for parsing XML or made-up markup languages,
878
or when BeautifulSoup makes an assumption counter to what you were
XML_ENTITY_LIST = {}
for i in Tag.XML_SPECIAL_CHARS_TO_ENTITIES.values():
    XML_ENTITY_LIST[i] = True

SELF_CLOSING_TAGS = {}
NESTABLE_TAGS = {}
RESET_NESTING_TAGS = {}
QUOTE_TAGS = {}

# Regex fixups applied to incoming markup before parsing: they repair
# the two most common sgmllib-killing constructs (see __init__ docs).
MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
                   lambda x: x.group(1) + ' />'),
                  (re.compile('<!\s+([^<>]*)>'),
                   lambda x: '<!' + x.group(1) + '>')
                  ]

ROOT_TAG_NAME = u'[document]'

HTML_ENTITIES = "html"
XML_ENTITIES = "xml"
901
def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
902
markupMassage=True, smartQuotesTo=XML_ENTITIES,
903
convertEntities=None, selfClosingTags=None):
904
"""The Soup object is initialized as the 'root tag', and the
905
provided markup (which can be a string or a file-like object)
906
is fed into the underlying parser.
908
sgmllib will process most bad HTML, and the BeautifulSoup
909
class has some tricks for dealing with some HTML that kills
910
sgmllib, but Beautiful Soup can nonetheless choke or lose data
911
if your data uses self-closing tags or declarations
914
By default, Beautiful Soup uses regexes to sanitize input,
915
avoiding the vast majority of these problems. If the problems
916
don't apply to you, pass in False for markupMassage, and
917
you'll get better performance.
919
The default parser massage techniques fix the two most common
920
instances of invalid HTML that choke sgmllib:
922
<br/> (No space between name of closing tag and tag close)
923
<! --Comment--> (Extraneous whitespace in declaration)
925
You can pass in a custom list of (RE object, replace method)
926
tuples to get Beautiful Soup to scrub your input the way you
929
self.parseOnlyThese = parseOnlyThese
930
self.fromEncoding = fromEncoding
931
self.smartQuotesTo = smartQuotesTo
932
self.convertEntities = convertEntities
933
if self.convertEntities:
934
# It doesn't make sense to convert encoded characters to
935
# entities even while you're converting entities to Unicode.
936
# Just convert it all to Unicode.
937
self.smartQuotesTo = None
938
self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
939
SGMLParser.__init__(self)
941
if hasattr(markup, 'read'): # It's a file-type object.
942
markup = markup.read()
944
self.markupMassage = markupMassage
949
self.markup = None # The markup can now be GCed
951
def _feed(self, inDocumentEncoding=None):
    """Converts the stored markup to Unicode, applies the markup-massage
    fixups, and feeds it to the underlying SGMLParser.
    NOTE(review): several lines were lost to file corruption and have
    been restored from context — verify against upstream history."""
    # Convert the document to Unicode.
    markup = self.markup
    if isinstance(markup, unicode):
        if not hasattr(self, 'originalEncoding'):
            self.originalEncoding = None
    else:
        dammit = UnicodeDammit\
                 (markup, [self.fromEncoding, inDocumentEncoding],
                  smartQuotesTo=self.smartQuotesTo)
        markup = dammit.unicode
        self.originalEncoding = dammit.originalEncoding
    if markup:
        if self.markupMassage:
            if not isList(self.markupMassage):
                self.markupMassage = self.MARKUP_MASSAGE
            for fix, m in self.markupMassage:
                markup = fix.sub(m, markup)
    self.reset()

    SGMLParser.feed(self, markup)
    # Close out any unfinished strings and close all the open tags.
    self.endData()
    while self.currentTag.name != self.ROOT_TAG_NAME:
        self.popTag()
977
def __getattr__(self, methodName):
    """This method routes method call requests to either the SGMLParser
    superclass or the Tag superclass, depending on the method name."""
    #print "__getattr__ called on %s.%s" % (self.__class__, methodName)

    if methodName.find('start_') == 0 or methodName.find('end_') == 0 \
           or methodName.find('do_') == 0:
        return SGMLParser.__getattr__(self, methodName)
    elif methodName.find('__') != 0:
        return Tag.__getattr__(self, methodName)
    else:
        raise AttributeError
def isSelfClosingTag(self, name):
    """Returns true iff the given string is the name of a
    self-closing tag according to this parser."""
    return self.SELF_CLOSING_TAGS.has_key(name) \
           or self.instanceSelfClosingTags.has_key(name)
def reset(self):
    # Re-initialize both superclasses and all per-parse state so the
    # soup object can be fed a fresh document.
    Tag.__init__(self, self, self.ROOT_TAG_NAME)
    self.hidden = 1
    SGMLParser.reset(self)
    self.currentData = []
    self.currentTag = None
    self.tagStack = []
    self.quoteStack = []
    self.pushTag(self)
1007
def popTag(self):
    tag = self.tagStack.pop()
    # Tags with just one string-owning child get the child as a
    # 'string' property, so that soup.tag.string is shorthand for
    # soup.tag.contents[0]
    if len(self.currentTag.contents) == 1 and \
       isinstance(self.currentTag.contents[0], NavigableString):
        self.currentTag.string = self.currentTag.contents[0]

    #print "Pop", tag.name
    if self.tagStack:
        self.currentTag = self.tagStack[-1]
    return self.currentTag
1020
def pushTag(self, tag):
    #print "Push", tag.name
    if self.currentTag:
        self.currentTag.append(tag)
    self.tagStack.append(tag)
    self.currentTag = self.tagStack[-1]
1027
def endData(self, containerClass=NavigableString):
    """Flushes any accumulated character data into the tree as an
    instance of containerClass, wiring up the navigation links."""
    if self.currentData:
        currentData = ''.join(self.currentData)
        # Collapse all-whitespace runs to a single newline or space.
        if not currentData.strip():
            if '\n' in currentData:
                currentData = '\n'
            else:
                currentData = ' '
        self.currentData = []
        if self.parseOnlyThese and len(self.tagStack) <= 1 and \
               (not self.parseOnlyThese.text or \
                not self.parseOnlyThese.search(currentData)):
            return
        o = containerClass(currentData)
        o.setup(self.currentTag, self.previous)
        if self.previous:
            self.previous.next = o
        self.previous = o
        self.currentTag.contents.append(o)
1048
def _popToTag(self, name, inclusivePop=True):
    """Pops the tag stack up to and including the most recent
    instance of the given tag. If inclusivePop is false, pops the tag
    stack up to but *not* including the most recent instance of
    the given tag."""
    #print "Popping to %s" % name
    if name == self.ROOT_TAG_NAME:
        return

    numPops = 0
    mostRecentTag = None
    for i in range(len(self.tagStack)-1, 0, -1):
        if name == self.tagStack[i].name:
            numPops = len(self.tagStack)-i
            break
    if not inclusivePop:
        numPops = numPops - 1

    for i in range(0, numPops):
        mostRecentTag = self.popTag()
    return mostRecentTag
1070
def _smartPop(self, name):

    """We need to pop up to the previous tag of this type, unless
    one of this tag's nesting reset triggers comes between this
    tag and the previous tag of this type, OR unless this tag is a
    generic nesting trigger and another generic nesting trigger
    comes between this tag and the previous tag of this type.

    Examples:
    <p>Foo<b>Bar<p> should pop to 'p', not 'b'.
    <p>Foo<table>Bar<p> should pop to 'table', not 'p'.
    <p>Foo<table><tr>Bar<p> should pop to 'tr', not 'p'.
    <p>Foo<b>Bar<p> should pop to 'p', not 'b'.

    <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
    <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
    <td><tr><td> *<td>* should pop to 'tr', not the first 'td'

    NOTE(review): parts of this method were lost to file corruption
    and have been restored from context — verify against upstream
    history."""

    nestingResetTriggers = self.NESTABLE_TAGS.get(name)
    isNestable = nestingResetTriggers != None
    isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
    popTo = None
    inclusive = True
    for i in range(len(self.tagStack)-1, 0, -1):
        p = self.tagStack[i]
        if (not p or p.name == name) and not isNestable:
            #Non-nestable tags get popped to the top or to their
            #first parent.
            popTo = name
            break
        if (nestingResetTriggers != None
            and p.name in nestingResetTriggers) \
            or (nestingResetTriggers == None and isResetNesting
                and self.RESET_NESTING_TAGS.has_key(p.name)):
            #If we encounter one of the nesting reset triggers
            #peculiar to this tag, or we encounter another tag
            #that causes nesting to reset, pop up to but not
            #including that tag.
            popTo = p.name
            inclusive = False
            break
        p = p.parent
    if popTo:
        self._popToTag(popTo, inclusive)
1117
def unknown_starttag(self, name, attrs, selfClosing=0):
1118
#print "Start tag %s: %s" % (name, attrs)
1120
#This is not a real tag.
1121
#print "<%s> is not real!" % name
1122
attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs))
1123
self.handle_data('<%s%s>' % (name, attrs))
1127
if not self.isSelfClosingTag(name) and not selfClosing:
1128
self._smartPop(name)
1130
if self.parseOnlyThese and len(self.tagStack) <= 1 \
1131
and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
1134
tag = Tag(self, name, attrs, self.currentTag, self.previous)
1136
self.previous.next = tag
1139
if selfClosing or self.isSelfClosingTag(name):
1141
if name in self.QUOTE_TAGS:
1142
#print "Beginning quote (%s)" % name
1143
self.quoteStack.append(name)
1147
def unknown_endtag(self, name):
    #print "End tag %s" % name
    if self.quoteStack and self.quoteStack[-1] != name:
        #This is not a real end tag.
        #print "</%s> is not real!" % name
        self.handle_data('</%s>' % name)
        return
    self.endData()
    self._popToTag(name)
    if self.quoteStack and self.quoteStack[-1] == name:
        self.quoteStack.pop()
        self.literal = (len(self.quoteStack) > 0)
1160
def handle_data(self, data):
    # Character data is buffered until endData() flushes it.
    self.currentData.append(data)
1163
def _toStringSubclass(self, text, subclass):
1164
"""Adds a certain piece of text to the tree as a NavigableString
1167
self.handle_data(text)
1168
self.endData(subclass)
1170
def handle_pi(self, text):
    """Handle a processing instruction as a ProcessingInstruction
    object, possibly one with a %SOUP-ENCODING% slot into which an
    encoding will be plugged later."""
    if text[:3] == "xml":
        # Normalize the XML declaration so the real encoding can be
        # substituted once it is known.
        text = "xml version='1.0' encoding='%SOUP-ENCODING%'"
    self._toStringSubclass(text, ProcessingInstruction)
def handle_comment(self, text):
    "Handle comments as Comment objects."
    self._toStringSubclass(text, Comment)
def handle_charref(self, ref):
    """Handle character references as data, converting them to Unicode
    characters when entity conversion is enabled."""
    if self.convertEntities in [self.HTML_ENTITIES,
                                self.XML_ENTITIES]:
        data = unichr(int(ref))
    else:
        # Conversion disabled: keep the reference verbatim.
        data = '&#%s;' % ref
    self.handle_data(data)
def handle_entityref(self, ref):
    """Handle entity references as data, possibly converting known
    HTML entity references to the corresponding Unicode
    characters."""
    data = None
    if self.convertEntities == self.HTML_ENTITIES or \
       (self.convertEntities == self.XML_ENTITIES and \
        self.XML_ENTITY_LIST.get(ref)):
        try:
            data = unichr(name2codepoint[ref])
        except KeyError:
            # Unknown entity name: fall through to the literal form.
            pass
    if not data:
        data = '&%s;' % ref
    self.handle_data(data)
def handle_decl(self, data):
    "Handle DOCTYPEs and the like as Declaration objects."
    self._toStringSubclass(data, Declaration)
def parse_declaration(self, i):
    """Treat a bogus SGML declaration as raw data. Treat a CDATA
    declaration as a CData object.

    Returns the index just past the declaration, per the SGMLParser
    parse_declaration contract.
    """
    j = None
    if self.rawdata[i:i+9] == '<![CDATA[':
        k = self.rawdata.find(']]>', i)
        if k == -1:
            # Unterminated CDATA section: consume to end of input.
            k = len(self.rawdata)
        data = self.rawdata[i+9:k]
        j = k + 3
        self._toStringSubclass(data, CData)
    else:
        try:
            j = SGMLParser.parse_declaration(self, i)
        except SGMLParseError:
            # Bogus declaration: hand it to the tree as plain text.
            toHandle = self.rawdata[i:]
            self.handle_data(toHandle)
            j = i + len(toHandle)
    return j
class BeautifulSoup(BeautifulStoneSoup):
1233
"""This parser knows the following facts about HTML:
1235
* Some tags have no closing tag and should be interpreted as being
1236
closed as soon as they are encountered.
1238
* The text inside some tags (ie. 'script') may contain tags which
1239
are not really part of the document and which should be parsed
1240
as text, not tags. If you want to parse the text as tags, you can
1241
always fetch it and parse it explicitly.
1243
* Tag nesting rules:
1245
Most tags can't be nested at all. For instance, the occurance of
1246
a <p> tag should implicitly close the previous <p> tag.
1249
should be transformed into:
1250
<p>Para1</p><p>Para2
1252
Some tags can be nested arbitrarily. For instance, the occurance
1253
of a <blockquote> tag should _not_ implicitly close the previous
1256
Alice said: <blockquote>Bob said: <blockquote>Blah
1257
should NOT be transformed into:
1258
Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah
1260
Some tags can be nested, but the nesting is reset by the
1261
interposition of other tags. For instance, a <tr> tag should
1262
implicitly close the previous <tr> tag within the same <table>,
1263
but not close a <tr> tag in another table.
1265
<table><tr>Blah<tr>Blah
1266
should be transformed into:
1267
<table><tr>Blah</tr><tr>Blah
1269
<tr>Blah<table><tr>Blah
1270
should NOT be transformed into
1271
<tr>Blah<table></tr><tr>Blah
1273
Differing assumptions about tag nesting rules are a major source
1274
of problems with the BeautifulSoup class. If BeautifulSoup is not
1275
treating as nestable a tag your page author treats as nestable,
1276
try ICantBelieveItsBeautifulSoup, MinimalSoup, or
1277
BeautifulStoneSoup before writing your own subclass."""
1279
def __init__(self, *args, **kwargs):
1280
if not kwargs.has_key('smartQuotesTo'):
1281
kwargs['smartQuotesTo'] = self.HTML_ENTITIES
1282
BeautifulStoneSoup.__init__(self, *args, **kwargs)
1284
SELF_CLOSING_TAGS = buildTagMap(None,
1285
['br' , 'hr', 'input', 'img', 'meta',
1286
'spacer', 'link', 'frame', 'base'])
1288
QUOTE_TAGS = {'script': None}
1290
#According to the HTML standard, each of these inline tags can
1291
#contain another tag of the same type. Furthermore, it's common
1292
#to actually use these tags this way.
1293
NESTABLE_INLINE_TAGS = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
1296
#According to the HTML standard, these block tags can contain
1297
#another tag of the same type. Furthermore, it's common
1298
#to actually use these tags this way.
1299
NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del']
1301
#Lists can contain other lists, but there are restrictions.
1302
NESTABLE_LIST_TAGS = { 'ol' : [],
1304
'li' : ['ul', 'ol'],
1309
#Tables can contain other tables, but there are restrictions.
1310
NESTABLE_TABLE_TAGS = {'table' : [],
1311
'tr' : ['table', 'tbody', 'tfoot', 'thead'],
1314
'thead' : ['table'],
1315
'tbody' : ['table'],
1316
'tfoot' : ['table'],
1319
NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre']
1321
#If one of these tags is encountered, all tags up to the next tag of
1322
#this type are popped.
1323
RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
1324
NON_NESTABLE_BLOCK_TAGS,
1326
NESTABLE_TABLE_TAGS)
1328
NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
1329
NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)
1331
# Used to detect the charset in a META tag; see start_meta
1332
CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)")
1334
def start_meta(self, attrs):
1335
"""Beautiful Soup can detect a charset included in a META tag,
1336
try to convert the document to that charset, and re-parse the
1337
document from the beginning."""
1340
contentTypeIndex = None
1341
tagNeedsEncodingSubstitution = False
1343
for i in range(0, len(attrs)):
1344
key, value = attrs[i]
1346
if key == 'http-equiv':
1348
elif key == 'content':
1350
contentTypeIndex = i
1352
if httpEquiv and contentType: # It's an interesting meta tag.
1353
match = self.CHARSET_RE.search(contentType)
1355
if getattr(self, 'declaredHTMLEncoding') or \
1356
(self.originalEncoding == self.fromEncoding):
1357
# This is our second pass through the document, or
1358
# else an encoding was specified explicitly and it
1359
# worked. Rewrite the meta tag.
1360
newAttr = self.CHARSET_RE.sub\
1361
(lambda(match):match.group(1) +
1362
"%SOUP-ENCODING%", value)
1363
attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
1365
tagNeedsEncodingSubstitution = True
1367
# This is our first pass through the document.
1368
# Go through it again with the new information.
1369
newCharset = match.group(3)
1370
if newCharset and newCharset != self.originalEncoding:
1371
self.declaredHTMLEncoding = newCharset
1372
self._feed(self.declaredHTMLEncoding)
1374
tag = self.unknown_starttag("meta", attrs)
1375
if tag and tagNeedsEncodingSubstitution:
1376
tag.containsSubstitutions = True
1378
class StopParsing(Exception):
    """Raised by start_meta to abort the first parsing pass once a new
    document encoding has been discovered."""
    pass
class ICantBelieveItsBeautifulSoup(BeautifulSoup):

    """The BeautifulSoup class is oriented towards skipping over
    common HTML errors like unclosed tags. However, sometimes it makes
    errors of its own. For instance, consider this fragment:

     <b>Foo<b>Bar</b></b>

    This is perfectly valid (if bizarre) HTML. However, the
    BeautifulSoup class will implicitly close the first b tag when it
    encounters the second 'b'. It will think the author wrote
    "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
    there's no real-world reason to bold something that's already
    bold. When it encounters '</b></b>' it will close two more 'b'
    tags, for a grand total of three tags closed instead of two. This
    can throw off the rest of your document structure. The same is
    true of a number of other tags, listed below.

    It's much more common for someone to forget to close a 'b' tag
    than to actually use nested 'b' tags, and the BeautifulSoup class
    handles the common case. This class handles the not-co-common
    case: where you can't believe someone wrote what they did, but
    it's valid HTML and BeautifulSoup screwed up by assuming it
    wouldn't be."""

    # NOTE(review): list tails lost to listing corruption restored from
    # the released BeautifulSoup 3.x source.
    I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
     ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
      'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
      'big']

    I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript']

    NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
class MinimalSoup(BeautifulSoup):
    """The MinimalSoup class is for parsing HTML that contains
    pathologically bad markup. It makes no assumptions about tag
    nesting, but it does know which tags are self-closing, that
    <script> tags contain Javascript and should not be parsed, that
    META tags may contain encoding information, and so on.

    This also makes it better for subclassing than BeautifulStoneSoup
    or BeautifulSoup."""

    RESET_NESTING_TAGS = buildTagMap('noscript')
    # No tag is considered nestable: every repeated tag closes the
    # previous one.
    NESTABLE_TAGS = buildTagMap([])
class BeautifulSOAP(BeautifulStoneSoup):
    """This class will push a tag with only a single string child into
    the tag's parent as an attribute. The attribute's name is the tag
    name, and the value is the string child. An example should give
    the flavor of the change:

    <foo><bar>baz</bar></foo>
     =>
    <foo bar="baz"><bar>baz</bar></foo>

    You can then access fooTag['bar'] instead of fooTag.barTag.string.

    This is, of course, useful for scraping structures that tend to
    use subelements instead of attributes, such as SOAP messages. Note
    that it modifies its input, so don't print the modified version
    out.

    I'm not sure how many people really want to use this class; let me
    know if you do. Mainly I like the name."""

    def popTag(self):
        # Before the standard pop, mirror a single-string child tag into
        # its parent's attribute map (unless the attribute already exists).
        if len(self.tagStack) > 1:
            tag = self.tagStack[-1]
            parent = self.tagStack[-2]
            parent._getAttrMap()
            if (isinstance(tag, Tag) and len(tag.contents) == 1 and
                isinstance(tag.contents[0], NavigableString) and
                not parent.attrMap.has_key(tag.name)):
                parent[tag.name] = tag.contents[0]
        BeautifulStoneSoup.popTag(self)
#Enterprise class names! It has come to our attention that some people
#think the names of the Beautiful Soup parser classes are too silly
#and "unprofessional" for use in enterprise screen-scraping. We feel
#your pain! For such-minded folk, the Beautiful Soup Consortium And
#All-Night Kosher Bakery recommends renaming this file to
#"RobustParser.py" (or, in cases of extreme enterprisness,
#"RobustParserBeanInterface.class") and using the following
#enterprise-friendly class aliases:
# Enterprise-friendly aliases for the parser classes above. Each alias
# inherits everything and adds nothing.
class RobustXMLParser(BeautifulStoneSoup):
    pass
class RobustHTMLParser(BeautifulSoup):
    pass
class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
    pass
class RobustInsanelyWackAssHTMLParser(MinimalSoup):
    pass
class SimplifyingSOAPParser(BeautifulSOAP):
    pass
######################################################
#
# Bonus library: Unicode, Dammit
#
# This class forces XML data into a standard format (usually to UTF-8
# or Unicode). It is heavily based on code from Mark Pilgrim's
# Universal Feed Parser. It does not rewrite the XML or HTML to
# reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi
# (XML) and BeautifulSoup.start_meta (HTML).
# Autodetects character encodings.
# Download from http://chardet.feedparser.org/
# NOTE(review): the try/except scaffolding here was lost to listing
# corruption; restored from the released BeautifulSoup 3.x source.
try:
    import chardet
#    import chardet.constants
#    chardet.constants._debug = 1
except ImportError:
    chardet = None

# cjkcodecs and iconv_codec make Python know about more character encodings.
# Both are available from http://cjkpython.i18n.org/
# They're built in if you use Python 2.4.
try:
    import cjkcodecs.aliases
    import iconv_codec
except ImportError:
    pass
class UnicodeDammit:
1513
"""A class for detecting the encoding of a *ML document and
1514
converting it to a Unicode string. If the source encoding is
1515
windows-1252, can replace MS smart quotes with their HTML or XML
1518
# This dictionary maps commonly seen values for "charset" in HTML
1519
# meta tags to the corresponding Python codec names. It only covers
1520
# values that aren't in Python's aliases and can't be determined
1521
# by the heuristics in find_codec.
1522
CHARSET_ALIASES = { "macintosh" : "mac-roman",
1523
"x-sjis" : "shift-jis" }
1525
def __init__(self, markup, overrideEncodings=[],
1526
smartQuotesTo='xml'):
1527
self.markup, documentEncoding, sniffedEncoding = \
1528
self._detectEncoding(markup)
1529
self.smartQuotesTo = smartQuotesTo
1530
self.triedEncodings = []
1531
if markup == '' or isinstance(markup, unicode):
1532
self.originalEncoding = None
1533
self.unicode = unicode(markup)
1537
for proposedEncoding in overrideEncodings:
1538
u = self._convertFrom(proposedEncoding)
1541
for proposedEncoding in (documentEncoding, sniffedEncoding):
1542
u = self._convertFrom(proposedEncoding)
1545
# If no luck and we have auto-detection library, try that:
1546
if not u and chardet and not isinstance(self.markup, unicode):
1547
u = self._convertFrom(chardet.detect(self.markup)['encoding'])
1549
# As a last resort, try utf-8 and windows-1252:
1551
for proposed_encoding in ("utf-8", "windows-1252"):
1552
u = self._convertFrom(proposed_encoding)
1555
if not u: self.originalEncoding = None
1557
def _subMSChar(self, orig):
1558
"""Changes a MS smart quote character to an XML or HTML
1560
sub = self.MS_CHARS.get(orig)
1561
if type(sub) == types.TupleType:
1562
if self.smartQuotesTo == 'xml':
1563
sub = '&#x%s;' % sub[1]
1565
sub = '&%s;' % sub[0]
1568
def _convertFrom(self, proposed):
1569
proposed = self.find_codec(proposed)
1570
if not proposed or proposed in self.triedEncodings:
1572
self.triedEncodings.append(proposed)
1573
markup = self.markup
1575
# Convert smart quotes to HTML if coming from an encoding
1576
# that might have them.
1577
if self.smartQuotesTo and proposed.lower() in("windows-1252",
1580
markup = re.compile("([\x80-\x9f])").sub \
1581
(lambda(x): self._subMSChar(x.group(1)),
1585
# print "Trying to convert document to %s" % proposed
1586
u = self._toUnicode(markup, proposed)
1588
self.originalEncoding = proposed
1589
except Exception, e:
1590
# print "That didn't work!"
1593
#print "Correct encoding: %s" % proposed
1596
def _toUnicode(self, data, encoding):
1597
'''Given a string and its encoding, decodes the string into Unicode.
1598
%encoding is a string recognized by encodings.aliases'''
1600
# strip Byte Order Mark (if present)
1601
if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
1602
and (data[2:4] != '\x00\x00'):
1603
encoding = 'utf-16be'
1605
elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
1606
and (data[2:4] != '\x00\x00'):
1607
encoding = 'utf-16le'
1609
elif data[:3] == '\xef\xbb\xbf':
1612
elif data[:4] == '\x00\x00\xfe\xff':
1613
encoding = 'utf-32be'
1615
elif data[:4] == '\xff\xfe\x00\x00':
1616
encoding = 'utf-32le'
1618
newdata = unicode(data, encoding)
1621
def _detectEncoding(self, xml_data):
1622
"""Given a document, tries to detect its XML encoding."""
1623
xml_encoding = sniffed_xml_encoding = None
1625
if xml_data[:4] == '\x4c\x6f\xa7\x94':
1627
xml_data = self._ebcdic_to_ascii(xml_data)
1628
elif xml_data[:4] == '\x00\x3c\x00\x3f':
1630
sniffed_xml_encoding = 'utf-16be'
1631
xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
1632
elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
1633
and (xml_data[2:4] != '\x00\x00'):
1635
sniffed_xml_encoding = 'utf-16be'
1636
xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
1637
elif xml_data[:4] == '\x3c\x00\x3f\x00':
1639
sniffed_xml_encoding = 'utf-16le'
1640
xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
1641
elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
1642
(xml_data[2:4] != '\x00\x00'):
1644
sniffed_xml_encoding = 'utf-16le'
1645
xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
1646
elif xml_data[:4] == '\x00\x00\x00\x3c':
1648
sniffed_xml_encoding = 'utf-32be'
1649
xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
1650
elif xml_data[:4] == '\x3c\x00\x00\x00':
1652
sniffed_xml_encoding = 'utf-32le'
1653
xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
1654
elif xml_data[:4] == '\x00\x00\xfe\xff':
1656
sniffed_xml_encoding = 'utf-32be'
1657
xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
1658
elif xml_data[:4] == '\xff\xfe\x00\x00':
1660
sniffed_xml_encoding = 'utf-32le'
1661
xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
1662
elif xml_data[:3] == '\xef\xbb\xbf':
1664
sniffed_xml_encoding = 'utf-8'
1665
xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
1667
sniffed_xml_encoding = 'ascii'
1669
xml_encoding_match = re.compile \
1670
('^<\?.*encoding=[\'"](.*?)[\'"].*\?>')\
1673
xml_encoding_match = None
1674
if xml_encoding_match:
1675
xml_encoding = xml_encoding_match.groups()[0].lower()
1676
if sniffed_xml_encoding and \
1677
(xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
1678
'iso-10646-ucs-4', 'ucs-4', 'csucs4',
1679
'utf-16', 'utf-32', 'utf_16', 'utf_32',
1681
xml_encoding = sniffed_xml_encoding
1682
return xml_data, xml_encoding, sniffed_xml_encoding
1685
def find_codec(self, charset):
1686
return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
1687
or (charset and self._codec(charset.replace("-", ""))) \
1688
or (charset and self._codec(charset.replace("-", "_"))) \
1691
def _codec(self, charset):
1692
if not charset: return charset
1695
codecs.lookup(charset)
1701
EBCDIC_TO_ASCII_MAP = None
1702
def _ebcdic_to_ascii(self, s):
1704
if not c.EBCDIC_TO_ASCII_MAP:
1705
emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
1706
16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
1707
128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
1708
144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
1709
32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
1710
38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
1711
45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
1712
186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
1713
195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
1714
201,202,106,107,108,109,110,111,112,113,114,203,204,205,
1715
206,207,208,209,126,115,116,117,118,119,120,121,122,210,
1716
211,212,213,214,215,216,217,218,219,220,221,222,223,224,
1717
225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
1718
73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
1719
82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
1720
90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
1721
250,251,252,253,254,255)
1723
c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
1724
''.join(map(chr, range(256))), ''.join(map(chr, emap)))
1725
return s.translate(c.EBCDIC_TO_ASCII_MAP)
1727
MS_CHARS = { '\x80' : ('euro', '20AC'),
1729
'\x82' : ('sbquo', '201A'),
1730
'\x83' : ('fnof', '192'),
1731
'\x84' : ('bdquo', '201E'),
1732
'\x85' : ('hellip', '2026'),
1733
'\x86' : ('dagger', '2020'),
1734
'\x87' : ('Dagger', '2021'),
1735
'\x88' : ('circ', '2C6'),
1736
'\x89' : ('permil', '2030'),
1737
'\x8A' : ('Scaron', '160'),
1738
'\x8B' : ('lsaquo', '2039'),
1739
'\x8C' : ('OElig', '152'),
1741
'\x8E' : ('#x17D', '17D'),
1744
'\x91' : ('lsquo', '2018'),
1745
'\x92' : ('rsquo', '2019'),
1746
'\x93' : ('ldquo', '201C'),
1747
'\x94' : ('rdquo', '201D'),
1748
'\x95' : ('bull', '2022'),
1749
'\x96' : ('ndash', '2013'),
1750
'\x97' : ('mdash', '2014'),
1751
'\x98' : ('tilde', '2DC'),
1752
'\x99' : ('trade', '2122'),
1753
'\x9a' : ('scaron', '161'),
1754
'\x9b' : ('rsaquo', '203A'),
1755
'\x9c' : ('oelig', '153'),
1757
'\x9e' : ('#x17E', '17E'),
1758
'\x9f' : ('Yuml', ''),}
1760
#######################################################################
1763
#By default, act as an HTML pretty-printer.
1764
if __name__ == '__main__':
1766
soup = BeautifulSoup(sys.stdin.read())
1767
print soup.prettify()