3
"The Screen-Scraper's Friend"
4
http://www.crummy.com/software/BeautifulSoup/
6
Beautiful Soup parses a (possibly invalid) XML or HTML document into a
7
tree representation. It provides methods and Pythonic idioms that make
8
it easy to navigate, search, and modify the tree.
10
A well-formed XML/HTML document yields a well-formed data
11
structure. An ill-formed XML/HTML document yields a correspondingly
12
ill-formed data structure. If your document is only locally
13
well-formed, you can use this library to find and process the
14
well-formed part of it.
16
Beautiful Soup works with Python 2.2 and up. It has no external
17
dependencies, but you'll have more success at converting data to UTF-8
18
if you also install these three packages:
20
* chardet, for auto-detecting character encodings
21
http://chardet.feedparser.org/
22
* cjkcodecs and iconv_codec, which add more encodings to the ones supported
24
http://cjkpython.i18n.org/
26
Beautiful Soup defines classes for two main parsing strategies:
28
* BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
29
language that kind of looks like XML.
31
* BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
32
or invalid. This class has web browser-like heuristics for
33
obtaining a sensible parse tree in the face of common HTML errors.
35
Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
36
the encoding of an HTML or XML document, and converting it to
37
Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
39
For more than you ever wanted to know about Beautiful Soup, see the
41
http://www.crummy.com/software/BeautifulSoup/documentation.html
43
Here, have some legalese:
45
Copyright (c) 2004-2007, Leonard Richardson
49
Redistribution and use in source and binary forms, with or without
50
modification, are permitted provided that the following conditions are
53
* Redistributions of source code must retain the above copyright
54
notice, this list of conditions and the following disclaimer.
56
* Redistributions in binary form must reproduce the above
57
copyright notice, this list of conditions and the following
58
disclaimer in the documentation and/or other materials provided
59
with the distribution.
61
* Neither the name of the Beautiful Soup Consortium and All
62
Night Kosher Bakery nor the names of its contributors may be
63
used to endorse or promote products derived from this software
64
without specific prior written permission.
66
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
67
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
68
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
69
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
70
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
71
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
72
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
73
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
74
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
75
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
76
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
79
from __future__ import generators
81
__author__ = "Leonard Richardson (leonardr@segfault.org)"
83
__copyright__ = "Copyright (c) 2004-2007 Leonard Richardson"
84
__license__ = "New-style BSD"
86
from sgmllib import SGMLParser, SGMLParseError
92
from htmlentitydefs import name2codepoint
96
#This hack makes Beautiful Soup able to parse XML with namespaces
97
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
99
DEFAULT_OUTPUT_ENCODING = "utf-8"
101
# First, the classes that represent markup elements.
104
"""Contains the navigational information for some part of the page
105
(either a tag or a piece of text)"""
107
def setup(self, parent=None, previous=None):
108
"""Sets up the initial relations between this element and
111
self.previous = previous
113
self.previousSibling = None
114
self.nextSibling = None
115
if self.parent and self.parent.contents:
116
self.previousSibling = self.parent.contents[-1]
117
self.previousSibling.nextSibling = self
119
def replaceWith(self, replaceWith):
120
oldParent = self.parent
121
myIndex = self.parent.contents.index(self)
122
if hasattr(replaceWith, 'parent') and replaceWith.parent == self.parent:
123
# We're replacing this element with one of its siblings.
124
index = self.parent.contents.index(replaceWith)
125
if index and index < myIndex:
126
# Furthermore, it comes before this element. That
127
# means that when we extract it, the index of this
128
# element will change.
129
myIndex = myIndex - 1
131
oldParent.insert(myIndex, replaceWith)
134
"""Destructively rips this element out of the tree."""
137
self.parent.contents.remove(self)
141
#Find the two elements that would be next to each other if
142
#this element (and any children) hadn't been parsed. Connect
144
lastChild = self._lastRecursiveChild()
145
nextElement = lastChild.next
148
self.previous.next = nextElement
150
nextElement.previous = self.previous
152
lastChild.next = None
155
if self.previousSibling:
156
self.previousSibling.nextSibling = self.nextSibling
158
self.nextSibling.previousSibling = self.previousSibling
159
self.previousSibling = self.nextSibling = None
161
def _lastRecursiveChild(self):
162
"Finds the last element beneath this object to be parsed."
164
while hasattr(lastChild, 'contents') and lastChild.contents:
165
lastChild = lastChild.contents[-1]
168
def insert(self, position, newChild):
169
if (isinstance(newChild, basestring)
170
or isinstance(newChild, unicode)) \
171
and not isinstance(newChild, NavigableString):
172
newChild = NavigableString(newChild)
174
position = min(position, len(self.contents))
175
if hasattr(newChild, 'parent') and newChild.parent != None:
176
# We're 'inserting' an element that's already one
177
# of this object's children.
178
if newChild.parent == self:
179
index = self.find(newChild)
180
if index and index < position:
181
# Furthermore we're moving it further down the
182
# list of this object's children. That means that
183
# when we extract this element, our target index
184
# will jump down one.
185
position = position - 1
188
newChild.parent = self
191
newChild.previousSibling = None
192
newChild.previous = self
194
previousChild = self.contents[position-1]
195
newChild.previousSibling = previousChild
196
newChild.previousSibling.nextSibling = newChild
197
newChild.previous = previousChild._lastRecursiveChild()
198
if newChild.previous:
199
newChild.previous.next = newChild
201
newChildsLastElement = newChild._lastRecursiveChild()
203
if position >= len(self.contents):
204
newChild.nextSibling = None
207
parentsNextSibling = None
208
while not parentsNextSibling:
209
parentsNextSibling = parent.nextSibling
210
parent = parent.parent
211
if not parent: # This is the last element in the document.
213
if parentsNextSibling:
214
newChildsLastElement.next = parentsNextSibling
216
newChildsLastElement.next = None
218
nextChild = self.contents[position]
219
newChild.nextSibling = nextChild
220
if newChild.nextSibling:
221
newChild.nextSibling.previousSibling = newChild
222
newChildsLastElement.next = nextChild
224
if newChildsLastElement.next:
225
newChildsLastElement.next.previous = newChildsLastElement
226
self.contents.insert(position, newChild)
228
def append(self, tag):
    """Adds the given tag as the last child of this tag."""
    position = len(self.contents)
    self.insert(position, tag)
232
def findNext(self, name=None, attrs={}, text=None, **kwargs):
    """Return the first element after this Tag in the document
    that matches the given criteria."""
    match = self._findOne(self.findAllNext, name, attrs, text, **kwargs)
    return match
237
def findAllNext(self, name=None, attrs={}, text=None, limit=None,
239
"""Returns all items that match the given criteria and appear
240
before after Tag in the document."""
241
return self._findAll(name, attrs, text, limit, self.nextGenerator)
243
def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
244
"""Returns the closest sibling to this Tag that matches the
245
given criteria and appears after this Tag in the document."""
246
return self._findOne(self.findNextSiblings, name, attrs, text,
249
def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
251
"""Returns the siblings of this Tag that match the given
252
criteria and appear after this Tag in the document."""
253
return self._findAll(name, attrs, text, limit,
254
self.nextSiblingGenerator, **kwargs)
255
fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x
257
def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
    """Return the closest preceding element in the document that
    matches the given criteria."""
    match = self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)
    return match
262
def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
264
"""Returns all items that match the given criteria and appear
265
before this Tag in the document."""
266
return self._findAll(name, attrs, text, limit, self.previousGenerator,
268
fetchPrevious = findAllPrevious # Compatibility with pre-3.x
270
def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
271
"""Returns the closest sibling to this Tag that matches the
272
given criteria and appears before this Tag in the document."""
273
return self._findOne(self.findPreviousSiblings, name, attrs, text,
276
def findPreviousSiblings(self, name=None, attrs={}, text=None,
                         limit=None, **kwargs):
    """Return the siblings of this Tag that precede it in the
    document and match the given criteria."""
    return self._findAll(
        name, attrs, text, limit, self.previousSiblingGenerator, **kwargs)
282
fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x
284
def findParent(self, name=None, attrs={}, **kwargs):
285
"""Returns the closest parent of this Tag that matches the given
287
# NOTE: We can't use _findOne because findParents takes a different
290
l = self.findParents(name, attrs, 1)
295
def findParents(self, name=None, attrs={}, limit=None, **kwargs):
296
"""Returns the parents of this Tag that match the given
299
return self._findAll(name, attrs, None, limit, self.parentGenerator,
301
fetchParents = findParents # Compatibility with pre-3.x
303
#These methods do the real heavy lifting.
305
def _findOne(self, method, name, attrs, text, **kwargs):
307
l = method(name, attrs, text, 1, **kwargs)
312
def _findAll(self, name, attrs, text, limit, generator, **kwargs):
313
"Iterates over a generator looking for things that match."
315
if isinstance(name, SoupStrainer):
318
# Build a SoupStrainer
319
strainer = SoupStrainer(name, attrs, text, **kwargs)
320
results = ResultSet(strainer)
325
except StopIteration:
328
found = strainer.search(i)
330
results.append(found)
331
if limit and len(results) >= limit:
335
#These Generators can be used to navigate starting from both
336
#NavigableStrings and Tags.
337
def nextGenerator(self):
343
def nextSiblingGenerator(self):
349
def previousGenerator(self):
355
def previousSiblingGenerator(self):
358
i = i.previousSibling
361
def parentGenerator(self):
368
def substituteEncoding(self, str, encoding=None):
    """Replace the %SOUP-ENCODING% placeholder in 'str' with the
    given encoding name (defaulting to utf-8)."""
    return str.replace("%SOUP-ENCODING%", encoding or "utf-8")
372
def toEncoding(self, s, encoding=None):
373
"""Encodes an object to a string in some encoding, or to Unicode.
375
if isinstance(s, unicode):
377
s = s.encode(encoding)
378
elif isinstance(s, str):
380
s = s.encode(encoding)
385
s = self.toEncoding(str(s), encoding)
390
class NavigableString(unicode, PageElement):
392
def __getnewargs__(self):
    """Used by copy/pickle: recreate this NavigableString from its
    encoded string value."""
    return (NavigableString.__str__(self),)
395
def __getattr__(self, attr):
396
"""text.string gives you text. This is for backwards
397
compatibility for Navigable*String, but for CData* it lets you
398
get the string without the CData wrapper."""
402
raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
404
def __unicode__(self):
    # Python 2 unicode() protocol: round-trip through the encoded
    # string form produced by __str__.
    return unicode(str(self))
407
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
409
return self.encode(encoding)
413
class CData(NavigableString):
    """A CDATA section found in the document."""
    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        # Re-wrap the string content in CDATA delimiters when rendering.
        return "<![CDATA[%s]]>" % NavigableString.__str__(self, encoding)
418
class ProcessingInstruction(NavigableString):
419
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
421
if "%SOUP-ENCODING%" in output:
422
output = self.substituteEncoding(output, encoding)
423
return "<?%s?>" % self.toEncoding(output, encoding)
425
class Comment(NavigableString):
    """An HTML or XML comment found in the document."""
    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        # Render with the comment delimiters restored.
        return "<!--%s-->" % NavigableString.__str__(self, encoding)
429
class Declaration(NavigableString):
    """A declaration (e.g. a doctype) found in the document."""
    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        # Render with the "<!" and ">" declaration delimiters restored.
        return "<!%s>" % NavigableString.__str__(self, encoding)
433
class Tag(PageElement):
435
"""Represents a found HTML tag with its attributes and contents."""
438
"Cheap function to invert a hash."
440
for k,v in h.items():
444
XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'",
450
XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)
452
def _convertEntities(self, match):
453
"""Used in a call to re.sub to replace HTML, XML, and numeric
454
entities with the appropriate Unicode characters. If HTML
455
entities are being converted, any unrecognized entities are
458
if self.convertHTMLEntities and x in name2codepoint:
459
return unichr(name2codepoint[x])
460
elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS:
461
if self.convertXMLEntities:
462
return self.XML_ENTITIES_TO_SPECIAL_CHARS[x]
465
elif len(x) > 0 and x[0] == '#':
466
# Handle numeric entities
467
if len(x) > 1 and x[1] == 'x':
468
return unichr(int(x[2:], 16))
470
return unichr(int(x[1:]))
472
elif self.escapeUnrecognizedEntities:
473
return u'&%s;' % x
477
def __init__(self, parser, name, attrs=None, parent=None,
481
# We don't actually store the parser object: that lets extracted
482
# chunks be garbage-collected
483
self.parserClass = parser.__class__
484
self.isSelfClosing = parser.isSelfClosingTag(name)
490
self.setup(parent, previous)
492
self.containsSubstitutions = False
493
self.convertHTMLEntities = parser.convertHTMLEntities
494
self.convertXMLEntities = parser.convertXMLEntities
495
self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities
497
# Convert any HTML, XML, or numeric entities in the attribute values.
498
convert = lambda(k, val): (k,
499
re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",
500
self._convertEntities,
502
self.attrs = map(convert, self.attrs)
504
def get(self, key, default=None):
505
"""Returns the value of the 'key' attribute for the tag, or
506
the value given for 'default' if it doesn't have that
508
return self._getAttrMap().get(key, default)
510
def has_key(self, key):
    """Returns true if the tag has an attribute named 'key'."""
    # 'in' instead of dict.has_key(): identical result, but has_key()
    # is deprecated (and removed in Python 3).
    return key in self._getAttrMap()
513
def __getitem__(self, key):
    """tag[key] gives the value of the tag's 'key' attribute, raising
    an exception when no such attribute exists."""
    attr_map = self._getAttrMap()
    return attr_map[key]
519
"Iterating over a tag iterates over its contents."
520
return iter(self.contents)
523
"The length of a tag is the length of its list of contents."
524
return len(self.contents)
526
def __contains__(self, x):
    # 'x in tag' is true iff x is one of the tag's direct children.
    return x in self.contents
529
def __nonzero__(self):
530
"A tag is non-None even if it has no contents."
533
def __setitem__(self, key, value):
534
"""Setting tag[key] sets the value of the 'key' attribute for the
537
self.attrMap[key] = value
539
for i in range(0, len(self.attrs)):
540
if self.attrs[i][0] == key:
541
self.attrs[i] = (key, value)
544
self.attrs.append((key, value))
545
self._getAttrMap()[key] = value
547
def __delitem__(self, key):
548
"Deleting tag[key] deletes all 'key' attributes for the tag."
549
for item in self.attrs:
551
self.attrs.remove(item)
552
#We don't break because bad HTML can define the same
553
#attribute multiple times.
555
if self.attrMap.has_key(key):
556
del self.attrMap[key]
558
def __call__(self, *args, **kwargs):
    """Calling a tag like a function is the same as calling its
    findAll() method. Eg. tag('a') returns a list of all the A tags
    found within this tag."""
    # Direct call instead of the deprecated apply() builtin (removed
    # in Python 3); behavior is identical.
    return self.findAll(*args, **kwargs)
564
def __getattr__(self, tag):
    """soup.fooTag (or soup.foo) is shorthand for soup.find('foo').
    Names starting with '__' are never treated as tag searches, so
    internal protocol lookups still raise AttributeError."""
    #print "Getattr %s.%s" % (self.__class__, tag)
    if len(tag) > 3 and tag.endswith('Tag'):
        # 'xTag' form: strip the suffix and search for that tag name.
        # (endswith() replaces the equivalent rfind() == len-3 test.)
        return self.find(tag[:-3])
    elif not tag.startswith('__'):
        return self.find(tag)
    # Parenthesized raise instead of the Python-2-only
    # "raise X, msg" statement form; valid in both 2 and 3.
    raise AttributeError("'%s' object has no attribute '%s'" % (self.__class__, tag))
572
def __eq__(self, other):
573
"""Returns true iff this tag has the same name, the same attributes,
574
and the same contents (recursively) as the given tag.
576
NOTE: right now this will return false if two tags have the
577
same attributes in a different order. Should this be fixed?"""
578
if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
580
for i in range(0, len(self.contents)):
581
if self.contents[i] != other.contents[i]:
585
def __ne__(self, other):
    """The inverse of __eq__: true when the two tags differ in name,
    attributes, or contents."""
    return not (self == other)
590
def __repr__(self):
    """Renders this tag as a string."""
    # repr() of a tag is simply its rendered markup in the default
    # output encoding.
    return self.__str__(DEFAULT_OUTPUT_ENCODING)
594
def __unicode__(self):
    # Passing None as the encoding makes __str__ return Unicode
    # rather than an encoded byte string.
    return self.__str__(None)
597
BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
598
+ "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
601
def _sub_entity(self, x):
    """Regex-substitution callback: maps an XML special character
    (the first character of the match) to its named XML entity."""
    char = x.group(0)[0]
    return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[char] + ";"
606
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
607
prettyPrint=False, indentLevel=0):
608
"""Returns a string or Unicode representation of this tag and
609
its contents. To get Unicode, pass None for encoding.
611
NOTE: since Python's HTML parser consumes whitespace, this
612
method is not certain to reproduce the whitespace present in
613
the original string."""
615
encodedName = self.toEncoding(self.name, encoding)
619
for key, val in self.attrs:
622
if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
623
val = self.substituteEncoding(val, encoding)
625
# The attribute value either:
627
# * Contains no embedded double quotes or single quotes.
628
# No problem: we enclose it in double quotes.
629
# * Contains embedded single quotes. No problem:
630
# double quotes work here too.
631
# * Contains embedded double quotes. No problem:
632
# we enclose it in single quotes.
633
# * Embeds both single _and_ double quotes. This
634
# can't happen naturally, but it can happen if
635
# you modify an attribute value after parsing
636
# the document. Now we have a bit of a
637
# problem. We solve it by enclosing the
638
# attribute in single quotes, and escaping any
639
# embedded single quotes to XML entities.
643
# TODO: replace with apos when
645
val = val.replace("'", "&squot;")
647
# Now we're okay w/r/t quotes. But the attribute
648
# value might also contain angle brackets, or
649
# ampersands that aren't part of entities. We need
650
# to escape those to XML entities too.
651
val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)
653
attrs.append(fmt % (self.toEncoding(key, encoding),
654
self.toEncoding(val, encoding)))
657
if self.isSelfClosing:
660
closeTag = '</%s>' % encodedName
662
indentTag, indentContents = 0, 0
664
indentTag = indentLevel
665
space = (' ' * (indentTag-1))
666
indentContents = indentTag + 1
667
contents = self.renderContents(encoding, prettyPrint, indentContents)
674
attributeString = ' ' + ' '.join(attrs)
677
s.append('<%s%s%s>' % (encodedName, attributeString, close))
681
if prettyPrint and contents and contents[-1] != "\n":
683
if prettyPrint and closeTag:
686
if prettyPrint and closeTag and self.nextSibling:
691
def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
    """Renders this tag as an indented, pretty-printed string."""
    return self.__str__(encoding, prettyPrint=True)
694
def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
695
prettyPrint=False, indentLevel=0):
696
"""Renders the contents of this tag as a string in the given
697
encoding. If encoding is None, returns a Unicode string.."""
701
if isinstance(c, NavigableString):
702
text = c.__str__(encoding)
703
elif isinstance(c, Tag):
704
s.append(c.__str__(encoding, prettyPrint, indentLevel))
705
if text and prettyPrint:
709
s.append(" " * (indentLevel-1))
717
def find(self, name=None, attrs={}, recursive=True, text=None,
719
"""Return only the first child of this Tag matching the given
722
l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
728
def findAll(self, name=None, attrs={}, recursive=True, text=None,
729
limit=None, **kwargs):
730
"""Extracts a list of Tag objects that match the given
731
criteria. You can specify the name of the Tag and any
732
attributes you want the Tag to have.
734
The value of a key-value pair in the 'attrs' map can be a
735
string, a list of strings, a regular expression object, or a
736
callable that takes a string and returns whether or not the
737
string matches for some custom definition of 'matches'. The
738
same is true of the tag name."""
739
generator = self.recursiveChildGenerator
741
generator = self.childGenerator
742
return self._findAll(name, attrs, text, limit, generator, **kwargs)
743
findChildren = findAll
745
# Pre-3.x compatibility methods
749
def fetchText(self, text=None, recursive=True, limit=None):
    """Pre-3.x compatibility alias for findAll(text=...)."""
    return self.findAll(text=text, recursive=recursive, limit=limit)
752
def firstText(self, text=None, recursive=True):
    """Pre-3.x compatibility alias for find(text=...)."""
    return self.find(text=text, recursive=recursive)
757
def _getAttrMap(self):
758
"""Initializes a map representation of this tag's attributes,
759
if not already initialized."""
760
if not getattr(self, 'attrMap'):
762
for (key, value) in self.attrs:
763
self.attrMap[key] = value
767
def childGenerator(self):
    """Yields this tag's direct children, in document order."""
    # Iterate the list directly instead of indexing over
    # range(len(...)); same elements, clearer and no index bookkeeping.
    for child in self.contents:
        yield child
772
def recursiveChildGenerator(self):
775
tag, start = stack.pop()
776
if isinstance(tag, Tag):
777
for i in range(start, len(tag.contents)):
780
if isinstance(a, Tag) and tag.contents:
781
if i < len(tag.contents) - 1:
782
stack.append((tag, i+1))
787
# Next, a couple classes to represent queries and their results.
789
"""Encapsulates a number of ways of matching a markup element (tag or
792
def __init__(self, name=None, attrs={}, text=None, **kwargs):
795
kwargs['class'] = attrs
810
return "%s|%s" % (self.name, self.attrs)
812
def searchTag(self, markupName=None, markupAttrs={}):
815
if isinstance(markupName, Tag):
818
callFunctionWithTagData = callable(self.name) \
819
and not isinstance(markupName, Tag)
822
or callFunctionWithTagData \
823
or (markup and self._matches(markup, self.name)) \
824
or (not markup and self._matches(markupName, self.name)):
825
if callFunctionWithTagData:
826
match = self.name(markupName, markupAttrs)
830
for attr, matchAgainst in self.attrs.items():
831
if not markupAttrMap:
832
if hasattr(markupAttrs, 'get'):
833
markupAttrMap = markupAttrs
836
for k,v in markupAttrs:
838
attrValue = markupAttrMap.get(attr)
839
if not self._matches(attrValue, matchAgainst):
849
def search(self, markup):
850
#print 'looking for %s in %s' % (self, markup)
852
# If given a list of items, scan it for a text element that
854
if isList(markup) and not isinstance(markup, Tag):
855
for element in markup:
856
if isinstance(element, NavigableString) \
857
and self.search(element):
860
# If it's a Tag, make sure its name or attributes match.
861
# Don't bother with Tags if we're searching for text.
862
elif isinstance(markup, Tag):
864
found = self.searchTag(markup)
865
# If it's text, make sure the text matches.
866
elif isinstance(markup, NavigableString) or \
868
if self._matches(markup, self.text):
871
raise Exception, "I don't know how to match against a %s" \
875
def _matches(self, markup, matchAgainst):
876
#print "Matching %s against %s" % (markup, matchAgainst)
878
if matchAgainst == True and type(matchAgainst) == types.BooleanType:
879
result = markup != None
880
elif callable(matchAgainst):
881
result = matchAgainst(markup)
883
#Custom match methods take the tag as an argument, but all
884
#other ways of matching match the tag name as a string.
885
if isinstance(markup, Tag):
887
if markup and not isString(markup):
888
markup = unicode(markup)
889
#Now we know that chunk is either a string, or None.
890
if hasattr(matchAgainst, 'match'):
891
# It's a regexp object.
892
result = markup and matchAgainst.search(markup)
893
elif isList(matchAgainst):
894
result = markup in matchAgainst
895
elif hasattr(matchAgainst, 'items'):
896
result = markup.has_key(matchAgainst)
897
elif matchAgainst and isString(markup):
898
if isinstance(markup, unicode):
899
matchAgainst = unicode(matchAgainst)
901
matchAgainst = str(matchAgainst)
904
result = matchAgainst == markup
907
class ResultSet(list):
908
"""A ResultSet is just a list that keeps track of the SoupStrainer
910
def __init__(self, source):
914
# Now, some helper functions.
917
"""Convenience method that works with all 2.x versions of Python
918
to determine whether or not something is listlike."""
919
return hasattr(l, '__iter__') \
920
or (type(l) in (types.ListType, types.TupleType))
923
"""Convenience method that works with all 2.x versions of Python
924
to determine whether or not something is stringlike."""
926
return isinstance(s, unicode) or isinstance(s, basestring)
928
return isinstance(s, str)
930
def buildTagMap(default, *args):
931
"""Turns a list of maps, lists, or scalars into a single map.
932
Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
933
NESTING_RESET_TAGS maps out of lists and partial maps."""
936
if hasattr(portion, 'items'):
937
#It's a map. Merge it.
938
for k,v in portion.items():
940
elif isList(portion):
941
#It's a list. Map each item to the default.
945
#It's a scalar. Map it to the default.
946
built[portion] = default
949
# Now, the parser classes.
951
class BeautifulStoneSoup(Tag, SGMLParser):
953
"""This class contains the basic parser and search code. It defines
954
a parser that knows nothing about tag behavior except for the
957
You can't close a tag without closing all the tags it encloses.
958
That is, "<foo><bar></foo>" actually means
959
"<foo><bar></bar></foo>".
961
[Another possible explanation is "<foo><bar /></foo>", but since
962
this class defines no SELF_CLOSING_TAGS, it will never use that
965
This class is useful for parsing XML or made-up markup languages,
966
or when BeautifulSoup makes an assumption counter to what you were
969
SELF_CLOSING_TAGS = {}
971
RESET_NESTING_TAGS = {}
974
MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
975
lambda x: x.group(1) + ' />'),
976
(re.compile('<!\s+([^<>]*)>'),
977
lambda x: '<!' + x.group(1) + '>')
980
ROOT_TAG_NAME = u'[document]'
982
HTML_ENTITIES = "html"
984
XHTML_ENTITIES = "xhtml"
985
# TODO: This only exists for backwards-compatibility
986
ALL_ENTITIES = XHTML_ENTITIES
988
# Used when determining whether a text node is all whitespace and
989
# can be replaced with a single space. A text node that contains
990
# fancy Unicode spaces (usually non-breaking) should be left
992
STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }
994
def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
995
markupMassage=True, smartQuotesTo=XML_ENTITIES,
996
convertEntities=None, selfClosingTags=None):
997
"""The Soup object is initialized as the 'root tag', and the
998
provided markup (which can be a string or a file-like object)
999
is fed into the underlying parser.
1001
sgmllib will process most bad HTML, and the BeautifulSoup
1002
class has some tricks for dealing with some HTML that kills
1003
sgmllib, but Beautiful Soup can nonetheless choke or lose data
1004
if your data uses self-closing tags or declarations
1007
By default, Beautiful Soup uses regexes to sanitize input,
1008
avoiding the vast majority of these problems. If the problems
1009
don't apply to you, pass in False for markupMassage, and
1010
you'll get better performance.
1012
The default parser massage techniques fix the two most common
1013
instances of invalid HTML that choke sgmllib:
1015
<br/> (No space between name of closing tag and tag close)
1016
<! --Comment--> (Extraneous whitespace in declaration)
1018
You can pass in a custom list of (RE object, replace method)
1019
tuples to get Beautiful Soup to scrub your input the way you
1022
self.parseOnlyThese = parseOnlyThese
1023
self.fromEncoding = fromEncoding
1024
self.smartQuotesTo = smartQuotesTo
1025
self.convertEntities = convertEntities
1026
# Set the rules for how we'll deal with the entities we
1028
if self.convertEntities:
1029
# It doesn't make sense to convert encoded characters to
1030
# entities even while you're converting entities to Unicode.
1031
# Just convert it all to Unicode.
1032
self.smartQuotesTo = None
1033
if convertEntities == self.HTML_ENTITIES:
1034
self.convertXMLEntities = False
1035
self.convertHTMLEntities = True
1036
self.escapeUnrecognizedEntities = True
1037
elif convertEntities == self.XHTML_ENTITIES:
1038
self.convertXMLEntities = True
1039
self.convertHTMLEntities = True
1040
self.escapeUnrecognizedEntities = False
1041
elif convertEntities == self.XML_ENTITIES:
1042
self.convertXMLEntities = True
1043
self.convertHTMLEntities = False
1044
self.escapeUnrecognizedEntities = False
1046
self.convertXMLEntities = False
1047
self.convertHTMLEntities = False
1048
self.escapeUnrecognizedEntities = False
1050
self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
1051
SGMLParser.__init__(self)
1053
if hasattr(markup, 'read'): # It's a file-type object.
1054
markup = markup.read()
1055
self.markup = markup
1056
self.markupMassage = markupMassage
1061
self.markup = None # The markup can now be GCed
1063
def convert_charref(self, name):
1064
"""This method fixes a bug in Python's SGMLParser."""
1069
if not 0 <= n <= 127 : # ASCII ends at 127, not 255
1071
return self.convert_codepoint(n)
1073
def _feed(self, inDocumentEncoding=None):
1074
# Convert the document to Unicode.
1075
markup = self.markup
1076
if isinstance(markup, unicode):
1077
if not hasattr(self, 'originalEncoding'):
1078
self.originalEncoding = None
1080
dammit = UnicodeDammit\
1081
(markup, [self.fromEncoding, inDocumentEncoding],
1082
smartQuotesTo=self.smartQuotesTo)
1083
markup = dammit.unicode
1084
self.originalEncoding = dammit.originalEncoding
1086
if self.markupMassage:
1087
if not isList(self.markupMassage):
1088
self.markupMassage = self.MARKUP_MASSAGE
1089
for fix, m in self.markupMassage:
1090
markup = fix.sub(m, markup)
1091
# TODO: We get rid of markupMassage so that the
1092
# soup object can be deepcopied later on. Some
1093
# Python installations can't copy regexes. If anyone
1094
# was relying on the existence of markupMassage, this
1095
# might cause problems.
1096
del(self.markupMassage)
1099
SGMLParser.feed(self, markup)
1100
# Close out any unfinished strings and close all the open tags.
1102
while self.currentTag.name != self.ROOT_TAG_NAME:
1105
def __getattr__(self, methodName):
1106
"""This method routes method call requests to either the SGMLParser
1107
superclass or the Tag superclass, depending on the method name."""
1108
#print "__getattr__ called on %s.%s" % (self.__class__, methodName)
1110
if methodName.find('start_') == 0 or methodName.find('end_') == 0 \
1111
or methodName.find('do_') == 0:
1112
return SGMLParser.__getattr__(self, methodName)
1113
elif methodName.find('__') != 0:
1114
return Tag.__getattr__(self, methodName)
1116
raise AttributeError
1118
def isSelfClosingTag(self, name):
1119
"""Returns true iff the given string is the name of a
1120
self-closing tag according to this parser."""
1121
return self.SELF_CLOSING_TAGS.has_key(name) \
1122
or self.instanceSelfClosingTags.has_key(name)
1125
Tag.__init__(self, self, self.ROOT_TAG_NAME)
1127
SGMLParser.reset(self)
1128
self.currentData = []
1129
self.currentTag = None
1131
self.quoteStack = []
1135
tag = self.tagStack.pop()
1136
# Tags with just one string-owning child get the child as a
1137
# 'string' property, so that soup.tag.string is shorthand for
1138
# soup.tag.contents[0]
1139
if len(self.currentTag.contents) == 1 and \
1140
isinstance(self.currentTag.contents[0], NavigableString):
1141
self.currentTag.string = self.currentTag.contents[0]
1143
#print "Pop", tag.name
1145
self.currentTag = self.tagStack[-1]
1146
return self.currentTag
1148
def pushTag(self, tag):
1149
#print "Push", tag.name
1151
self.currentTag.contents.append(tag)
1152
self.tagStack.append(tag)
1153
self.currentTag = self.tagStack[-1]
1155
def endData(self, containerClass=NavigableString):
1156
if self.currentData:
1157
currentData = ''.join(self.currentData)
1158
if not currentData.translate(self.STRIP_ASCII_SPACES):
1159
if '\n' in currentData:
1163
self.currentData = []
1164
if self.parseOnlyThese and len(self.tagStack) <= 1 and \
1165
(not self.parseOnlyThese.text or \
1166
not self.parseOnlyThese.search(currentData)):
1168
o = containerClass(currentData)
1169
o.setup(self.currentTag, self.previous)
1171
self.previous.next = o
1173
self.currentTag.contents.append(o)
1176
def _popToTag(self, name, inclusivePop=True):
1177
"""Pops the tag stack up to and including the most recent
1178
instance of the given tag. If inclusivePop is false, pops the tag
1179
stack up to but *not* including the most recent instqance of
1181
#print "Popping to %s" % name
1182
if name == self.ROOT_TAG_NAME:
1186
mostRecentTag = None
1187
for i in range(len(self.tagStack)-1, 0, -1):
1188
if name == self.tagStack[i].name:
1189
numPops = len(self.tagStack)-i
1191
if not inclusivePop:
1192
numPops = numPops - 1
1194
for i in range(0, numPops):
1195
mostRecentTag = self.popTag()
1196
return mostRecentTag
1198
def _smartPop(self, name):
1200
"""We need to pop up to the previous tag of this type, unless
1201
one of this tag's nesting reset triggers comes between this
1202
tag and the previous tag of this type, OR unless this tag is a
1203
generic nesting trigger and another generic nesting trigger
1204
comes between this tag and the previous tag of this type.
1207
<p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
1208
<p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.
1209
<p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.
1211
<li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
1212
<tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
1213
<td><tr><td> *<td>* should pop to 'tr', not the first 'td'
1216
nestingResetTriggers = self.NESTABLE_TAGS.get(name)
1217
isNestable = nestingResetTriggers != None
1218
isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
1221
for i in range(len(self.tagStack)-1, 0, -1):
1222
p = self.tagStack[i]
1223
if (not p or p.name == name) and not isNestable:
1224
#Non-nestable tags get popped to the top or to their
1228
if (nestingResetTriggers != None
1229
and p.name in nestingResetTriggers) \
1230
or (nestingResetTriggers == None and isResetNesting
1231
and self.RESET_NESTING_TAGS.has_key(p.name)):
1233
#If we encounter one of the nesting reset triggers
1234
#peculiar to this tag, or we encounter another tag
1235
#that causes nesting to reset, pop up to but not
1236
#including that tag.
1242
self._popToTag(popTo, inclusive)
1244
def unknown_starttag(self, name, attrs, selfClosing=0):
1245
#print "Start tag %s: %s" % (name, attrs)
1247
#This is not a real tag.
1248
#print "<%s> is not real!" % name
1249
attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs))
1250
self.handle_data('<%s%s>' % (name, attrs))
1254
if not self.isSelfClosingTag(name) and not selfClosing:
1255
self._smartPop(name)
1257
if self.parseOnlyThese and len(self.tagStack) <= 1 \
1258
and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
1261
tag = Tag(self, name, attrs, self.currentTag, self.previous)
1263
self.previous.next = tag
1266
if selfClosing or self.isSelfClosingTag(name):
1268
if name in self.QUOTE_TAGS:
1269
#print "Beginning quote (%s)" % name
1270
self.quoteStack.append(name)
1274
def unknown_endtag(self, name):
1275
#print "End tag %s" % name
1276
if self.quoteStack and self.quoteStack[-1] != name:
1277
#This is not a real end tag.
1278
#print "</%s> is not real!" % name
1279
self.handle_data('</%s>' % name)
1282
self._popToTag(name)
1283
if self.quoteStack and self.quoteStack[-1] == name:
1284
self.quoteStack.pop()
1285
self.literal = (len(self.quoteStack) > 0)
1287
def handle_data(self, data):
1288
self.currentData.append(data)
1290
def _toStringSubclass(self, text, subclass):
1291
"""Adds a certain piece of text to the tree as a NavigableString
1294
self.handle_data(text)
1295
self.endData(subclass)
1297
def handle_pi(self, text):
1298
"""Handle a processing instruction as a ProcessingInstruction
1299
object, possibly one with a %SOUP-ENCODING% slot into which an
1300
encoding will be plugged later."""
1301
if text[:3] == "xml":
1302
text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
1303
self._toStringSubclass(text, ProcessingInstruction)
1305
def handle_comment(self, text):
1306
"Handle comments as Comment objects."
1307
self._toStringSubclass(text, Comment)
1309
def handle_charref(self, ref):
1310
"Handle character references as data."
1311
if self.convertEntities:
1312
data = unichr(int(ref))
1314
data = '&#%s;' % ref
1315
self.handle_data(data)
1317
def handle_entityref(self, ref):
1318
"""Handle entity references as data, possibly converting known
1319
HTML and/or XML entity references to the corresponding Unicode
1322
if self.convertHTMLEntities:
1324
data = unichr(name2codepoint[ref])
1328
if not data and self.convertXMLEntities:
1329
data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)
1331
if not data and self.convertHTMLEntities and \
1332
not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
1333
# TODO: We've got a problem here. We're told this is
1334
# an entity reference, but it's not an XML entity
1335
# reference or an HTML entity reference. Nonetheless,
1336
# the logical thing to do is to pass it through as an
1337
# unrecognized entity reference.
1339
# Except: when the input is "&carol;" this function
1340
# will be called with input "carol". When the input is
1341
# "AT&T", this function will be called with input
1342
# "T". We have no way of knowing whether a semicolon
1343
# was present originally, so we don't know whether
1344
# this is an unknown entity or just a misplaced
1347
# The more common case is a misplaced ampersand, so I
1348
# escape the ampersand and omit the trailing semicolon.
1349
data = "&%s" % ref
1351
# This case is different from the one above, because we
1352
# haven't already gone through a supposedly comprehensive
1353
# mapping of entities to Unicode characters. We might not
1354
# have gone through any mapping at all. So the chances are
1355
# very high that this is a real entity, and not a
1356
# misplaced ampersand.
1358
self.handle_data(data)
1360
def handle_decl(self, data):
1361
"Handle DOCTYPEs and the like as Declaration objects."
1362
self._toStringSubclass(data, Declaration)
1364
def parse_declaration(self, i):
1365
"""Treat a bogus SGML declaration as raw data. Treat a CDATA
1366
declaration as a CData object."""
1368
if self.rawdata[i:i+9] == '<![CDATA[':
1369
k = self.rawdata.find(']]>', i)
1371
k = len(self.rawdata)
1372
data = self.rawdata[i+9:k]
1374
self._toStringSubclass(data, CData)
1377
j = SGMLParser.parse_declaration(self, i)
1378
except SGMLParseError:
1379
toHandle = self.rawdata[i:]
1380
self.handle_data(toHandle)
1381
j = i + len(toHandle)
1384
class BeautifulSoup(BeautifulStoneSoup):
1386
"""This parser knows the following facts about HTML:
1388
* Some tags have no closing tag and should be interpreted as being
1389
closed as soon as they are encountered.
1391
* The text inside some tags (ie. 'script') may contain tags which
1392
are not really part of the document and which should be parsed
1393
as text, not tags. If you want to parse the text as tags, you can
1394
always fetch it and parse it explicitly.
1396
* Tag nesting rules:
1398
Most tags can't be nested at all. For instance, the occurance of
1399
a <p> tag should implicitly close the previous <p> tag.
1402
should be transformed into:
1403
<p>Para1</p><p>Para2
1405
Some tags can be nested arbitrarily. For instance, the occurance
1406
of a <blockquote> tag should _not_ implicitly close the previous
1409
Alice said: <blockquote>Bob said: <blockquote>Blah
1410
should NOT be transformed into:
1411
Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah
1413
Some tags can be nested, but the nesting is reset by the
1414
interposition of other tags. For instance, a <tr> tag should
1415
implicitly close the previous <tr> tag within the same <table>,
1416
but not close a <tr> tag in another table.
1418
<table><tr>Blah<tr>Blah
1419
should be transformed into:
1420
<table><tr>Blah</tr><tr>Blah
1422
<tr>Blah<table><tr>Blah
1423
should NOT be transformed into
1424
<tr>Blah<table></tr><tr>Blah
1426
Differing assumptions about tag nesting rules are a major source
1427
of problems with the BeautifulSoup class. If BeautifulSoup is not
1428
treating as nestable a tag your page author treats as nestable,
1429
try ICantBelieveItsBeautifulSoup, MinimalSoup, or
1430
BeautifulStoneSoup before writing your own subclass."""
1432
def __init__(self, *args, **kwargs):
1433
if not kwargs.has_key('smartQuotesTo'):
1434
kwargs['smartQuotesTo'] = self.HTML_ENTITIES
1435
BeautifulStoneSoup.__init__(self, *args, **kwargs)
1437
SELF_CLOSING_TAGS = buildTagMap(None,
1438
['br' , 'hr', 'input', 'img', 'meta',
1439
'spacer', 'link', 'frame', 'base'])
1441
QUOTE_TAGS = {'script' : None, 'textarea' : None}
1443
#According to the HTML standard, each of these inline tags can
1444
#contain another tag of the same type. Furthermore, it's common
1445
#to actually use these tags this way.
1446
NESTABLE_INLINE_TAGS = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
1449
#According to the HTML standard, these block tags can contain
1450
#another tag of the same type. Furthermore, it's common
1451
#to actually use these tags this way.
1452
NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del']
1454
#Lists can contain other lists, but there are restrictions.
1455
NESTABLE_LIST_TAGS = { 'ol' : [],
1457
'li' : ['ul', 'ol'],
1462
#Tables can contain other tables, but there are restrictions.
1463
NESTABLE_TABLE_TAGS = {'table' : [],
1464
'tr' : ['table', 'tbody', 'tfoot', 'thead'],
1467
'thead' : ['table'],
1468
'tbody' : ['table'],
1469
'tfoot' : ['table'],
1472
NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre']
1474
#If one of these tags is encountered, all tags up to the next tag of
1475
#this type are popped.
1476
RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
1477
NON_NESTABLE_BLOCK_TAGS,
1479
NESTABLE_TABLE_TAGS)
1481
NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
1482
NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)
1484
# Used to detect the charset in a META tag; see start_meta
1485
CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)")
1487
def start_meta(self, attrs):
1488
"""Beautiful Soup can detect a charset included in a META tag,
1489
try to convert the document to that charset, and re-parse the
1490
document from the beginning."""
1493
contentTypeIndex = None
1494
tagNeedsEncodingSubstitution = False
1496
for i in range(0, len(attrs)):
1497
key, value = attrs[i]
1499
if key == 'http-equiv':
1501
elif key == 'content':
1503
contentTypeIndex = i
1505
if httpEquiv and contentType: # It's an interesting meta tag.
1506
match = self.CHARSET_RE.search(contentType)
1508
if getattr(self, 'declaredHTMLEncoding') or \
1509
(self.originalEncoding == self.fromEncoding):
1510
# This is our second pass through the document, or
1511
# else an encoding was specified explicitly and it
1512
# worked. Rewrite the meta tag.
1513
newAttr = self.CHARSET_RE.sub\
1514
(lambda(match):match.group(1) +
1515
"%SOUP-ENCODING%", value)
1516
attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
1518
tagNeedsEncodingSubstitution = True
1520
# This is our first pass through the document.
1521
# Go through it again with the new information.
1522
newCharset = match.group(3)
1523
if newCharset and newCharset != self.originalEncoding:
1524
self.declaredHTMLEncoding = newCharset
1525
self._feed(self.declaredHTMLEncoding)
1527
tag = self.unknown_starttag("meta", attrs)
1528
if tag and tagNeedsEncodingSubstitution:
1529
tag.containsSubstitutions = True
1531
class StopParsing(Exception):
    """Raised internally to abort the current parse (e.g. when a META
    tag declares a new encoding and the document must be re-parsed)."""
    pass
class ICantBelieveItsBeautifulSoup(BeautifulSoup):

    """The BeautifulSoup class is oriented towards skipping over
    common HTML errors like unclosed tags. However, sometimes it makes
    errors of its own. For instance, consider this fragment:

     <b>Foo<b>Bar</b></b>

    This is perfectly valid (if bizarre) HTML. However, the
    BeautifulSoup class will implicitly close the first b tag when it
    encounters the second 'b'. It will think the author wrote
    "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
    there's no real-world reason to bold something that's already
    bold. When it encounters '</b></b>' it will close two more 'b'
    tags, for a grand total of three tags closed instead of two. This
    can throw off the rest of your document structure. The same is
    true of a number of other tags, listed below.

    It's much more common for someone to forget to close a 'b' tag
    than to actually use nested 'b' tags, and the BeautifulSoup class
    handles the common case. This class handles the not-so-common
    case: where you can't believe someone wrote what they did, but
    it's valid HTML and BeautifulSoup screwed up by assuming it
    wouldn't be."""

    I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
     ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
      'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
      'big']

    I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript']

    NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
class MinimalSoup(BeautifulSoup):
    """The MinimalSoup class is for parsing HTML that contains
    pathologically bad markup. It makes no assumptions about tag
    nesting, but it does know which tags are self-closing, that
    <script> tags contain Javascript and should not be parsed, that
    META tags may contain encoding information, and so on.

    This also makes it better for subclassing than BeautifulStoneSoup
    or BeautifulSoup."""

    RESET_NESTING_TAGS = buildTagMap('noscript')
    # NOTE(review): the empty NESTABLE_TAGS override was dropped from
    # this copy; restored -- verify against upstream.
    NESTABLE_TAGS = buildTagMap([])
class BeautifulSOAP(BeautifulStoneSoup):
    """This class will push a tag with only a single string child into
    the tag's parent as an attribute. The attribute's name is the tag
    name, and the value is the string child. An example should give
    the flavor of the change:

    <foo><bar>baz</bar></foo>
     =>
    <foo bar="baz"><bar>baz</bar></foo>

    You can then access fooTag['bar'] instead of fooTag.barTag.string.

    This is, of course, useful for scraping structures that tend to
    use subelements instead of attributes, such as SOAP messages. Note
    that it modifies its input, so don't print the modified version
    out.

    I'm not sure how many people really want to use this class; let me
    know if you do. Mainly I like the name."""

    # NOTE(review): the 'def popTag' line was dropped from this copy;
    # restored -- verify against upstream.
    def popTag(self):
        if len(self.tagStack) > 1:
            tag = self.tagStack[-1]
            parent = self.tagStack[-2]
            parent._getAttrMap()
            # Only promote a single-string child, and never clobber an
            # existing attribute of the same name.
            if (isinstance(tag, Tag) and len(tag.contents) == 1 and
                isinstance(tag.contents[0], NavigableString) and
                not parent.attrMap.has_key(tag.name)):
                parent[tag.name] = tag.contents[0]
        BeautifulStoneSoup.popTag(self)
#Enterprise class names! It has come to our attention that some people
#think the names of the Beautiful Soup parser classes are too silly
#and "unprofessional" for use in enterprise screen-scraping. We feel
#your pain! For such-minded folk, the Beautiful Soup Consortium And
#All-Night Kosher Bakery recommends renaming this file to
#"RobustParser.py" (or, in cases of extreme enterprisiness,
#"RobustParserBeanInterface.class") and using the following
#enterprise-friendly class aliases:
# NOTE(review): the 'pass' bodies were dropped from this copy; restored.
class RobustXMLParser(BeautifulStoneSoup):
    pass
class RobustHTMLParser(BeautifulSoup):
    pass
class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
    pass
class RobustInsanelyWackAssHTMLParser(MinimalSoup):
    pass
class SimplifyingSOAPParser(BeautifulSOAP):
    pass
######################################################
#
# Bonus library: Unicode, Dammit
#
# This class forces XML data into a standard format (usually to UTF-8
# or Unicode). It is heavily based on code from Mark Pilgrim's
# Universal Feed Parser. It does not rewrite the XML or HTML to
# reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi
# (XML) and BeautifulSoup.start_meta (HTML).

# NOTE(review): the try/except ImportError wrappers were dropped from
# this copy; restored -- both libraries are optional at runtime.

# Autodetects character encodings.
# Download from http://chardet.feedparser.org/
try:
    import chardet
#    import chardet.constants
#    chardet.constants._debug = 1
except ImportError:
    chardet = None

# cjkcodecs and iconv_codec make Python know about more character encodings.
# Both are available from http://cjkpython.i18n.org/
# They're built in if you use Python 2.4.
try:
    import cjkcodecs.aliases
    import iconv_codec
except ImportError:
    pass
class UnicodeDammit:
1665
"""A class for detecting the encoding of a *ML document and
1666
converting it to a Unicode string. If the source encoding is
1667
windows-1252, can replace MS smart quotes with their HTML or XML
1670
# This dictionary maps commonly seen values for "charset" in HTML
1671
# meta tags to the corresponding Python codec names. It only covers
1672
# values that aren't in Python's aliases and can't be determined
1673
# by the heuristics in find_codec.
1674
CHARSET_ALIASES = { "macintosh" : "mac-roman",
1675
"x-sjis" : "shift-jis" }
1677
def __init__(self, markup, overrideEncodings=[],
1678
smartQuotesTo='xml'):
1679
self.markup, documentEncoding, sniffedEncoding = \
1680
self._detectEncoding(markup)
1681
self.smartQuotesTo = smartQuotesTo
1682
self.triedEncodings = []
1683
if markup == '' or isinstance(markup, unicode):
1684
self.originalEncoding = None
1685
self.unicode = unicode(markup)
1689
for proposedEncoding in overrideEncodings:
1690
u = self._convertFrom(proposedEncoding)
1693
for proposedEncoding in (documentEncoding, sniffedEncoding):
1694
u = self._convertFrom(proposedEncoding)
1697
# If no luck and we have auto-detection library, try that:
1698
if not u and chardet and not isinstance(self.markup, unicode):
1699
u = self._convertFrom(chardet.detect(self.markup)['encoding'])
1701
# As a last resort, try utf-8 and windows-1252:
1703
for proposed_encoding in ("utf-8", "windows-1252"):
1704
u = self._convertFrom(proposed_encoding)
1707
if not u: self.originalEncoding = None
1709
def _subMSChar(self, orig):
1710
"""Changes a MS smart quote character to an XML or HTML
1712
sub = self.MS_CHARS.get(orig)
1713
if type(sub) == types.TupleType:
1714
if self.smartQuotesTo == 'xml':
1715
sub = '&#x%s;' % sub[1]
1717
sub = '&%s;' % sub[0]
1720
def _convertFrom(self, proposed):
1721
proposed = self.find_codec(proposed)
1722
if not proposed or proposed in self.triedEncodings:
1724
self.triedEncodings.append(proposed)
1725
markup = self.markup
1727
# Convert smart quotes to HTML if coming from an encoding
1728
# that might have them.
1729
if self.smartQuotesTo and proposed.lower() in("windows-1252",
1732
markup = re.compile("([\x80-\x9f])").sub \
1733
(lambda(x): self._subMSChar(x.group(1)),
1737
# print "Trying to convert document to %s" % proposed
1738
u = self._toUnicode(markup, proposed)
1740
self.originalEncoding = proposed
1741
except Exception, e:
1742
# print "That didn't work!"
1745
#print "Correct encoding: %s" % proposed
1748
def _toUnicode(self, data, encoding):
1749
'''Given a string and its encoding, decodes the string into Unicode.
1750
%encoding is a string recognized by encodings.aliases'''
1752
# strip Byte Order Mark (if present)
1753
if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
1754
and (data[2:4] != '\x00\x00'):
1755
encoding = 'utf-16be'
1757
elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
1758
and (data[2:4] != '\x00\x00'):
1759
encoding = 'utf-16le'
1761
elif data[:3] == '\xef\xbb\xbf':
1764
elif data[:4] == '\x00\x00\xfe\xff':
1765
encoding = 'utf-32be'
1767
elif data[:4] == '\xff\xfe\x00\x00':
1768
encoding = 'utf-32le'
1770
newdata = unicode(data, encoding)
1773
def _detectEncoding(self, xml_data):
1774
"""Given a document, tries to detect its XML encoding."""
1775
xml_encoding = sniffed_xml_encoding = None
1777
if xml_data[:4] == '\x4c\x6f\xa7\x94':
1779
xml_data = self._ebcdic_to_ascii(xml_data)
1780
elif xml_data[:4] == '\x00\x3c\x00\x3f':
1782
sniffed_xml_encoding = 'utf-16be'
1783
xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
1784
elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
1785
and (xml_data[2:4] != '\x00\x00'):
1787
sniffed_xml_encoding = 'utf-16be'
1788
xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
1789
elif xml_data[:4] == '\x3c\x00\x3f\x00':
1791
sniffed_xml_encoding = 'utf-16le'
1792
xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
1793
elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
1794
(xml_data[2:4] != '\x00\x00'):
1796
sniffed_xml_encoding = 'utf-16le'
1797
xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
1798
elif xml_data[:4] == '\x00\x00\x00\x3c':
1800
sniffed_xml_encoding = 'utf-32be'
1801
xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
1802
elif xml_data[:4] == '\x3c\x00\x00\x00':
1804
sniffed_xml_encoding = 'utf-32le'
1805
xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
1806
elif xml_data[:4] == '\x00\x00\xfe\xff':
1808
sniffed_xml_encoding = 'utf-32be'
1809
xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
1810
elif xml_data[:4] == '\xff\xfe\x00\x00':
1812
sniffed_xml_encoding = 'utf-32le'
1813
xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
1814
elif xml_data[:3] == '\xef\xbb\xbf':
1816
sniffed_xml_encoding = 'utf-8'
1817
xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
1819
sniffed_xml_encoding = 'ascii'
1821
xml_encoding_match = re.compile \
1822
('^<\?.*encoding=[\'"](.*?)[\'"].*\?>')\
1825
xml_encoding_match = None
1826
if xml_encoding_match:
1827
xml_encoding = xml_encoding_match.groups()[0].lower()
1828
if sniffed_xml_encoding and \
1829
(xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
1830
'iso-10646-ucs-4', 'ucs-4', 'csucs4',
1831
'utf-16', 'utf-32', 'utf_16', 'utf_32',
1833
xml_encoding = sniffed_xml_encoding
1834
return xml_data, xml_encoding, sniffed_xml_encoding
1837
def find_codec(self, charset):
1838
return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
1839
or (charset and self._codec(charset.replace("-", ""))) \
1840
or (charset and self._codec(charset.replace("-", "_"))) \
1843
def _codec(self, charset):
1844
if not charset: return charset
1847
codecs.lookup(charset)
1849
except (LookupError, ValueError):
1853
EBCDIC_TO_ASCII_MAP = None
1854
def _ebcdic_to_ascii(self, s):
1856
if not c.EBCDIC_TO_ASCII_MAP:
1857
emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
1858
16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
1859
128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
1860
144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
1861
32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
1862
38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
1863
45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
1864
186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
1865
195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
1866
201,202,106,107,108,109,110,111,112,113,114,203,204,205,
1867
206,207,208,209,126,115,116,117,118,119,120,121,122,210,
1868
211,212,213,214,215,216,217,218,219,220,221,222,223,224,
1869
225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
1870
73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
1871
82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
1872
90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
1873
250,251,252,253,254,255)
1875
c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
1876
''.join(map(chr, range(256))), ''.join(map(chr, emap)))
1877
return s.translate(c.EBCDIC_TO_ASCII_MAP)
1879
MS_CHARS = { '\x80' : ('euro', '20AC'),
1881
'\x82' : ('sbquo', '201A'),
1882
'\x83' : ('fnof', '192'),
1883
'\x84' : ('bdquo', '201E'),
1884
'\x85' : ('hellip', '2026'),
1885
'\x86' : ('dagger', '2020'),
1886
'\x87' : ('Dagger', '2021'),
1887
'\x88' : ('circ', '2C6'),
1888
'\x89' : ('permil', '2030'),
1889
'\x8A' : ('Scaron', '160'),
1890
'\x8B' : ('lsaquo', '2039'),
1891
'\x8C' : ('OElig', '152'),
1893
'\x8E' : ('#x17D', '17D'),
1896
'\x91' : ('lsquo', '2018'),
1897
'\x92' : ('rsquo', '2019'),
1898
'\x93' : ('ldquo', '201C'),
1899
'\x94' : ('rdquo', '201D'),
1900
'\x95' : ('bull', '2022'),
1901
'\x96' : ('ndash', '2013'),
1902
'\x97' : ('mdash', '2014'),
1903
'\x98' : ('tilde', '2DC'),
1904
'\x99' : ('trade', '2122'),
1905
'\x9a' : ('scaron', '161'),
1906
'\x9b' : ('rsaquo', '203A'),
1907
'\x9c' : ('oelig', '153'),
1909
'\x9e' : ('#x17E', '17E'),
1910
'\x9f' : ('Yuml', ''),}
1912
#######################################################################
1915
#By default, act as an HTML pretty-printer.
1916
if __name__ == '__main__':
1918
soup = BeautifulSoup(sys.stdin.read())
1919
print soup.prettify()