3
"The Screen-Scraper's Friend"
4
http://www.crummy.com/software/BeautifulSoup/
6
Beautiful Soup parses a (possibly invalid) XML or HTML document into a
7
tree representation. It provides methods and Pythonic idioms that make
8
it easy to navigate, search, and modify the tree.
10
A well-formed XML/HTML document yields a well-formed data
11
structure. An ill-formed XML/HTML document yields a correspondingly
12
ill-formed data structure. If your document is only locally
13
well-formed, you can use this library to find and process the
14
well-formed part of it.
16
Beautiful Soup works with Python 2.2 and up. It has no external
17
dependencies, but you'll have more success at converting data to UTF-8
18
if you also install these three packages:
20
* chardet, for auto-detecting character encodings
21
http://chardet.feedparser.org/
22
* cjkcodecs and iconv_codec, which add more encodings to the ones supported
24
http://cjkpython.i18n.org/
26
Beautiful Soup defines classes for two main parsing strategies:
28
* BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
29
language that kind of looks like XML.
31
* BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
32
or invalid. This class has web browser-like heuristics for
33
obtaining a sensible parse tree in the face of common HTML errors.
35
Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
36
the encoding of an HTML or XML document, and converting it to
37
Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
39
For more than you ever wanted to know about Beautiful Soup, see the
41
http://www.crummy.com/software/BeautifulSoup/documentation.html
43
Here, have some legalese:
45
Copyright (c) 2004-2007, Leonard Richardson
49
Redistribution and use in source and binary forms, with or without
50
modification, are permitted provided that the following conditions are
53
* Redistributions of source code must retain the above copyright
54
notice, this list of conditions and the following disclaimer.
56
* Redistributions in binary form must reproduce the above
57
copyright notice, this list of conditions and the following
58
disclaimer in the documentation and/or other materials provided
59
with the distribution.
61
* Neither the name of the the Beautiful Soup Consortium and All
62
Night Kosher Bakery nor the names of its contributors may be
63
used to endorse or promote products derived from this software
64
without specific prior written permission.
66
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
67
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
68
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
69
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
70
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
71
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
72
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
73
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
74
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
75
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
76
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
79
from __future__ import generators
81
__author__ = "Leonard Richardson (leonardr@segfault.org)"
83
__copyright__ = "Copyright (c) 2004-2007 Leonard Richardson"
84
__license__ = "New-style BSD"
86
from calibre.ebooks.sgmllib import SGMLParser, SGMLParseError
90
import calibre.ebooks.sgmllib as sgmllib
91
from htmlentitydefs import name2codepoint
93
#This hack makes Beautiful Soup able to parse XML with namespaces
94
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
96
DEFAULT_OUTPUT_ENCODING = "utf-8"
98
# First, the classes that represent markup elements.
101
"""Contains the navigational information for some part of the page
102
(either a tag or a piece of text)"""
104
def setup(self, parent=None, previous=None):
105
"""Sets up the initial relations between this element and
108
self.previous = previous
110
self.previousSibling = None
111
self.nextSibling = None
112
if self.parent and self.parent.contents:
113
self.previousSibling = self.parent.contents[-1]
114
self.previousSibling.nextSibling = self
116
def replaceWith(self, replaceWith):
117
oldParent = self.parent
118
myIndex = self.parent.contents.index(self)
119
if hasattr(replaceWith, 'parent') and replaceWith.parent == self.parent:
120
# We're replacing this element with one of its siblings.
121
index = self.parent.contents.index(replaceWith)
122
if index and index < myIndex:
123
# Furthermore, it comes before this element. That
124
# means that when we extract it, the index of this
125
# element will change.
126
myIndex = myIndex - 1
128
oldParent.insert(myIndex, replaceWith)
131
"""Destructively rips this element out of the tree."""
134
self.parent.contents.remove(self)
138
#Find the two elements that would be next to each other if
139
#this element (and any children) hadn't been parsed. Connect
141
lastChild = self._lastRecursiveChild()
142
nextElement = lastChild.next
145
self.previous.next = nextElement
147
nextElement.previous = self.previous
149
lastChild.next = None
152
if self.previousSibling:
153
self.previousSibling.nextSibling = self.nextSibling
155
self.nextSibling.previousSibling = self.previousSibling
156
self.previousSibling = self.nextSibling = None
158
def _lastRecursiveChild(self):
159
"Finds the last element beneath this object to be parsed."
161
while hasattr(lastChild, 'contents') and lastChild.contents:
162
lastChild = lastChild.contents[-1]
165
def insert(self, position, newChild):
166
if (isinstance(newChild, basestring)
167
or isinstance(newChild, unicode)) \
168
and not isinstance(newChild, NavigableString):
169
newChild = NavigableString(newChild)
171
position = min(position, len(self.contents))
172
if hasattr(newChild, 'parent') and newChild.parent != None:
173
# We're 'inserting' an element that's already one
174
# of this object's children.
175
if newChild.parent == self:
176
index = self.find(newChild)
177
if index and index < position:
178
# Furthermore we're moving it further down the
179
# list of this object's children. That means that
180
# when we extract this element, our target index
181
# will jump down one.
182
position = position - 1
185
newChild.parent = self
188
newChild.previousSibling = None
189
newChild.previous = self
191
previousChild = self.contents[position-1]
192
newChild.previousSibling = previousChild
193
newChild.previousSibling.nextSibling = newChild
194
newChild.previous = previousChild._lastRecursiveChild()
195
if newChild.previous:
196
newChild.previous.next = newChild
198
newChildsLastElement = newChild._lastRecursiveChild()
200
if position >= len(self.contents):
201
newChild.nextSibling = None
204
parentsNextSibling = None
205
while not parentsNextSibling:
206
parentsNextSibling = parent.nextSibling
207
parent = parent.parent
208
if not parent: # This is the last element in the document.
210
if parentsNextSibling:
211
newChildsLastElement.next = parentsNextSibling
213
newChildsLastElement.next = None
215
nextChild = self.contents[position]
216
newChild.nextSibling = nextChild
217
if newChild.nextSibling:
218
newChild.nextSibling.previousSibling = newChild
219
newChildsLastElement.next = nextChild
221
if newChildsLastElement.next:
222
newChildsLastElement.next.previous = newChildsLastElement
223
self.contents.insert(position, newChild)
225
def append(self, tag):
    """Add *tag* as the last child of this tag.

    Equivalent to inserting at the end of self.contents.
    """
    end = len(self.contents)
    self.insert(end, tag)
229
def findNext(self, name=None, attrs={}, text=None, **kwargs):
    """Return the first item matching the given criteria that
    occurs later in the document than this element."""
    finder = self.findAllNext
    return self._findOne(finder, name, attrs, text, **kwargs)
234
def findAllNext(self, name=None, attrs={}, text=None, limit=None,
236
"""Returns all items that match the given criteria and appear
237
before after Tag in the document."""
238
return self._findAll(name, attrs, text, limit, self.nextGenerator)
240
def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
241
"""Returns the closest sibling to this Tag that matches the
242
given criteria and appears after this Tag in the document."""
243
return self._findOne(self.findNextSiblings, name, attrs, text,
246
def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
248
"""Returns the siblings of this Tag that match the given
249
criteria and appear after this Tag in the document."""
250
return self._findAll(name, attrs, text, limit,
251
self.nextSiblingGenerator, **kwargs)
252
fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x
254
def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
    """Return the first item matching the given criteria that
    occurs earlier in the document than this element."""
    finder = self.findAllPrevious
    return self._findOne(finder, name, attrs, text, **kwargs)
259
def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
261
"""Returns all items that match the given criteria and appear
262
before this Tag in the document."""
263
return self._findAll(name, attrs, text, limit, self.previousGenerator,
265
fetchPrevious = findAllPrevious # Compatibility with pre-3.x
267
def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
268
"""Returns the closest sibling to this Tag that matches the
269
given criteria and appears before this Tag in the document."""
270
return self._findOne(self.findPreviousSiblings, name, attrs, text,
273
def findPreviousSiblings(self, name=None, attrs={}, text=None,
                         limit=None, **kwargs):
    """Return the siblings of this element that match the given
    criteria and precede it in the document."""
    gen = self.previousSiblingGenerator
    return self._findAll(name, attrs, text, limit, gen, **kwargs)
fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x
281
def findParent(self, name=None, attrs={}, **kwargs):
282
"""Returns the closest parent of this Tag that matches the given
284
# NOTE: We can't use _findOne because findParents takes a different
287
l = self.findParents(name, attrs, 1)
292
def findParents(self, name=None, attrs={}, limit=None, **kwargs):
293
"""Returns the parents of this Tag that match the given
296
return self._findAll(name, attrs, None, limit, self.parentGenerator,
298
fetchParents = findParents # Compatibility with pre-3.x
300
#These methods do the real heavy lifting.
302
def _findOne(self, method, name, attrs, text, **kwargs):
304
l = method(name, attrs, text, 1, **kwargs)
309
def _findAll(self, name, attrs, text, limit, generator, **kwargs):
310
"Iterates over a generator looking for things that match."
312
if isinstance(name, SoupStrainer):
315
# Build a SoupStrainer
316
strainer = SoupStrainer(name, attrs, text, **kwargs)
317
results = ResultSet(strainer)
322
except StopIteration:
325
found = strainer.search(i)
327
results.append(found)
328
if limit and len(results) >= limit:
332
#These Generators can be used to navigate starting from both
333
#NavigableStrings and Tags.
334
def nextGenerator(self):
340
def nextSiblingGenerator(self):
346
def previousGenerator(self):
352
def previousSiblingGenerator(self):
355
i = i.previousSibling
358
def parentGenerator(self):
365
def substituteEncoding(self, str, encoding=None):
    """Replace the %SOUP-ENCODING% placeholder in *str* with the
    given encoding name, defaulting to utf-8 when none is given."""
    if not encoding:
        encoding = "utf-8"
    return str.replace("%SOUP-ENCODING%", encoding)
369
def toEncoding(self, s, encoding=None):
370
"""Encodes an object to a string in some encoding, or to Unicode.
372
if isinstance(s, unicode):
374
s = s.encode(encoding)
375
elif isinstance(s, str):
377
s = s.encode(encoding)
382
s = self.toEncoding(str(s), encoding)
387
class NavigableString(unicode, PageElement):
389
def __getnewargs__(self):
    """Support copy/pickle: rebuild this object from its plain
    string value."""
    value = NavigableString.__str__(self)
    return (value,)
392
def __getattr__(self, attr):
393
"""text.string gives you text. This is for backwards
394
compatibility for Navigable*String, but for CData* it lets you
395
get the string without the CData wrapper."""
399
raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
401
def __unicode__(self):
    # Decode the encoded byte-string form back to Unicode using the
    # module default encoding. (Changed by Kovid.)
    encoded = str(self)
    return unicode(encoded, DEFAULT_OUTPUT_ENCODING)
404
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
406
return self.encode(encoding)
410
class CData(NavigableString):
    """A navigable string that renders wrapped in a CDATA section."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        text = NavigableString.__str__(self, encoding)
        return "<![CDATA[" + text + "]]>"
415
class ProcessingInstruction(NavigableString):
416
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
418
if "%SOUP-ENCODING%" in output:
419
output = self.substituteEncoding(output, encoding)
420
return "<?%s?>" % self.toEncoding(output, encoding)
422
class Comment(NavigableString):
    """A navigable string that renders as an HTML/XML comment."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        text = NavigableString.__str__(self, encoding)
        return "<!--" + text + "-->"
426
class Declaration(NavigableString):
    """A navigable string that renders as an SGML declaration."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        text = NavigableString.__str__(self, encoding)
        return "<!" + text + ">"
430
class Tag(PageElement):
432
"""Represents a found HTML tag with its attributes and contents."""
435
"Cheap function to invert a hash."
437
for k,v in h.items():
441
XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'",
447
XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)
449
def _convertEntities(self, match):
450
"""Used in a call to re.sub to replace HTML, XML, and numeric
451
entities with the appropriate Unicode characters. If HTML
452
entities are being converted, any unrecognized entities are
455
if self.convertHTMLEntities and x in name2codepoint:
456
return unichr(name2codepoint[x])
457
elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS:
458
if self.convertXMLEntities:
459
return self.XML_ENTITIES_TO_SPECIAL_CHARS[x]
462
elif len(x) > 0 and x[0] == '#':
463
# Handle numeric entities
464
if len(x) > 1 and x[1] == 'x':
465
return unichr(int(x[2:], 16))
467
return unichr(int(x[1:]))
469
elif self.escapeUnrecognizedEntities:
470
return u'&%s;' % x
474
def __init__(self, parser, name, attrs=None, parent=None,
478
# We don't actually store the parser object: that lets extracted
479
# chunks be garbage-collected
480
self.parserClass = parser.__class__
481
self.isSelfClosing = parser.isSelfClosingTag(name)
487
self.setup(parent, previous)
489
self.containsSubstitutions = False
490
self.convertHTMLEntities = parser.convertHTMLEntities
491
self.convertXMLEntities = parser.convertXMLEntities
492
self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities
494
# Convert any HTML, XML, or numeric entities in the attribute values.
495
convert = lambda(k, val): (k,
496
re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",
497
self._convertEntities,
499
self.attrs = map(convert, self.attrs)
501
def get(self, key, default=None):
502
"""Returns the value of the 'key' attribute for the tag, or
503
the value given for 'default' if it doesn't have that
505
return self._getAttrMap().get(key, default)
507
def has_key(self, key):
    """Return True if this tag has an attribute named *key*.

    Kept under its dict-like pre-3.x name for compatibility; the
    body uses the `in` operator because dict.has_key() is deprecated
    (and removed in Python 3) -- membership semantics are identical.
    """
    return key in self._getAttrMap()
510
def __getitem__(self, key):
    """tag[key] returns the value of the 'key' attribute for the tag,
    raising KeyError (via the underlying dict) if it's absent."""
    attr_map = self._getAttrMap()
    return attr_map[key]
516
"Iterating over a tag iterates over its contents."
517
return iter(self.contents)
520
"The length of a tag is the length of its list of contents."
521
return len(self.contents)
523
def __contains__(self, x):
    """Support `x in tag`: membership in the direct contents list."""
    children = self.contents
    return x in children
526
def __nonzero__(self):
527
"A tag is non-None even if it has no contents."
530
def __setitem__(self, key, value):
531
"""Setting tag[key] sets the value of the 'key' attribute for the
534
self.attrMap[key] = value
536
for i in range(0, len(self.attrs)):
537
if self.attrs[i][0] == key:
538
self.attrs[i] = (key, value)
541
self.attrs.append((key, value))
542
self._getAttrMap()[key] = value
544
def __delitem__(self, key):
545
"Deleting tag[key] deletes all 'key' attributes for the tag."
546
for item in self.attrs:
548
self.attrs.remove(item)
549
#We don't break because bad HTML can define the same
550
#attribute multiple times.
552
if self.attrMap.has_key(key):
553
del self.attrMap[key]
555
def __call__(self, *args, **kwargs):
    """Calling a tag like a function is the same as calling its
    findAll() method. Eg. tag('a') returns a list of all the A tags
    found within this tag."""
    # Direct argument unpacking replaces the deprecated builtin
    # apply() (removed in Python 3); behavior is identical.
    return self.findAll(*args, **kwargs)
561
def __getattr__(self, tag):
    """Attribute access as tag search: soup.fooTag (old spelling) and
    soup.foo both return the first 'foo' tag found, except for names
    starting with '__', which raise AttributeError."""
    #print "Getattr %s.%s" % (self.__class__, tag)
    if len(tag) > 3 and tag.endswith('Tag'):
        # 'fooTag' is the unambiguous pre-3.x spelling of '.foo'.
        return self.find(tag[:-3])
    elif not tag.startswith('__'):
        return self.find(tag)
    # raise E(msg): the "raise E, msg" form is Python-2-only syntax.
    raise AttributeError("'%s' object has no attribute '%s'" % (self.__class__, tag))
569
def __eq__(self, other):
570
"""Returns true iff this tag has the same name, the same attributes,
571
and the same contents (recursively) as the given tag.
573
NOTE: right now this will return false if two tags have the
574
same attributes in a different order. Should this be fixed?"""
575
if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
577
for i in range(0, len(self.contents)):
578
if self.contents[i] != other.contents[i]:
582
def __ne__(self, other):
    """True iff this tag differs from *other*, where sameness is
    whatever __eq__ defines."""
    return not (self == other)
587
def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
    """Render this tag and its contents as an encoded string."""
    return self.__str__(encoding)
591
def __unicode__(self):
    """Render this tag as Unicode (encoding=None selects Unicode
    output in __str__)."""
    return self.__str__(None)
594
BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
595
+ "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
598
def _sub_entity(self, x):
    """re.sub callback: map a matched XML special character to its
    entity reference."""
    char = x.group(0)[0]
    entity = self.XML_SPECIAL_CHARS_TO_ENTITIES[char]
    return "&%s;" % entity
603
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
604
prettyPrint=False, indentLevel=0):
605
"""Returns a string or Unicode representation of this tag and
606
its contents. To get Unicode, pass None for encoding.
608
NOTE: since Python's HTML parser consumes whitespace, this
609
method is not certain to reproduce the whitespace present in
610
the original string."""
612
encodedName = self.toEncoding(self.name, encoding)
616
for key, val in self.attrs:
619
if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
620
val = self.substituteEncoding(val, encoding)
622
# The attribute value either:
624
# * Contains no embedded double quotes or single quotes.
625
# No problem: we enclose it in double quotes.
626
# * Contains embedded single quotes. No problem:
627
# double quotes work here too.
628
# * Contains embedded double quotes. No problem:
629
# we enclose it in single quotes.
630
# * Embeds both single _and_ double quotes. This
631
# can't happen naturally, but it can happen if
632
# you modify an attribute value after parsing
633
# the document. Now we have a bit of a
634
# problem. We solve it by enclosing the
635
# attribute in single quotes, and escaping any
636
# embedded single quotes to XML entities.
640
# TODO: replace with apos when
642
val = val.replace("'", "&squot;")
644
# Now we're okay w/r/t quotes. But the attribute
645
# value might also contain angle brackets, or
646
# ampersands that aren't part of entities. We need
647
# to escape those to XML entities too.
648
val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)
650
attrs.append(fmt % (self.toEncoding(key, encoding),
651
self.toEncoding(val, encoding)))
654
if self.isSelfClosing:
657
closeTag = '</%s>' % encodedName
659
indentTag, indentContents = 0, 0
661
indentTag = indentLevel
662
space = (' ' * (indentTag-1))
663
indentContents = indentTag + 1
664
contents = self.renderContents(encoding, prettyPrint, indentContents)
671
attributeString = ' ' + ' '.join(attrs)
674
s.append('<%s%s%s>' % (encodedName, attributeString, close))
678
if prettyPrint and contents and contents[-1] != "\n":
680
if prettyPrint and closeTag:
683
if prettyPrint and closeTag and self.nextSibling:
688
def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
    """Render this tag with pretty-printing (indentation) enabled."""
    pretty = True
    return self.__str__(encoding, pretty)
691
def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
692
prettyPrint=False, indentLevel=0):
693
"""Renders the contents of this tag as a string in the given
694
encoding. If encoding is None, returns a Unicode string.."""
698
if isinstance(c, NavigableString):
699
text = c.__str__(encoding)
700
elif isinstance(c, Tag):
701
s.append(c.__str__(encoding, prettyPrint, indentLevel))
702
if text and prettyPrint:
706
s.append(" " * (indentLevel-1))
714
def find(self, name=None, attrs={}, recursive=True, text=None,
716
"""Return only the first child of this Tag matching the given
719
l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
725
def findAll(self, name=None, attrs={}, recursive=True, text=None,
726
limit=None, **kwargs):
727
"""Extracts a list of Tag objects that match the given
728
criteria. You can specify the name of the Tag and any
729
attributes you want the Tag to have.
731
The value of a key-value pair in the 'attrs' map can be a
732
string, a list of strings, a regular expression object, or a
733
callable that takes a string and returns whether or not the
734
string matches for some custom definition of 'matches'. The
735
same is true of the tag name."""
736
generator = self.recursiveChildGenerator
738
generator = self.childGenerator
739
return self._findAll(name, attrs, text, limit, generator, **kwargs)
740
findChildren = findAll
742
# Pre-3.x compatibility methods
746
def fetchText(self, text=None, recursive=True, limit=None):
    """Pre-3.x alias: collect matching text nodes via findAll."""
    criteria = {'text': text, 'recursive': recursive, 'limit': limit}
    return self.findAll(**criteria)
749
def firstText(self, text=None, recursive=True):
    """Pre-3.x alias: return the first matching text node via find."""
    criteria = {'text': text, 'recursive': recursive}
    return self.find(**criteria)
754
def _getAttrMap(self):
755
"""Initializes a map representation of this tag's attributes,
756
if not already initialized."""
757
if not getattr(self, 'attrMap'):
759
for (key, value) in self.attrs:
760
self.attrMap[key] = value
764
def childGenerator(self):
    """Yield each direct child of this tag, in document order."""
    # Iterate the contents list directly instead of indexing it with
    # a manual range() loop -- same traversal, idiomatic and clearer.
    for child in self.contents:
        yield child
769
def recursiveChildGenerator(self):
772
tag, start = stack.pop()
773
if isinstance(tag, Tag):
774
for i in range(start, len(tag.contents)):
777
if isinstance(a, Tag) and tag.contents:
778
if i < len(tag.contents) - 1:
779
stack.append((tag, i+1))
784
# Next, a couple classes to represent queries and their results.
786
"""Encapsulates a number of ways of matching a markup element (tag or
789
def __init__(self, name=None, attrs={}, text=None, **kwargs):
792
kwargs['class'] = attrs
807
return "%s|%s" % (self.name, self.attrs)
809
def searchTag(self, markupName=None, markupAttrs={}):
812
if isinstance(markupName, Tag):
815
callFunctionWithTagData = callable(self.name) \
816
and not isinstance(markupName, Tag)
819
or callFunctionWithTagData \
820
or (markup and self._matches(markup, self.name)) \
821
or (not markup and self._matches(markupName, self.name)):
822
if callFunctionWithTagData:
823
match = self.name(markupName, markupAttrs)
827
for attr, matchAgainst in self.attrs.items():
828
if not markupAttrMap:
829
if hasattr(markupAttrs, 'get'):
830
markupAttrMap = markupAttrs
833
for k,v in markupAttrs:
835
attrValue = markupAttrMap.get(attr)
836
if not self._matches(attrValue, matchAgainst):
846
def search(self, markup):
847
#print 'looking for %s in %s' % (self, markup)
849
# If given a list of items, scan it for a text element that
851
if isList(markup) and not isinstance(markup, Tag):
852
for element in markup:
853
if isinstance(element, NavigableString) \
854
and self.search(element):
857
# If it's a Tag, make sure its name or attributes match.
858
# Don't bother with Tags if we're searching for text.
859
elif isinstance(markup, Tag):
861
found = self.searchTag(markup)
862
# If it's text, make sure the text matches.
863
elif isinstance(markup, NavigableString) or \
865
if self._matches(markup, self.text):
868
raise Exception, "I don't know how to match against a %s" \
872
def _matches(self, markup, matchAgainst):
873
#print "Matching %s against %s" % (markup, matchAgainst)
875
if matchAgainst == True and type(matchAgainst) == types.BooleanType:
876
result = markup != None
877
elif callable(matchAgainst):
878
result = matchAgainst(markup)
880
#Custom match methods take the tag as an argument, but all
881
#other ways of matching match the tag name as a string.
882
if isinstance(markup, Tag):
884
if markup and not isString(markup):
885
markup = unicode(markup)
886
#Now we know that chunk is either a string, or None.
887
if hasattr(matchAgainst, 'match'):
888
# It's a regexp object.
889
result = markup and matchAgainst.search(markup)
890
elif isList(matchAgainst):
891
result = markup in matchAgainst
892
elif hasattr(matchAgainst, 'items'):
893
result = markup.has_key(matchAgainst)
894
elif matchAgainst and isString(markup):
895
if isinstance(markup, unicode):
896
matchAgainst = unicode(matchAgainst)
898
matchAgainst = str(matchAgainst)
901
result = matchAgainst == markup
904
class ResultSet(list):
905
"""A ResultSet is just a list that keeps track of the SoupStrainer
907
def __init__(self, source):
911
# Now, some helper functions.
914
"""Convenience method that works with all 2.x versions of Python
915
to determine whether or not something is listlike."""
916
return hasattr(l, '__iter__') \
917
or (type(l) in (types.ListType, types.TupleType))
920
"""Convenience method that works with all 2.x versions of Python
921
to determine whether or not something is stringlike."""
923
return isinstance(s, unicode) or isinstance(s, basestring)
925
return isinstance(s, str)
927
def buildTagMap(default, *args):
928
"""Turns a list of maps, lists, or scalars into a single map.
929
Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
930
NESTING_RESET_TAGS maps out of lists and partial maps."""
933
if hasattr(portion, 'items'):
934
#It's a map. Merge it.
935
for k,v in portion.items():
937
elif isList(portion):
938
#It's a list. Map each item to the default.
942
#It's a scalar. Map it to the default.
943
built[portion] = default
946
# Now, the parser classes.
948
class BeautifulStoneSoup(Tag, SGMLParser):
950
"""This class contains the basic parser and search code. It defines
951
a parser that knows nothing about tag behavior except for the
954
You can't close a tag without closing all the tags it encloses.
955
That is, "<foo><bar></foo>" actually means
956
"<foo><bar></bar></foo>".
958
[Another possible explanation is "<foo><bar /></foo>", but since
959
this class defines no SELF_CLOSING_TAGS, it will never use that
962
This class is useful for parsing XML or made-up markup languages,
963
or when BeautifulSoup makes an assumption counter to what you were
966
SELF_CLOSING_TAGS = {}
968
RESET_NESTING_TAGS = {}
971
MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
972
lambda x: x.group(1) + ' />'),
973
(re.compile('<!\s+([^<>]*)>'),
974
lambda x: '<!' + x.group(1) + '>')
977
ROOT_TAG_NAME = u'[document]'
979
HTML_ENTITIES = "html"
981
XHTML_ENTITIES = "xhtml"
982
# TODO: This only exists for backwards-compatibility
983
ALL_ENTITIES = XHTML_ENTITIES
985
# Used when determining whether a text node is all whitespace and
986
# can be replaced with a single space. A text node that contains
987
# fancy Unicode spaces (usually non-breaking) should be left
989
STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }
991
def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
992
markupMassage=True, smartQuotesTo=XML_ENTITIES,
993
convertEntities=None, selfClosingTags=None):
994
"""The Soup object is initialized as the 'root tag', and the
995
provided markup (which can be a string or a file-like object)
996
is fed into the underlying parser.
998
sgmllib will process most bad HTML, and the BeautifulSoup
999
class has some tricks for dealing with some HTML that kills
1000
sgmllib, but Beautiful Soup can nonetheless choke or lose data
1001
if your data uses self-closing tags or declarations
1004
By default, Beautiful Soup uses regexes to sanitize input,
1005
avoiding the vast majority of these problems. If the problems
1006
don't apply to you, pass in False for markupMassage, and
1007
you'll get better performance.
1009
The default parser massage techniques fix the two most common
1010
instances of invalid HTML that choke sgmllib:
1012
<br/> (No space between name of closing tag and tag close)
1013
<! --Comment--> (Extraneous whitespace in declaration)
1015
You can pass in a custom list of (RE object, replace method)
1016
tuples to get Beautiful Soup to scrub your input the way you
1019
self.parseOnlyThese = parseOnlyThese
1020
self.fromEncoding = fromEncoding
1021
self.smartQuotesTo = smartQuotesTo
1022
self.convertEntities = convertEntities
1023
# Set the rules for how we'll deal with the entities we
1025
if self.convertEntities:
1026
# It doesn't make sense to convert encoded characters to
1027
# entities even while you're converting entities to Unicode.
1028
# Just convert it all to Unicode.
1029
self.smartQuotesTo = None
1030
if convertEntities == self.HTML_ENTITIES:
1031
self.convertXMLEntities = False
1032
self.convertHTMLEntities = True
1033
self.escapeUnrecognizedEntities = True
1034
elif convertEntities == self.XHTML_ENTITIES:
1035
self.convertXMLEntities = True
1036
self.convertHTMLEntities = True
1037
self.escapeUnrecognizedEntities = False
1038
elif convertEntities == self.XML_ENTITIES:
1039
self.convertXMLEntities = True
1040
self.convertHTMLEntities = False
1041
self.escapeUnrecognizedEntities = False
1043
self.convertXMLEntities = False
1044
self.convertHTMLEntities = False
1045
self.escapeUnrecognizedEntities = False
1047
self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
1048
SGMLParser.__init__(self)
1050
if hasattr(markup, 'read'): # It's a file-type object.
1051
markup = markup.read()
1052
self.markup = markup
1053
self.markupMassage = markupMassage
1058
self.markup = None # The markup can now be GCed
1060
def convert_charref(self, name):
1061
"""This method fixes a bug in Python's SGMLParser."""
1066
if not 0 <= n <= 127 : # ASCII ends at 127, not 255
1068
return self.convert_codepoint(n)
1070
def _feed(self, inDocumentEncoding=None):
1071
# Convert the document to Unicode.
1072
markup = self.markup
1073
if isinstance(markup, unicode):
1074
if not hasattr(self, 'originalEncoding'):
1075
self.originalEncoding = None
1077
# Changed detection by Kovid
1078
markup, self.originalEncoding = chardet.xml_to_unicode(markup)
1080
if self.markupMassage:
1081
if not isList(self.markupMassage):
1082
self.markupMassage = self.MARKUP_MASSAGE
1083
for fix, m in self.markupMassage:
1084
markup = fix.sub(m, markup)
1085
# TODO: We get rid of markupMassage so that the
1086
# soup object can be deepcopied later on. Some
1087
# Python installations can't copy regexes. If anyone
1088
# was relying on the existence of markupMassage, this
1089
# might cause problems.
1090
del(self.markupMassage)
1091
self.markup = markup
1094
SGMLParser.feed(self, markup)
1095
# Close out any unfinished strings and close all the open tags.
1097
while self.currentTag.name != self.ROOT_TAG_NAME:
1100
def __getattr__(self, methodName):
1101
"""This method routes method call requests to either the SGMLParser
1102
superclass or the Tag superclass, depending on the method name."""
1103
#print "__getattr__ called on %s.%s" % (self.__class__, methodName)
1105
if methodName.find('start_') == 0 or methodName.find('end_') == 0 \
1106
or methodName.find('do_') == 0:
1107
return SGMLParser.__getattr__(self, methodName)
1108
elif methodName.find('__') != 0:
1109
return Tag.__getattr__(self, methodName)
1111
raise AttributeError
1113
def isSelfClosingTag(self, name):
    """Returns true iff the given string is the name of a
    self-closing tag according to this parser."""
    # 'in' replaces the deprecated dict.has_key(); same truth value,
    # and checks the class-wide table first, then per-instance ones.
    return name in self.SELF_CLOSING_TAGS \
        or name in self.instanceSelfClosingTags
1120
Tag.__init__(self, self, self.ROOT_TAG_NAME)
1122
SGMLParser.reset(self)
1123
self.currentData = []
1124
self.currentTag = None
1126
self.quoteStack = []
1130
tag = self.tagStack.pop()
1131
# Tags with just one string-owning child get the child as a
1132
# 'string' property, so that soup.tag.string is shorthand for
1133
# soup.tag.contents[0]
1134
if len(self.currentTag.contents) == 1 and \
1135
isinstance(self.currentTag.contents[0], NavigableString):
1136
self.currentTag.string = self.currentTag.contents[0]
1138
#print "Pop", tag.name
1140
self.currentTag = self.tagStack[-1]
1141
return self.currentTag
1143
def pushTag(self, tag):
1144
#print "Push", tag.name
1146
self.currentTag.contents.append(tag)
1147
self.tagStack.append(tag)
1148
self.currentTag = self.tagStack[-1]
1150
def endData(self, containerClass=NavigableString):
1151
if self.currentData:
1152
currentData = ''.join(self.currentData)
1153
if not currentData.translate(self.STRIP_ASCII_SPACES):
1154
if '\n' in currentData:
1158
self.currentData = []
1159
if self.parseOnlyThese and len(self.tagStack) <= 1 and \
1160
(not self.parseOnlyThese.text or \
1161
not self.parseOnlyThese.search(currentData)):
1163
o = containerClass(currentData)
1164
o.setup(self.currentTag, self.previous)
1166
self.previous.next = o
1168
self.currentTag.contents.append(o)
1171
def _popToTag(self, name, inclusivePop=True):
1172
"""Pops the tag stack up to and including the most recent
1173
instance of the given tag. If inclusivePop is false, pops the tag
1174
stack up to but *not* including the most recent instqance of
1176
#print "Popping to %s" % name
1177
if name == self.ROOT_TAG_NAME:
1181
mostRecentTag = None
1182
for i in range(len(self.tagStack)-1, 0, -1):
1183
if name == self.tagStack[i].name:
1184
numPops = len(self.tagStack)-i
1186
if not inclusivePop:
1187
numPops = numPops - 1
1189
for i in range(0, numPops):
1190
mostRecentTag = self.popTag()
1191
return mostRecentTag
1193
def _smartPop(self, name):
1195
"""We need to pop up to the previous tag of this type, unless
1196
one of this tag's nesting reset triggers comes between this
1197
tag and the previous tag of this type, OR unless this tag is a
1198
generic nesting trigger and another generic nesting trigger
1199
comes between this tag and the previous tag of this type.
1202
<p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
1203
<p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.
1204
<p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.
1206
<li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
1207
<tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
1208
<td><tr><td> *<td>* should pop to 'tr', not the first 'td'
1211
nestingResetTriggers = self.NESTABLE_TAGS.get(name)
1212
isNestable = nestingResetTriggers != None
1213
isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
1216
for i in range(len(self.tagStack)-1, 0, -1):
1217
p = self.tagStack[i]
1218
if (not p or p.name == name) and not isNestable:
1219
#Non-nestable tags get popped to the top or to their
1223
if (nestingResetTriggers != None
1224
and p.name in nestingResetTriggers) \
1225
or (nestingResetTriggers == None and isResetNesting
1226
and self.RESET_NESTING_TAGS.has_key(p.name)):
1228
#If we encounter one of the nesting reset triggers
1229
#peculiar to this tag, or we encounter another tag
1230
#that causes nesting to reset, pop up to but not
1231
#including that tag.
1237
self._popToTag(popTo, inclusive)
1239
def unknown_starttag(self, name, attrs, selfClosing=0):
1240
#print "Start tag %s: %s" % (name, attrs)
1242
#This is not a real tag.
1243
#print "<%s> is not real!" % name
1244
attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs))
1245
self.handle_data('<%s%s>' % (name, attrs))
1249
if not self.isSelfClosingTag(name) and not selfClosing:
1250
self._smartPop(name)
1252
if self.parseOnlyThese and len(self.tagStack) <= 1 \
1253
and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
1256
tag = Tag(self, name, attrs, self.currentTag, self.previous)
1258
self.previous.next = tag
1261
if selfClosing or self.isSelfClosingTag(name):
1263
if name in self.QUOTE_TAGS:
1264
#print "Beginning quote (%s)" % name
1265
self.quoteStack.append(name)
1269
def unknown_endtag(self, name):
1270
#print "End tag %s" % name
1271
if self.quoteStack and self.quoteStack[-1] != name:
1272
#This is not a real end tag.
1273
#print "</%s> is not real!" % name
1274
self.handle_data('</%s>' % name)
1277
self._popToTag(name)
1278
if self.quoteStack and self.quoteStack[-1] == name:
1279
self.quoteStack.pop()
1280
self.literal = (len(self.quoteStack) > 0)
1282
def handle_data(self, data):
    """Called by the parser with a run of character data; buffered in
    self.currentData until endData() joins and flushes it."""
    self.currentData.append(data)
1285
def _toStringSubclass(self, text, subclass):
1286
"""Adds a certain piece of text to the tree as a NavigableString
1289
self.handle_data(text)
1290
self.endData(subclass)
1292
def handle_pi(self, text):
    """Handle a processing instruction as a ProcessingInstruction
    object, possibly one with a %SOUP-ENCODING% slot into which an
    encoding will be plugged later."""
    if text[:3] == "xml":
        # Rewrite the XML declaration so the eventual output encoding
        # can be substituted in at render time.
        text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
    self._toStringSubclass(text, ProcessingInstruction)
1300
def handle_comment(self, text):
    "Handle comments as Comment objects."
    self._toStringSubclass(text, Comment)
1304
def handle_charref(self, ref):
    "Handle character references as data."
    if self.convertEntities:
        if ref.lower().startswith('x'):
            # Hex numeric entity (&#x...;). Added by Kovid.
            ref = int(ref[1:], 16)
        data = unichr(int(ref))
    else:
        # Not converting entities: pass the reference through verbatim.
        data = '&#%s;' % ref
    self.handle_data(data)
1314
def handle_entityref(self, ref):
1315
"""Handle entity references as data, possibly converting known
1316
HTML and/or XML entity references to the corresponding Unicode
1319
if self.convertHTMLEntities:
1321
data = unichr(name2codepoint[ref])
1325
if not data and self.convertXMLEntities:
1326
data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)
1328
if not data and self.convertHTMLEntities and \
1329
not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
1330
# TODO: We've got a problem here. We're told this is
1331
# an entity reference, but it's not an XML entity
1332
# reference or an HTML entity reference. Nonetheless,
1333
# the logical thing to do is to pass it through as an
1334
# unrecognized entity reference.
1336
# Except: when the input is "&carol;" this function
1337
# will be called with input "carol". When the input is
1338
# "AT&T", this function will be called with input
1339
# "T". We have no way of knowing whether a semicolon
1340
# was present originally, so we don't know whether
1341
# this is an unknown entity or just a misplaced
1344
# The more common case is a misplaced ampersand, so I
1345
# escape the ampersand and omit the trailing semicolon.
1346
data = "&%s" % ref
1348
# This case is different from the one above, because we
1349
# haven't already gone through a supposedly comprehensive
1350
# mapping of entities to Unicode characters. We might not
1351
# have gone through any mapping at all. So the chances are
1352
# very high that this is a real entity, and not a
1353
# misplaced ampersand.
1355
self.handle_data(data)
1357
def handle_decl(self, data):
    "Handle DOCTYPEs and the like as Declaration objects."
    self._toStringSubclass(data, Declaration)
1361
def parse_declaration(self, i):
1362
"""Treat a bogus SGML declaration as raw data. Treat a CDATA
1363
declaration as a CData object."""
1365
if self.rawdata[i:i+9] == '<![CDATA[':
1366
k = self.rawdata.find(']]>', i)
1368
k = len(self.rawdata)
1369
data = self.rawdata[i+9:k]
1371
self._toStringSubclass(data, CData)
1374
j = SGMLParser.parse_declaration(self, i)
1375
except SGMLParseError:
1376
toHandle = self.rawdata[i:]
1377
self.handle_data(toHandle)
1378
j = i + len(toHandle)
1381
class BeautifulSoup(BeautifulStoneSoup):
1383
"""This parser knows the following facts about HTML:
1385
* Some tags have no closing tag and should be interpreted as being
1386
closed as soon as they are encountered.
1388
* The text inside some tags (ie. 'script') may contain tags which
1389
are not really part of the document and which should be parsed
1390
as text, not tags. If you want to parse the text as tags, you can
1391
always fetch it and parse it explicitly.
1393
* Tag nesting rules:
1395
Most tags can't be nested at all. For instance, the occurance of
1396
a <p> tag should implicitly close the previous <p> tag.
1399
should be transformed into:
1400
<p>Para1</p><p>Para2
1402
Some tags can be nested arbitrarily. For instance, the occurance
1403
of a <blockquote> tag should _not_ implicitly close the previous
1406
Alice said: <blockquote>Bob said: <blockquote>Blah
1407
should NOT be transformed into:
1408
Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah
1410
Some tags can be nested, but the nesting is reset by the
1411
interposition of other tags. For instance, a <tr> tag should
1412
implicitly close the previous <tr> tag within the same <table>,
1413
but not close a <tr> tag in another table.
1415
<table><tr>Blah<tr>Blah
1416
should be transformed into:
1417
<table><tr>Blah</tr><tr>Blah
1419
<tr>Blah<table><tr>Blah
1420
should NOT be transformed into
1421
<tr>Blah<table></tr><tr>Blah
1423
Differing assumptions about tag nesting rules are a major source
1424
of problems with the BeautifulSoup class. If BeautifulSoup is not
1425
treating as nestable a tag your page author treats as nestable,
1426
try ICantBelieveItsBeautifulSoup, MinimalSoup, or
1427
BeautifulStoneSoup before writing your own subclass."""
1429
def __init__(self, *args, **kwargs):
    """Like BeautifulStoneSoup.__init__, but smartQuotesTo defaults to
    HTML entity conversion, which is appropriate for HTML documents."""
    # 'not in' replaces the deprecated dict.has_key(); caller-supplied
    # values still win.
    if 'smartQuotesTo' not in kwargs:
        kwargs['smartQuotesTo'] = self.HTML_ENTITIES
    BeautifulStoneSoup.__init__(self, *args, **kwargs)
1434
SELF_CLOSING_TAGS = buildTagMap(None,
1435
['br' , 'hr', 'input', 'img', 'meta',
1436
'spacer', 'link', 'frame', 'base'])
1438
QUOTE_TAGS = {'script' : None, 'textarea' : None}
1440
#According to the HTML standard, each of these inline tags can
1441
#contain another tag of the same type. Furthermore, it's common
1442
#to actually use these tags this way.
1443
NESTABLE_INLINE_TAGS = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
1446
#According to the HTML standard, these block tags can contain
1447
#another tag of the same type. Furthermore, it's common
1448
#to actually use these tags this way.
1449
NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del']
1451
#Lists can contain other lists, but there are restrictions.
1452
NESTABLE_LIST_TAGS = { 'ol' : [],
1454
'li' : ['ul', 'ol'],
1459
#Tables can contain other tables, but there are restrictions.
1460
NESTABLE_TABLE_TAGS = {'table' : [],
1461
'tr' : ['table', 'tbody', 'tfoot', 'thead'],
1464
'thead' : ['table'],
1465
'tbody' : ['table'],
1466
'tfoot' : ['table'],
1469
NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre']
1471
#If one of these tags is encountered, all tags up to the next tag of
1472
#this type are popped.
1473
RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
1474
NON_NESTABLE_BLOCK_TAGS,
1476
NESTABLE_TABLE_TAGS)
1478
NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
1479
NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)
1481
# Used to detect the charset in a META tag; see start_meta
1482
CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)")
1484
def start_meta(self, attrs):
1485
"""Beautiful Soup can detect a charset included in a META tag,
1486
try to convert the document to that charset, and re-parse the
1487
document from the beginning."""
1490
contentTypeIndex = None
1491
tagNeedsEncodingSubstitution = False
1493
for i in range(0, len(attrs)):
1494
key, value = attrs[i]
1496
if key == 'http-equiv':
1498
elif key == 'content':
1500
contentTypeIndex = i
1502
if httpEquiv and contentType: # It's an interesting meta tag.
1503
match = self.CHARSET_RE.search(contentType)
1505
if getattr(self, 'declaredHTMLEncoding') or \
1506
(self.originalEncoding == self.fromEncoding):
1507
# This is our second pass through the document, or
1508
# else an encoding was specified explicitly and it
1509
# worked. Rewrite the meta tag.
1510
newAttr = self.CHARSET_RE.sub\
1511
(lambda(match):match.group(1) +
1512
"%SOUP-ENCODING%", value)
1513
attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
1515
tagNeedsEncodingSubstitution = True
1517
# This is our first pass through the document.
1518
# Go through it again with the new information.
1519
newCharset = match.group(3)
1520
if newCharset and newCharset != self.originalEncoding:
1521
self.declaredHTMLEncoding = newCharset
1522
self._feed(self.declaredHTMLEncoding)
1524
tag = self.unknown_starttag("meta", attrs)
1525
if tag and tagNeedsEncodingSubstitution:
1526
tag.containsSubstitutions = True
1528
class StopParsing(Exception):
1531
class ICantBelieveItsBeautifulSoup(BeautifulSoup):
1533
"""The BeautifulSoup class is oriented towards skipping over
1534
common HTML errors like unclosed tags. However, sometimes it makes
1535
errors of its own. For instance, consider this fragment:
1537
<b>Foo<b>Bar</b></b>
1539
This is perfectly valid (if bizarre) HTML. However, the
1540
BeautifulSoup class will implicitly close the first b tag when it
1541
encounters the second 'b'. It will think the author wrote
1542
"<b>Foo<b>Bar", and didn't close the first 'b' tag, because
1543
there's no real-world reason to bold something that's already
1544
bold. When it encounters '</b></b>' it will close two more 'b'
1545
tags, for a grand total of three tags closed instead of two. This
1546
can throw off the rest of your document structure. The same is
1547
true of a number of other tags, listed below.
1549
It's much more common for someone to forget to close a 'b' tag
1550
than to actually use nested 'b' tags, and the BeautifulSoup class
1551
handles the common case. This class handles the not-co-common
1552
case: where you can't believe someone wrote what they did, but
1553
it's valid HTML and BeautifulSoup screwed up by assuming it
1556
I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
1557
['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
1558
'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
1561
I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript']
1563
NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
1564
I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
1565
I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
1567
class MinimalSoup(BeautifulSoup):
1568
"""The MinimalSoup class is for parsing HTML that contains
1569
pathologically bad markup. It makes no assumptions about tag
1570
nesting, but it does know which tags are self-closing, that
1571
<script> tags contain Javascript and should not be parsed, that
1572
META tags may contain encoding information, and so on.
1574
This also makes it better for subclassing than BeautifulStoneSoup
1575
or BeautifulSoup."""
1577
RESET_NESTING_TAGS = buildTagMap('noscript')
1580
class BeautifulSOAP(BeautifulStoneSoup):
1581
"""This class will push a tag with only a single string child into
1582
the tag's parent as an attribute. The attribute's name is the tag
1583
name, and the value is the string child. An example should give
1584
the flavor of the change:
1586
<foo><bar>baz</bar></foo>
1588
<foo bar="baz"><bar>baz</bar></foo>
1590
You can then access fooTag['bar'] instead of fooTag.barTag.string.
1592
This is, of course, useful for scraping structures that tend to
1593
use subelements instead of attributes, such as SOAP messages. Note
1594
that it modifies its input, so don't print the modified version
1597
I'm not sure how many people really want to use this class; let me
1598
know if you do. Mainly I like the name."""
1601
if len(self.tagStack) > 1:
1602
tag = self.tagStack[-1]
1603
parent = self.tagStack[-2]
1604
parent._getAttrMap()
1605
if (isinstance(tag, Tag) and len(tag.contents) == 1 and
1606
isinstance(tag.contents[0], NavigableString) and
1607
not parent.attrMap.has_key(tag.name)):
1608
parent[tag.name] = tag.contents[0]
1609
BeautifulStoneSoup.popTag(self)
1611
#Enterprise class names! It has come to our attention that some people
1612
#think the names of the Beautiful Soup parser classes are too silly
1613
#and "unprofessional" for use in enterprise screen-scraping. We feel
1614
#your pain! For such-minded folk, the Beautiful Soup Consortium And
1615
#All-Night Kosher Bakery recommends renaming this file to
1616
#"RobustParser.py" (or, in cases of extreme enterprisiness,
1617
#"RobustParserBeanInterface.class") and using the following
1618
#enterprise-friendly class aliases:
1619
class RobustXMLParser(BeautifulStoneSoup):
1621
class RobustHTMLParser(BeautifulSoup):
1623
class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
1625
class RobustInsanelyWackAssHTMLParser(MinimalSoup):
1627
class SimplifyingSOAPParser(BeautifulSOAP):
1630
######################################################
1632
# Bonus library: Unicode, Dammit
1634
# This class forces XML data into a standard format (usually to UTF-8
1635
# or Unicode). It is heavily based on code from Mark Pilgrim's
1636
# Universal Feed Parser. It does not rewrite the XML or HTML to
1637
# reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi
1638
# (XML) and BeautifulSoup.start_meta (HTML).
1640
# Autodetects character encodings.
1641
# Download from http://chardet.feedparser.org/
1642
import calibre.ebooks.chardet as chardet
1644
class UnicodeDammit:
1645
"""A class for detecting the encoding of a *ML document and
1646
converting it to a Unicode string. If the source encoding is
1647
windows-1252, can replace MS smart quotes with their HTML or XML
1650
# This dictionary maps commonly seen values for "charset" in HTML
1651
# meta tags to the corresponding Python codec names. It only covers
1652
# values that aren't in Python's aliases and can't be determined
1653
# by the heuristics in find_codec.
1654
CHARSET_ALIASES = { "macintosh" : "mac-roman",
1655
"x-sjis" : "shift-jis" }
1657
def __init__(self, markup, overrideEncodings=[],
1658
smartQuotesTo='xml'):
1659
self.markup, documentEncoding, sniffedEncoding = \
1660
self._detectEncoding(markup)
1661
self.smartQuotesTo = smartQuotesTo
1662
self.triedEncodings = []
1664
if markup == '' or isinstance(markup, unicode):
1665
self.originalEncoding = None
1666
self.unicode = unicode(markup)
1670
for proposedEncoding in overrideEncodings:
1671
u = self._convertFrom(proposedEncoding)
1674
for proposedEncoding in (documentEncoding, sniffedEncoding):
1675
u = self._convertFrom(proposedEncoding)
1678
# If no luck and we have auto-detection library, try that:
1679
if not u and chardet and not isinstance(self.markup, unicode):
1680
u = self._convertFrom(chardet.detect(self.markup)['encoding'])
1682
# As a last resort, try utf-8 and windows-1252:
1684
for proposed_encoding in ("utf-8", "windows-1252"):
1685
u = self._convertFrom(proposed_encoding)
1688
if not u: self.originalEncoding = None
1690
def _subMSChar(self, orig):
1691
"""Changes a MS smart quote character to an XML or HTML
1693
sub = self.MS_CHARS.get(orig)
1694
if type(sub) == types.TupleType:
1695
if self.smartQuotesTo == 'xml':
1696
sub = '&#x%s;' % sub[1]
1698
sub = '&%s;' % sub[0]
1701
def _convertFrom(self, proposed):
1702
proposed = self.find_codec(proposed)
1703
if not proposed or proposed in self.triedEncodings:
1705
self.triedEncodings.append(proposed)
1706
markup = self.markup
1708
# Convert smart quotes to HTML if coming from an encoding
1709
# that might have them.
1710
if self.smartQuotesTo and proposed.lower() in("windows-1252",
1713
markup = re.compile("([\x80-\x9f])").sub \
1714
(lambda(x): self._subMSChar(x.group(1)),
1718
# print "Trying to convert document to %s" % proposed
1719
u = self._toUnicode(markup, proposed)
1721
self.originalEncoding = proposed
1722
except Exception, e:
1723
#print "That didn't work!"
1726
#print "Correct encoding: %s" % proposed
1729
def _toUnicode(self, data, encoding):
1730
'''Given a string and its encoding, decodes the string into Unicode.
1731
%encoding is a string recognized by encodings.aliases'''
1733
# strip Byte Order Mark (if present)
1734
if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
1735
and (data[2:4] != '\x00\x00'):
1736
encoding = 'utf-16be'
1738
elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
1739
and (data[2:4] != '\x00\x00'):
1740
encoding = 'utf-16le'
1742
elif data[:3] == '\xef\xbb\xbf':
1745
elif data[:4] == '\x00\x00\xfe\xff':
1746
encoding = 'utf-32be'
1748
elif data[:4] == '\xff\xfe\x00\x00':
1749
encoding = 'utf-32le'
1752
newdata = unicode(data, encoding)
1756
def _detectEncoding(self, xml_data):
1757
"""Given a document, tries to detect its XML encoding."""
1758
xml_encoding = sniffed_xml_encoding = None
1760
if xml_data[:4] == '\x4c\x6f\xa7\x94':
1762
xml_data = self._ebcdic_to_ascii(xml_data)
1764
# By Kovid commented out all the recoding to UTF-8 of UTF-16 and UTF-32
1765
# as this doesn't make sense and doesn't work for the test case
1766
# BeautifulSoup.UnicodeDammit(u'abcd'.encode('utf-16')).unicode
1767
elif xml_data[:4] == '\x00\x3c\x00\x3f':
1769
sniffed_xml_encoding = 'utf-16be'
1770
#xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
1771
elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
1772
and (xml_data[2:4] != '\x00\x00'):
1774
sniffed_xml_encoding = 'utf-16be'
1775
#xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
1776
elif xml_data[:4] == '\x3c\x00\x3f\x00':
1778
sniffed_xml_encoding = 'utf-16le'
1779
#xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
1780
elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
1781
(xml_data[2:4] != '\x00\x00'):
1783
sniffed_xml_encoding = 'utf-16le'
1784
#xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
1785
elif xml_data[:4] == '\x00\x00\x00\x3c':
1787
sniffed_xml_encoding = 'utf-32be'
1788
#xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
1789
elif xml_data[:4] == '\x3c\x00\x00\x00':
1791
sniffed_xml_encoding = 'utf-32le'
1792
#xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
1793
elif xml_data[:4] == '\x00\x00\xfe\xff':
1795
sniffed_xml_encoding = 'utf-32be'
1796
#xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
1797
elif xml_data[:4] == '\xff\xfe\x00\x00':
1799
sniffed_xml_encoding = 'utf-32le'
1800
#xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
1801
elif xml_data[:3] == '\xef\xbb\xbf':
1803
sniffed_xml_encoding = 'utf-8'
1804
#xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
1806
sniffed_xml_encoding = 'ascii'
1808
xml_encoding_match = re.compile \
1809
('^<\?.*encoding=[\'"](.*?)[\'"].*\?>')\
1811
if xml_encoding_match is None: # By Kovid to use the content-type header in HTML files
1812
xml_encoding_match = re.compile(r'<meta.*?content=[\'"].*?charset=(\S+).*?[\'"]', re.IGNORECASE).search(xml_data)
1814
xml_encoding_match = None
1815
if xml_encoding_match:
1816
xml_encoding = xml_encoding_match.groups()[0].lower()
1818
if sniffed_xml_encoding and \
1819
(xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
1820
'iso-10646-ucs-4', 'ucs-4', 'csucs4',
1821
'utf-16', 'utf-32', 'utf_16', 'utf_32',
1823
xml_encoding = sniffed_xml_encoding
1825
return xml_data, xml_encoding, sniffed_xml_encoding
1828
def find_codec(self, charset):
1829
return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
1830
or (charset and self._codec(charset.replace("-", ""))) \
1831
or (charset and self._codec(charset.replace("-", "_"))) \
1834
def _codec(self, charset):
1835
if not charset: return charset
1838
codecs.lookup(charset)
1840
except (LookupError, ValueError):
1844
EBCDIC_TO_ASCII_MAP = None
1845
def _ebcdic_to_ascii(self, s):
1847
if not c.EBCDIC_TO_ASCII_MAP:
1848
emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
1849
16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
1850
128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
1851
144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
1852
32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
1853
38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
1854
45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
1855
186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
1856
195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
1857
201,202,106,107,108,109,110,111,112,113,114,203,204,205,
1858
206,207,208,209,126,115,116,117,118,119,120,121,122,210,
1859
211,212,213,214,215,216,217,218,219,220,221,222,223,224,
1860
225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
1861
73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
1862
82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
1863
90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
1864
250,251,252,253,254,255)
1866
c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
1867
''.join(map(chr, range(256))), ''.join(map(chr, emap)))
1868
return s.translate(c.EBCDIC_TO_ASCII_MAP)
1870
MS_CHARS = { '\x80' : ('euro', '20AC'),
1872
'\x82' : ('sbquo', '201A'),
1873
'\x83' : ('fnof', '192'),
1874
'\x84' : ('bdquo', '201E'),
1875
'\x85' : ('hellip', '2026'),
1876
'\x86' : ('dagger', '2020'),
1877
'\x87' : ('Dagger', '2021'),
1878
'\x88' : ('circ', '2C6'),
1879
'\x89' : ('permil', '2030'),
1880
'\x8A' : ('Scaron', '160'),
1881
'\x8B' : ('lsaquo', '2039'),
1882
'\x8C' : ('OElig', '152'),
1884
'\x8E' : ('#x17D', '17D'),
1887
'\x91' : ('lsquo', '2018'),
1888
'\x92' : ('rsquo', '2019'),
1889
'\x93' : ('ldquo', '201C'),
1890
'\x94' : ('rdquo', '201D'),
1891
'\x95' : ('bull', '2022'),
1892
'\x96' : ('ndash', '2013'),
1893
'\x97' : ('mdash', '2014'),
1894
'\x98' : ('tilde', '2DC'),
1895
'\x99' : ('trade', '2122'),
1896
'\x9a' : ('scaron', '161'),
1897
'\x9b' : ('rsaquo', '203A'),
1898
'\x9c' : ('oelig', '153'),
1900
'\x9e' : ('#x17E', '17E'),
1901
'\x9f' : ('Yuml', ''),}
1903
#######################################################################
1906
#By default, act as an HTML pretty-printer.
1907
if __name__ == '__main__':
1909
soup = BeautifulSoup(sys.stdin.read())
1910
print soup.prettify()