5
from bs4.dammit import EntitySubstitution
7
# Default value for the ``encoding`` / ``eventual_encoding`` parameters of
# the rendering methods in this module.
DEFAULT_OUTPUT_ENCODING = "utf-8"

# True when the running interpreter's major version is greater than 2.
PY3K = (sys.version_info[0] > 2)

# Matches a run of one or more whitespace characters. Written as a raw
# string so that the backslash reaches the regex engine untouched instead
# of depending on Python passing an unrecognised string escape through.
whitespace_re = re.compile(r"\s+")
13
"""Alias one attribute name to another for backward compatibility"""
16
return getattr(self, attr)
20
return setattr(self, attr)
24
class NamespacedAttribute(unicode):
26
def __new__(cls, prefix, name, namespace=None):
28
obj = unicode.__new__(cls, prefix)
30
# Not really namespaced.
31
obj = unicode.__new__(cls, name)
33
obj = unicode.__new__(cls, prefix + ":" + name)
36
obj.namespace = namespace
39
class AttributeValueWithCharsetSubstitution(unicode):
40
"""A stand-in object for a character encoding specified in HTML."""
42
class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
43
"""A generic stand-in for the value of a meta tag's 'charset' attribute.
45
When Beautiful Soup parses the markup '<meta charset="utf8">', the
46
value of the 'charset' attribute will be one of these objects.
49
def __new__(cls, original_value):
50
obj = unicode.__new__(cls, original_value)
51
obj.original_value = original_value
54
def encode(self, encoding):
58
class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
59
"""A generic stand-in for the value of a meta tag's 'content' attribute.
61
When Beautiful Soup parses the markup:
62
<meta http-equiv="content-type" content="text/html; charset=utf8">
64
The value of the 'content' attribute will be one of these objects.
67
CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)
69
def __new__(cls, original_value):
70
match = cls.CHARSET_RE.search(original_value)
72
# No substitution necessary.
73
return unicode.__new__(unicode, original_value)
75
obj = unicode.__new__(cls, original_value)
76
obj.original_value = original_value
79
def encode(self, encoding):
81
return match.group(1) + encoding
82
return self.CHARSET_RE.sub(rewrite, self.original_value)
84
class HTMLAwareEntitySubstitution(EntitySubstitution):
86
"""Entity substitution rules that are aware of some HTML quirks.
88
Specifically, the contents of <script> and <style> tags should not
89
undergo entity substitution.
91
Incoming NavigableString objects are checked to see if they're the
92
direct children of a <script> or <style> tag.
95
cdata_containing_tags = set(["script", "style"])
97
preformatted_tags = set(["pre"])
100
def _substitute_if_appropriate(cls, ns, f):
101
if (isinstance(ns, NavigableString)
102
and ns.parent is not None
103
and ns.parent.name in cls.cdata_containing_tags):
110
def substitute_html(cls, ns):
111
return cls._substitute_if_appropriate(
112
ns, EntitySubstitution.substitute_html)
115
def substitute_xml(cls, ns):
116
return cls._substitute_if_appropriate(
117
ns, EntitySubstitution.substitute_xml)
119
class PageElement(object):
120
"""Contains the navigational information for some part of the page
121
(either a tag or a piece of text)"""
123
# There are five possible values for the "formatter" argument passed in
124
# to methods like encode() and prettify():
126
# "html" - All Unicode characters with corresponding HTML entities
127
# are converted to those entities on output.
128
# "minimal" - Bare ampersands and angle brackets are converted to
129
# XML entities: & < >
130
# None - The null formatter. Unicode characters are never
131
# converted to entities. This is not recommended, but it's
132
# faster than "minimal".
133
# A function - This function will be called on every string that
134
# needs to undergo entity substitution.
137
# In an HTML document, the default "html" and "minimal" functions
138
# will leave the contents of <script> and <style> tags alone. For
139
# an XML document, all tags will be given the same treatment.
142
"html" : HTMLAwareEntitySubstitution.substitute_html,
143
"minimal" : HTMLAwareEntitySubstitution.substitute_xml,
148
"html" : EntitySubstitution.substitute_html,
149
"minimal" : EntitySubstitution.substitute_xml,
153
def format_string(self, s, formatter='minimal'):
154
"""Format the given string using the given formatter."""
155
if not callable(formatter):
156
formatter = self._formatter_for_name(formatter)
157
if formatter is None:
160
output = formatter(s)
165
"""Is this element part of an XML tree or an HTML tree?
167
This is used when mapping a formatter name ("minimal") to an
168
appropriate function (one that performs entity-substitution on
169
the contents of <script> and <style> tags, or not). It's
170
inefficient, but it should be called very rarely.
172
if self.parent is None:
173
# This is the top-level object. It should have .is_xml set
174
# from tree creation. If not, take a guess--BS is usually
175
# used on HTML markup.
176
return getattr(self, 'is_xml', False)
177
return self.parent._is_xml
179
def _formatter_for_name(self, name):
180
"Look up a formatter function based on its name and the tree."
182
return self.XML_FORMATTERS.get(
183
name, EntitySubstitution.substitute_xml)
185
return self.HTML_FORMATTERS.get(
186
name, HTMLAwareEntitySubstitution.substitute_xml)
188
def setup(self, parent=None, previous_element=None):
189
"""Sets up the initial relations between this element and
192
self.previous_element = previous_element
193
if previous_element is not None:
194
self.previous_element.next_element = self
195
self.next_element = None
196
self.previous_sibling = None
197
self.next_sibling = None
198
if self.parent is not None and self.parent.contents:
199
self.previous_sibling = self.parent.contents[-1]
200
self.previous_sibling.next_sibling = self
202
nextSibling = _alias("next_sibling") # BS3
203
previousSibling = _alias("previous_sibling") # BS3
205
def replace_with(self, replace_with):
206
if replace_with is self:
208
if replace_with is self.parent:
209
raise ValueError("Cannot replace a Tag with its parent.")
210
old_parent = self.parent
211
my_index = self.parent.index(self)
213
old_parent.insert(my_index, replace_with)
215
replaceWith = replace_with # BS3
218
my_parent = self.parent
219
my_index = self.parent.index(self)
221
for child in reversed(self.contents[:]):
222
my_parent.insert(my_index, child)
224
replace_with_children = unwrap
225
replaceWithChildren = unwrap # BS3
227
def wrap(self, wrap_inside):
228
me = self.replace_with(wrap_inside)
229
wrap_inside.append(me)
233
"""Destructively rips this element out of the tree."""
234
if self.parent is not None:
235
del self.parent.contents[self.parent.index(self)]
237
#Find the two elements that would be next to each other if
238
#this element (and any children) hadn't been parsed. Connect
240
last_child = self._last_descendant()
241
next_element = last_child.next_element
243
if self.previous_element is not None:
244
self.previous_element.next_element = next_element
245
if next_element is not None:
246
next_element.previous_element = self.previous_element
247
self.previous_element = None
248
last_child.next_element = None
251
if self.previous_sibling is not None:
252
self.previous_sibling.next_sibling = self.next_sibling
253
if self.next_sibling is not None:
254
self.next_sibling.previous_sibling = self.previous_sibling
255
self.previous_sibling = self.next_sibling = None
258
def _last_descendant(self, is_initialized=True, accept_self=True):
259
"Finds the last element beneath this object to be parsed."
260
if is_initialized and self.next_sibling:
261
last_child = self.next_sibling.previous_element
264
while isinstance(last_child, Tag) and last_child.contents:
265
last_child = last_child.contents[-1]
266
if not accept_self and last_child == self:
269
# BS3: Not part of the API!
270
_lastRecursiveChild = _last_descendant
272
def insert(self, position, new_child):
273
if new_child is self:
274
raise ValueError("Cannot insert a tag into itself.")
275
if (isinstance(new_child, basestring)
276
and not isinstance(new_child, NavigableString)):
277
new_child = NavigableString(new_child)
279
position = min(position, len(self.contents))
280
if hasattr(new_child, 'parent') and new_child.parent is not None:
281
# We're 'inserting' an element that's already one
282
# of this object's children.
283
if new_child.parent is self:
284
current_index = self.index(new_child)
285
if current_index < position:
286
# We're moving this element further down the list
287
# of this object's children. That means that when
288
# we extract this element, our target index will
293
new_child.parent = self
294
previous_child = None
296
new_child.previous_sibling = None
297
new_child.previous_element = self
299
previous_child = self.contents[position - 1]
300
new_child.previous_sibling = previous_child
301
new_child.previous_sibling.next_sibling = new_child
302
new_child.previous_element = previous_child._last_descendant(False)
303
if new_child.previous_element is not None:
304
new_child.previous_element.next_element = new_child
306
new_childs_last_element = new_child._last_descendant(False)
308
if position >= len(self.contents):
309
new_child.next_sibling = None
312
parents_next_sibling = None
313
while parents_next_sibling is None and parent is not None:
314
parents_next_sibling = parent.next_sibling
315
parent = parent.parent
316
if parents_next_sibling is not None:
317
# We found the element that comes next in the document.
319
if parents_next_sibling is not None:
320
new_childs_last_element.next_element = parents_next_sibling
322
# The last element of this tag is the last element in
324
new_childs_last_element.next_element = None
326
next_child = self.contents[position]
327
new_child.next_sibling = next_child
328
if new_child.next_sibling is not None:
329
new_child.next_sibling.previous_sibling = new_child
330
new_childs_last_element.next_element = next_child
332
if new_childs_last_element.next_element is not None:
333
new_childs_last_element.next_element.previous_element = new_childs_last_element
334
self.contents.insert(position, new_child)
336
def append(self, tag):
    """Append ``tag`` as the last entry of this element's contents.

    Delegates to ``self.insert``, using the current length of
    ``self.contents`` as the insertion position.

    :param tag: the object to add; it is handed to ``insert`` unchanged.
    """
    self.insert(len(self.contents), tag)
340
def insert_before(self, predecessor):
341
"""Makes the given element the immediate predecessor of this one.
343
The two elements will have the same parent, and the given element
344
will be immediately before this one.
346
if self is predecessor:
347
raise ValueError("Can't insert an element before itself.")
351
"Element has no parent, so 'before' has no meaning.")
352
# Extract first so that the index won't be screwed up if they
354
if isinstance(predecessor, PageElement):
355
predecessor.extract()
356
index = parent.index(self)
357
parent.insert(index, predecessor)
359
def insert_after(self, successor):
360
"""Makes the given element the immediate successor of this one.
362
The two elements will have the same parent, and the given element
363
will be immediately after this one.
365
if self is successor:
366
raise ValueError("Can't insert an element after itself.")
370
"Element has no parent, so 'after' has no meaning.")
371
# Extract first so that the index won't be screwed up if they
373
if isinstance(successor, PageElement):
375
index = parent.index(self)
376
parent.insert(index+1, successor)
378
def find_next(self, name=None, attrs={}, text=None, **kwargs):
    """Return the first item that matches the given criteria and
    appears after this Tag in the document.

    The bound ``find_all_next`` method and the criteria are handed to
    ``self._find_one``; whatever that returns is returned unchanged.

    :param name: name criterion, forwarded as-is.
    :param attrs: attribute criteria, forwarded as-is.
    :param text: text criterion, forwarded as-is.
    :param kwargs: extra keyword criteria, forwarded as-is.
    """
    # NOTE: the ``{}`` default is part of the public signature and is only
    # forwarded here, never mutated.
    return self._find_one(self.find_all_next, name, attrs, text, **kwargs)


findNext = find_next  # BS3
384
def find_all_next(self, name=None, attrs={}, text=None, limit=None,
386
"""Returns all items that match the given criteria and appear
387
after this Tag in the document."""
388
return self._find_all(name, attrs, text, limit, self.next_elements,
390
findAllNext = find_all_next # BS3
392
def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs):
393
"""Returns the closest sibling to this Tag that matches the
394
given criteria and appears after this Tag in the document."""
395
return self._find_one(self.find_next_siblings, name, attrs, text,
397
findNextSibling = find_next_sibling # BS3
399
def find_next_siblings(self, name=None, attrs={}, text=None, limit=None,
401
"""Returns the siblings of this Tag that match the given
402
criteria and appear after this Tag in the document."""
403
return self._find_all(name, attrs, text, limit,
404
self.next_siblings, **kwargs)
405
findNextSiblings = find_next_siblings # BS3
406
fetchNextSiblings = find_next_siblings # BS2
408
def find_previous(self, name=None, attrs={}, text=None, **kwargs):
    """Return the first item that matches the given criteria and
    appears before this Tag in the document.

    The bound ``find_all_previous`` method and the criteria are handed to
    ``self._find_one``; whatever that returns is returned unchanged.

    :param name: name criterion, forwarded as-is.
    :param attrs: attribute criteria, forwarded as-is.
    :param text: text criterion, forwarded as-is.
    :param kwargs: extra keyword criteria, forwarded as-is.
    """
    # NOTE: the ``{}`` default is part of the public signature and is only
    # forwarded here, never mutated.
    return self._find_one(
        self.find_all_previous, name, attrs, text, **kwargs)


findPrevious = find_previous  # BS3
415
def find_all_previous(self, name=None, attrs={}, text=None, limit=None,
417
"""Returns all items that match the given criteria and appear
418
before this Tag in the document."""
419
return self._find_all(name, attrs, text, limit, self.previous_elements,
421
findAllPrevious = find_all_previous # BS3
422
fetchPrevious = find_all_previous # BS2
424
def find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs):
425
"""Returns the closest sibling to this Tag that matches the
426
given criteria and appears before this Tag in the document."""
427
return self._find_one(self.find_previous_siblings, name, attrs, text,
429
findPreviousSibling = find_previous_sibling # BS3
431
def find_previous_siblings(self, name=None, attrs={}, text=None,
                           limit=None, **kwargs):
    """Return the siblings of this Tag that match the given
    criteria and appear before this Tag in the document.

    The criteria are handed to ``self._find_all`` together with
    ``self.previous_siblings`` as the source of candidates; whatever
    that returns is returned unchanged.

    :param name: name criterion, forwarded as-is.
    :param attrs: attribute criteria, forwarded as-is.
    :param text: text criterion, forwarded as-is.
    :param limit: result limit, forwarded as-is.
    :param kwargs: extra keyword criteria, forwarded as-is.
    """
    # NOTE: the ``{}`` default is part of the public signature and is only
    # forwarded here, never mutated.
    return self._find_all(name, attrs, text, limit,
                          self.previous_siblings, **kwargs)


findPreviousSiblings = find_previous_siblings  # BS3
fetchPreviousSiblings = find_previous_siblings  # BS2
440
def find_parent(self, name=None, attrs={}, **kwargs):
441
"""Returns the closest parent of this Tag that matches the given
443
# NOTE: We can't use _find_one because findParents takes a different
446
l = self.find_parents(name, attrs, 1, **kwargs)
450
findParent = find_parent # BS3
452
def find_parents(self, name=None, attrs={}, limit=None, **kwargs):
453
"""Returns the parents of this Tag that match the given
456
return self._find_all(name, attrs, None, limit, self.parents,
458
findParents = find_parents # BS3
459
fetchParents = find_parents # BS2
463
return self.next_element
467
return self.previous_element
469
#These methods do the real heavy lifting.
471
def _find_one(self, method, name, attrs, text, **kwargs):
473
l = method(name, attrs, text, 1, **kwargs)
478
def _find_all(self, name, attrs, text, limit, generator, **kwargs):
479
"Iterates over a generator looking for things that match."
481
if isinstance(name, SoupStrainer):
484
strainer = SoupStrainer(name, attrs, text, **kwargs)
486
if text is None and not limit and not attrs and not kwargs:
487
if name is True or name is None:
488
# Optimization to find all tags.
489
result = (element for element in generator
490
if isinstance(element, Tag))
491
return ResultSet(strainer, result)
492
elif isinstance(name, basestring):
493
# Optimization to find all tags with a given name.
494
result = (element for element in generator
495
if isinstance(element, Tag)
496
and element.name == name)
497
return ResultSet(strainer, result)
498
results = ResultSet(strainer)
502
except StopIteration:
505
found = strainer.search(i)
507
results.append(found)
508
if limit and len(results) >= limit:
512
#These generators can be used to navigate starting from both
513
#NavigableStrings and Tags.
515
def next_elements(self):
516
i = self.next_element
522
def next_siblings(self):
523
i = self.next_sibling
529
def previous_elements(self):
530
i = self.previous_element
533
i = i.previous_element
536
def previous_siblings(self):
537
i = self.previous_sibling
540
i = i.previous_sibling
549
# Methods for supporting CSS selectors.
551
tag_name_re = re.compile('^[a-z0-9]+$')
553
# /^(\w+)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/
554
# \---/ \---/\-------------/ \-------/
560
attribselect_re = re.compile(
561
r'^(?P<tag>\w+)?\[(?P<attribute>\w+)(?P<operator>[=~\|\^\$\*]?)' +
562
r'=?"?(?P<value>[^\]"]*)"?\]$'
565
def _attr_value_as_string(self, value, default=None):
566
"""Force an attribute value into a string representation.
568
A multi-valued attribute will be converted into a
569
space-separated string.
571
value = self.get(value, default)
572
if isinstance(value, list) or isinstance(value, tuple):
573
value =" ".join(value)
576
def _tag_name_matches_and(self, function, tag_name):
581
return tag.name == tag_name and function(tag)
584
def _attribute_checker(self, operator, attribute, value=''):
585
"""Create a function that performs a CSS selector operation.
587
Takes an operator, attribute and optional value. Returns a
588
function that will return True for elements that match that
592
# string representation of `attribute` is equal to `value`
593
return lambda el: el._attr_value_as_string(attribute) == value
594
elif operator == '~':
595
# space-separated list representation of `attribute`
597
def _includes_value(element):
598
attribute_value = element.get(attribute, [])
599
if not isinstance(attribute_value, list):
600
attribute_value = attribute_value.split()
601
return value in attribute_value
602
return _includes_value
603
elif operator == '^':
604
# string representation of `attribute` starts with `value`
605
return lambda el: el._attr_value_as_string(
606
attribute, '').startswith(value)
607
elif operator == '$':
608
# string representation of `attribute` ends with `value`
609
return lambda el: el._attr_value_as_string(
610
attribute, '').endswith(value)
611
elif operator == '*':
612
# string representation of `attribute` contains `value`
613
return lambda el: value in el._attr_value_as_string(attribute, '')
614
elif operator == '|':
615
# string representation of `attribute` is either exactly
616
# `value` or starts with `value` and then a dash.
617
def _is_or_starts_with_dash(element):
618
attribute_value = element._attr_value_as_string(attribute, '')
619
return (attribute_value == value or attribute_value.startswith(
621
return _is_or_starts_with_dash
623
return lambda el: el.has_attr(attribute)
625
# Old non-property versions of the generators, for backwards
626
# compatibility with BS3.
627
def nextGenerator(self):
    """Method-call spelling of ``next_elements``: returns that attribute."""
    return self.next_elements
630
def nextSiblingGenerator(self):
    """Method-call spelling of ``next_siblings``: returns that attribute."""
    return self.next_siblings
633
def previousGenerator(self):
    """Method-call spelling of ``previous_elements``: returns that attribute."""
    return self.previous_elements
636
def previousSiblingGenerator(self):
    """Method-call spelling of ``previous_siblings``: returns that attribute."""
    return self.previous_siblings
639
def parentGenerator(self):
643
class NavigableString(unicode, PageElement):
648
def __new__(cls, value):
649
"""Create a new NavigableString.
651
When unpickling a NavigableString, this method is called with
652
the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
653
passed in to the superclass's __new__ or the superclass won't know
654
how to handle non-ASCII characters.
656
if isinstance(value, unicode):
657
return unicode.__new__(cls, value)
658
return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
663
def __getnewargs__(self):
    """Return the arguments to pass to ``__new__`` when unpickling.

    The result is a one-element tuple holding ``unicode(self)``.
    """
    return (unicode(self),)
666
def __getattr__(self, attr):
667
"""text.string gives you text. This is for backwards
668
compatibility for Navigable*String, but for CData* it lets you
669
get the string without the CData wrapper."""
673
raise AttributeError(
674
"'%s' object has no attribute '%s'" % (
675
self.__class__.__name__, attr))
677
def output_ready(self, formatter="minimal"):
    """Return this string formatted and wrapped for output.

    The string is run through ``self.format_string`` with the given
    formatter, and the result is placed between ``self.PREFIX`` and
    ``self.SUFFIX``.

    :param formatter: forwarded to ``format_string`` unchanged.
    """
    output = self.format_string(self, formatter)
    return self.PREFIX + output + self.SUFFIX
686
def name(self, name):
687
raise AttributeError("A NavigableString cannot be given a name.")
689
class PreformattedString(NavigableString):
690
"""A NavigableString not subject to the normal formatting rules.
692
The string will be passed into the formatter (to trigger side effects),
693
but the return value will be ignored.
696
def output_ready(self, formatter="minimal"):
    """Return this string, unformatted, between PREFIX and SUFFIX.

    The string is still passed to ``self.format_string`` with the given
    formatter, but the value that call returns is discarded; the output
    is built from the string itself.

    :param formatter: forwarded to ``format_string`` unchanged.
    """
    # Called for its side effects only; the result is deliberately unused.
    self.format_string(self, formatter)
    return self.PREFIX + self + self.SUFFIX
702
class CData(PreformattedString):
704
PREFIX = u'<![CDATA['
707
class ProcessingInstruction(PreformattedString):
712
class Comment(PreformattedString):
718
class Declaration(PreformattedString):
723
class Doctype(PreformattedString):
726
def for_name_and_ids(cls, name, pub_id, system_id):
728
if pub_id is not None:
729
value += ' PUBLIC "%s"' % pub_id
730
if system_id is not None:
731
value += ' "%s"' % system_id
732
elif system_id is not None:
733
value += ' SYSTEM "%s"' % system_id
735
return Doctype(value)
737
PREFIX = u'<!DOCTYPE '
741
class Tag(PageElement):
743
"""Represents a found HTML tag with its attributes and contents."""
745
def __init__(self, parser=None, builder=None, name=None, namespace=None,
746
prefix=None, attrs=None, parent=None, previous=None):
750
self.parser_class = None
752
# We don't actually store the parser object: that lets extracted
753
# chunks be garbage-collected.
754
self.parser_class = parser.__class__
756
raise ValueError("No value provided for new tag's name.")
758
self.namespace = namespace
762
elif attrs and builder.cdata_list_attributes:
763
attrs = builder._replace_cdata_list_attribute_values(
769
self.setup(parent, previous)
772
# Set up any substitutions, such as the charset in a META tag.
773
if builder is not None:
774
builder.set_up_substitutions(self)
775
self.can_be_empty_element = builder.can_be_empty_element(name)
777
self.can_be_empty_element = False
779
parserClass = _alias("parser_class") # BS3
782
def is_empty_element(self):
783
"""Is this tag an empty-element tag? (aka a self-closing tag)
785
A tag that has contents is never an empty-element tag.
787
A tag that has no contents may or may not be an empty-element
788
tag. It depends on the builder used to create the tag. If the
789
builder has a designated list of empty-element tags, then only
790
a tag whose name shows up in that list is considered an
793
If the builder has no designated list of empty-element tags,
794
then any tag with no contents is an empty-element tag.
796
return len(self.contents) == 0 and self.can_be_empty_element
797
isSelfClosing = is_empty_element # BS3
801
"""Convenience property to get the single string within this tag.
803
:Return: If this tag has a single string child, return value
804
is that string. If this tag has no children, or more than one
805
child, return value is None. If this tag has one child tag,
806
return value is the 'string' attribute of the child tag,
809
if len(self.contents) != 1:
811
child = self.contents[0]
812
if isinstance(child, NavigableString):
817
def string(self, string):
819
self.append(string.__class__(string))
821
def _all_strings(self, strip=False, types=(NavigableString, CData)):
822
"""Yield all strings of certain classes, possibly stripping them.
824
By default, yields only NavigableString and CData objects. So
825
no comments, processing instructions, etc.
827
for descendant in self.descendants:
829
(types is None and not isinstance(descendant, NavigableString))
831
(types is not None and type(descendant) not in types)):
834
descendant = descendant.strip()
835
if len(descendant) == 0:
839
strings = property(_all_strings)
842
def stripped_strings(self):
843
for string in self._all_strings(True):
846
def get_text(self, separator=u"", strip=False,
847
types=(NavigableString, CData)):
849
Get all child strings, concatenated using the given separator.
851
return separator.join([s for s in self._all_strings(
852
strip, types=types)])
854
text = property(get_text)
857
"""Recursively destroys the contents of this tree."""
861
next = i.next_element
866
def clear(self, decompose=False):
868
Extract all children. If decompose is True, decompose instead.
871
for element in self.contents[:]:
872
if isinstance(element, Tag):
877
for element in self.contents[:]:
880
def index(self, element):
882
Find the index of a child by identity, not value. Avoids issues with
883
tag.contents.index(element) getting the index of equal elements.
885
for i, child in enumerate(self.contents):
888
raise ValueError("Tag.index: element not in tag")
890
def get(self, key, default=None):
891
"""Returns the value of the 'key' attribute for the tag, or
892
the value given for 'default' if it doesn't have that
894
return self.attrs.get(key, default)
896
def has_attr(self, key):
    """Return whether ``key`` is present in ``self.attrs``."""
    return key in self.attrs
900
return str(self).__hash__()
902
def __getitem__(self, key):
    """tag[key] returns the value of the 'key' attribute for the tag,
    and throws an exception if it's not there.

    The lookup is a plain subscript of ``self.attrs``, so a missing key
    raises whatever that mapping raises.
    """
    return self.attrs[key]
908
"Iterating over a tag iterates over its contents."
909
return iter(self.contents)
912
"The length of a tag is the length of its list of contents."
913
return len(self.contents)
915
def __contains__(self, x):
    """Return the result of the membership test ``x in self.contents``."""
    return x in self.contents
918
def __nonzero__(self):
919
"A tag is non-None even if it has no contents."
922
def __setitem__(self, key, value):
923
"""Setting tag[key] sets the value of the 'key' attribute for the
925
self.attrs[key] = value
927
def __delitem__(self, key):
    """Deleting tag[key] deletes all 'key' attributes for the tag.

    The key is popped from ``self.attrs`` with a default, so deleting an
    attribute that is not present does not raise.
    """
    self.attrs.pop(key, None)
931
def __call__(self, *args, **kwargs):
    """Calling a tag like a function is the same as calling its
    find_all() method. Eg. tag('a') returns a list of all the A tags
    found within this tag.

    All positional and keyword arguments are forwarded unchanged, and
    the value ``find_all`` returns is returned unchanged.
    """
    return self.find_all(*args, **kwargs)
937
def __getattr__(self, tag):
938
#print "Getattr %s.%s" % (self.__class__, tag)
939
if len(tag) > 3 and tag.endswith('Tag'):
940
# BS3: soup.aTag -> "soup.find("a")
943
'.%sTag is deprecated, use .find("%s") instead.' % (
945
return self.find(tag_name)
946
# We special case contents to avoid recursion.
947
elif not tag.startswith("__") and not tag=="contents":
948
return self.find(tag)
949
raise AttributeError(
950
"'%s' object has no attribute '%s'" % (self.__class__, tag))
952
def __eq__(self, other):
953
"""Returns true iff this tag has the same name, the same attributes,
954
and the same contents (recursively) as the given tag."""
957
if (not hasattr(other, 'name') or
958
not hasattr(other, 'attrs') or
959
not hasattr(other, 'contents') or
960
self.name != other.name or
961
self.attrs != other.attrs or
962
len(self) != len(other)):
964
for i, my_child in enumerate(self.contents):
965
if my_child != other.contents[i]:
969
def __ne__(self, other):
    """Returns true iff this tag is not identical to the other tag,
    as defined in __eq__.

    Implemented as the negation of ``self == other``.
    """
    return not self == other
974
def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
    """Renders this tag as a string.

    :param encoding: passed to ``self.encode``; defaults to
        ``DEFAULT_OUTPUT_ENCODING``.
    :return: whatever ``self.encode(encoding)`` returns.
    """
    return self.encode(encoding)
978
def __unicode__(self):
985
__str__ = __repr__ = __unicode__
987
def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
           indent_level=None, formatter="minimal",
           errors="xmlcharrefreplace"):
    """Render this element and encode the result.

    :param encoding: codec name. It is passed to ``self.decode`` as the
        eventual encoding and then used for the final ``.encode`` call.
    :param indent_level: forwarded to ``self.decode`` unchanged.
    :param formatter: forwarded to ``self.decode`` unchanged.
    :param errors: error-handler name for the final ``.encode`` call.
    :return: the encoded form of what ``self.decode`` produced.
    """
    # Turn the data structure into Unicode first, then encode that
    # Unicode text in a single step.
    u = self.decode(indent_level, encoding, formatter)
    return u.encode(encoding, errors)
995
def _should_pretty_print(self, indent_level):
996
"""Should this tag be pretty-printed?"""
998
indent_level is not None and
999
(self.name not in HTMLAwareEntitySubstitution.preformatted_tags
1002
def decode(self, indent_level=None,
1003
eventual_encoding=DEFAULT_OUTPUT_ENCODING,
1004
formatter="minimal"):
1005
"""Returns a Unicode representation of this tag and its contents.
1007
:param eventual_encoding: The tag is destined to be
1008
encoded into this encoding. This method is _not_
1009
responsible for performing that encoding. This information
1010
is passed in so that it can be substituted in if the
1011
document contains a <META> tag that mentions the document's
1015
# First off, turn a string formatter into a function. This
1016
# will stop the lookup from happening over and over again.
1017
if not callable(formatter):
1018
formatter = self._formatter_for_name(formatter)
1022
for key, val in sorted(self.attrs.items()):
1026
if isinstance(val, list) or isinstance(val, tuple):
1028
elif not isinstance(val, basestring):
1031
isinstance(val, AttributeValueWithCharsetSubstitution)
1032
and eventual_encoding is not None):
1033
val = val.encode(eventual_encoding)
1035
text = self.format_string(val, formatter)
1038
+ EntitySubstitution.quoted_attribute_value(text))
1039
attrs.append(decoded)
1045
prefix = self.prefix + ":"
1047
if self.is_empty_element:
1050
closeTag = '</%s%s>' % (prefix, self.name)
1052
pretty_print = self._should_pretty_print(indent_level)
1055
if indent_level is not None:
1056
indent_space = (' ' * (indent_level - 1))
1058
space = indent_space
1059
indent_contents = indent_level + 1
1061
indent_contents = None
1062
contents = self.decode_contents(
1063
indent_contents, eventual_encoding, formatter)
1066
# This is the 'document root' object.
1070
attribute_string = ''
1072
attribute_string = ' ' + ' '.join(attrs)
1073
if indent_level is not None:
1074
# Even if this particular tag is not pretty-printed,
1075
# we should indent up to the start of the tag.
1076
s.append(indent_space)
1077
s.append('<%s%s%s%s>' % (
1078
prefix, self.name, attribute_string, close))
1082
if pretty_print and contents and contents[-1] != "\n":
1084
if pretty_print and closeTag:
1087
if indent_level is not None and closeTag and self.next_sibling:
1088
# Even if this particular tag is not pretty-printed,
1089
# we're now done with the tag, and we should add a
1090
# newline if appropriate.
1095
def prettify(self, encoding=None, formatter="minimal"):
1096
if encoding is None:
1097
return self.decode(True, formatter=formatter)
1099
return self.encode(encoding, True, formatter=formatter)
1101
def decode_contents(self, indent_level=None,
1102
eventual_encoding=DEFAULT_OUTPUT_ENCODING,
1103
formatter="minimal"):
1104
"""Renders the contents of this tag as a Unicode string.
1106
:param eventual_encoding: The tag is destined to be
1107
encoded into this encoding. This method is _not_
1108
responsible for performing that encoding. This information
1109
is passed in so that it can be substituted in if the
1110
document contains a <META> tag that mentions the document's
1113
# First off, turn a string formatter into a function. This
1114
# will stop the lookup from happening over and over again.
1115
if not callable(formatter):
1116
formatter = self._formatter_for_name(formatter)
1118
pretty_print = (indent_level is not None)
1122
if isinstance(c, NavigableString):
1123
text = c.output_ready(formatter)
1124
elif isinstance(c, Tag):
1125
s.append(c.decode(indent_level, eventual_encoding,
1127
if text and indent_level and not self.name == 'pre':
1130
if pretty_print and not self.name == 'pre':
1131
s.append(" " * (indent_level - 1))
1133
if pretty_print and not self.name == 'pre':
1137
def encode_contents(
        self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
        formatter="minimal"):
    """Renders the contents of this tag as a bytestring.

    :param indent_level: forwarded to ``self.decode_contents`` unchanged.
    :param encoding: codec name. It is passed to ``self.decode_contents``
        as the eventual encoding and then used to encode the result.
    :param formatter: forwarded to ``self.decode_contents`` unchanged.
    :return: the encoded form of what ``self.decode_contents`` produced.
    """
    # No error handler is given here, so the codec's default applies.
    contents = self.decode_contents(indent_level, encoding, formatter)
    return contents.encode(encoding)
1144
# Old method for BS3 compatibility
1145
def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
1146
prettyPrint=False, indentLevel=0):
1149
return self.encode_contents(
1150
indent_level=indentLevel, encoding=encoding)
1154
def find(self, name=None, attrs={}, recursive=True, text=None,
1156
"""Return only the first child of this Tag matching the given
1159
l = self.find_all(name, attrs, recursive, text, 1, **kwargs)
1165
def find_all(self, name=None, attrs={}, recursive=True, text=None,
1166
limit=None, **kwargs):
1167
"""Extracts a list of Tag objects that match the given
1168
criteria. You can specify the name of the Tag and any
1169
attributes you want the Tag to have.
1171
The value of a key-value pair in the 'attrs' map can be a
1172
string, a list of strings, a regular expression object, or a
1173
callable that takes a string and returns whether or not the
1174
string matches for some custom definition of 'matches'. The
1175
same is true of the tag name."""
1177
generator = self.descendants
1179
generator = self.children
1180
return self._find_all(name, attrs, text, limit, generator, **kwargs)
1181
findAll = find_all # BS3
1182
findChildren = find_all # BS2
1187
# return iter() to make the purpose of the method clear
1188
return iter(self.contents) # XXX This seems to be untested.
1191
def descendants(self):
1192
if not len(self.contents):
1194
stopNode = self._last_descendant().next_element
1195
current = self.contents[0]
1196
while current is not stopNode:
1198
current = current.next_element
1202
_selector_combinators = ['>', '+', '~']
1203
_select_debug = False
1204
def select(self, selector, _candidate_generator=None):
1205
"""Perform a CSS selection operation on the current element."""
1206
tokens = selector.split()
1207
current_context = [self]
1209
if tokens[-1] in self._selector_combinators:
1211
'Final combinator "%s" is missing an argument.' % tokens[-1])
1212
if self._select_debug:
1213
print 'Running CSS selector "%s"' % selector
1214
for index, token in enumerate(tokens):
1215
if self._select_debug:
1216
print ' Considering token "%s"' % token
1217
recursive_candidate_generator = None
1219
if tokens[index-1] in self._selector_combinators:
1220
# This token was consumed by the previous combinator. Skip it.
1221
if self._select_debug:
1222
print ' Token was consumed by the previous combinator.'
1224
# Each operation corresponds to a checker function, a rule
1225
# for determining whether a candidate matches the
1226
# selector. Candidates are generated by the active
1230
m = self.attribselect_re.match(token)
1232
# Attribute selector
1233
tag_name, attribute, operator, value = m.groups()
1234
checker = self._attribute_checker(operator, attribute, value)
1238
tag_name, tag_id = token.split('#', 1)
1239
def id_matches(tag):
1240
return tag.get('id', None) == tag_id
1241
checker = id_matches
1245
tag_name, klass = token.split('.', 1)
1246
classes = set(klass.split('.'))
1247
def classes_match(candidate):
1248
return classes.issubset(candidate.get('class', []))
1249
checker = classes_match
1253
tag_name, pseudo = token.split(':', 1)
1256
"A pseudo-class must be prefixed with a tag name.")
1257
pseudo_attributes = re.match('([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
1259
if pseudo_attributes is not None:
1260
pseudo_type, pseudo_value = pseudo_attributes.groups()
1261
if pseudo_type == 'nth-of-type':
1263
pseudo_value = int(pseudo_value)
1265
raise NotImplementedError(
1266
'Only numeric values are currently supported for the nth-of-type pseudo-class.')
1267
if pseudo_value < 1:
1269
'nth-of-type pseudo-class value must be at least 1.')
1270
class Counter(object):
1271
def __init__(self, destination):
1273
self.destination = destination
1275
def nth_child_of_type(self, tag):
1277
if self.count == self.destination:
1279
if self.count > self.destination:
1280
# Stop the generator that's sending us
1282
raise StopIteration()
1284
checker = Counter(pseudo_value).nth_child_of_type
1286
raise NotImplementedError(
1287
'Only the following pseudo-classes are implemented: nth-of-type.')
1290
# Star selector -- matches everything
1293
# Run the next token as a CSS selector against the
1294
# direct children of each tag in the current context.
1295
recursive_candidate_generator = lambda tag: tag.children
1297
# Run the next token as a CSS selector against the
1298
# siblings of each tag in the current context.
1299
recursive_candidate_generator = lambda tag: tag.next_siblings
1301
# For each tag in the current context, run the next
1302
# token as a CSS selector against the tag's next
1303
# sibling that's a tag.
1304
def next_tag_sibling(tag):
1305
yield tag.find_next_sibling(True)
1306
recursive_candidate_generator = next_tag_sibling
1308
elif self.tag_name_re.match(token):
1313
'Unsupported or invalid CSS selector: "%s"' % token)
1315
if recursive_candidate_generator:
1316
# This happens when the selector looks like "> foo".
1318
# The generator calls select() recursively on every
1319
# member of the current context, passing in a different
1320
# candidate generator and a different selector.
1322
# In the case of "> foo", the candidate generator is
1323
# one that yields a tag's direct children (">"), and
1324
# the selector is "foo".
1325
next_token = tokens[index+1]
1326
def recursive_select(tag):
1327
if self._select_debug:
1328
print ' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs)
1330
for i in tag.select(next_token, recursive_candidate_generator):
1331
if self._select_debug:
1332
print '(Recursive select picked up candidate %s %s)' % (i.name, i.attrs)
1334
if self._select_debug:
1336
_use_candidate_generator = recursive_select
1337
elif _candidate_generator is None:
1338
# By default, a tag's candidates are all of its
1339
# children. If tag_name is defined, only yield tags
1341
if self._select_debug:
1346
print ' Default candidate generator, tag name="%s"' % check
1347
if self._select_debug:
1348
# This is redundant with later code, but it stops
1349
# a bunch of bogus tags from cluttering up the
1351
def default_candidate_generator(tag):
1352
for child in tag.descendants:
1353
if not isinstance(child, Tag):
1355
if tag_name and not child.name == tag_name:
1358
_use_candidate_generator = default_candidate_generator
1360
_use_candidate_generator = lambda tag: tag.descendants
1362
_use_candidate_generator = _candidate_generator
1365
new_context_ids = set([])
1366
for tag in current_context:
1367
if self._select_debug:
1368
print " Running candidate generator on %s %s" % (
1369
tag.name, repr(tag.attrs))
1370
for candidate in _use_candidate_generator(tag):
1371
if not isinstance(candidate, Tag):
1373
if tag_name and candidate.name != tag_name:
1375
if checker is not None:
1377
result = checker(candidate)
1378
except StopIteration:
1379
# The checker has decided we should no longer
1380
# run the generator.
1382
if checker is None or result:
1383
if self._select_debug:
1384
print " SUCCESS %s %s" % (candidate.name, repr(candidate.attrs))
1385
if id(candidate) not in new_context_ids:
1386
# If a tag matches a selector more than once,
1387
# don't include it in the context more than once.
1388
new_context.append(candidate)
1389
new_context_ids.add(id(candidate))
1390
elif self._select_debug:
1391
print " FAILURE %s %s" % (candidate.name, repr(candidate.attrs))
1393
current_context = new_context
1395
if self._select_debug:
1396
print "Final verdict:"
1397
for i in current_context:
1398
print " %s %s" % (i.name, i.attrs)
1399
return current_context
1401
# Old names for backwards compatibility
1402
def childGenerator(self):
    """Old-style alias kept for backwards compatibility.

    :return: The value of ``self.children``, unchanged.
    """
    return self.children
1405
def recursiveChildGenerator(self):
    """Old-style alias kept for backwards compatibility.

    :return: The value of ``self.descendants``, unchanged.
    """
    return self.descendants
1408
def has_key(self, key):
    """Deprecated attribute check; emits a warning and defers to has_attr().

    This was kind of misleading because has_key() looked at attributes
    while the ``in`` operator looks at contents. has_key() is gone in
    Python 3, anyway.

    :param key: Attribute name, passed straight through to has_attr().
    :return: Whatever ``self.has_attr(key)`` returns.
    """
    # The 1-tuple keeps the %-formatting well defined even when `key`
    # is itself a tuple.
    warnings.warn(
        'has_key is deprecated. Use has_attr("%s") instead.' % (key,))
    return self.has_attr(key)
1416
# Next, a couple classes to represent queries and their results.
1417
class SoupStrainer(object):
1418
"""Encapsulates a number of ways of matching a markup element (tag or
1421
def __init__(self, name=None, attrs={}, text=None, **kwargs):
1422
self.name = self._normalize_search_value(name)
1423
if not isinstance(attrs, dict):
1424
# Treat a non-dict value for attrs as a search for the 'class'
1426
kwargs['class'] = attrs
1429
if 'class_' in kwargs:
1430
# Treat class_="foo" as a search for the 'class'
1431
# attribute, overriding any non-dict value for attrs.
1432
kwargs['class'] = kwargs['class_']
1433
del kwargs['class_']
1437
attrs = attrs.copy()
1438
attrs.update(kwargs)
1441
normalized_attrs = {}
1442
for key, value in attrs.items():
1443
normalized_attrs[key] = self._normalize_search_value(value)
1445
self.attrs = normalized_attrs
1446
self.text = self._normalize_search_value(text)
1448
def _normalize_search_value(self, value):
1449
# Leave it alone if it's a Unicode string, a callable, a
1450
# regular expression, a boolean, or None.
1451
if (isinstance(value, unicode) or callable(value) or hasattr(value, 'match')
1452
or isinstance(value, bool) or value is None):
1455
# If it's a bytestring, convert it to Unicode, treating it as UTF-8.
1456
if isinstance(value, bytes):
1457
return value.decode("utf8")
1459
# If it's listlike, convert it into a list of strings.
1460
if hasattr(value, '__iter__'):
1463
if (hasattr(v, '__iter__') and not isinstance(v, bytes)
1464
and not isinstance(v, unicode)):
1465
# This is almost certainly the user's mistake. In the
1466
# interests of avoiding infinite loops, we'll let
1467
# it through as-is rather than doing a recursive call.
1470
new_value.append(self._normalize_search_value(v))
1473
# Otherwise, convert it into a Unicode string.
1474
# The unicode(str()) thing is so this will do the same thing on Python 2
1476
return unicode(str(value))
1482
return "%s|%s" % (self.name, self.attrs)
1484
def search_tag(self, markup_name=None, markup_attrs={}):
1487
if isinstance(markup_name, Tag):
1488
markup = markup_name
1489
markup_attrs = markup
1490
call_function_with_tag_data = (
1491
isinstance(self.name, collections.Callable)
1492
and not isinstance(markup_name, Tag))
1495
or call_function_with_tag_data
1496
or (markup and self._matches(markup, self.name))
1497
or (not markup and self._matches(markup_name, self.name))):
1498
if call_function_with_tag_data:
1499
match = self.name(markup_name, markup_attrs)
1502
markup_attr_map = None
1503
for attr, match_against in list(self.attrs.items()):
1504
if not markup_attr_map:
1505
if hasattr(markup_attrs, 'get'):
1506
markup_attr_map = markup_attrs
1508
markup_attr_map = {}
1509
for k, v in markup_attrs:
1510
markup_attr_map[k] = v
1511
attr_value = markup_attr_map.get(attr)
1512
if not self._matches(attr_value, match_against):
1520
if found and self.text and not self._matches(found.string, self.text):
1523
searchTag = search_tag
1525
def search(self, markup):
1526
# print 'looking for %s in %s' % (self, markup)
1528
# If given a list of items, scan it for a text element that
1530
if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, basestring)):
1531
for element in markup:
1532
if isinstance(element, NavigableString) \
1533
and self.search(element):
1536
# If it's a Tag, make sure its name or attributes match.
1537
# Don't bother with Tags if we're searching for text.
1538
elif isinstance(markup, Tag):
1539
if not self.text or self.name or self.attrs:
1540
found = self.search_tag(markup)
1541
# If it's text, make sure the text matches.
1542
elif isinstance(markup, NavigableString) or \
1543
isinstance(markup, basestring):
1544
if not self.name and not self.attrs and self._matches(markup, self.text):
1548
"I don't know how to match against a %s" % markup.__class__)
1551
def _matches(self, markup, match_against):
1552
# print u"Matching %s against %s" % (markup, match_against)
1554
if isinstance(markup, list) or isinstance(markup, tuple):
1555
# This should only happen when searching a multi-valued attribute
1557
if (isinstance(match_against, unicode)
1558
and ' ' in match_against):
1559
# A bit of a special case. If they try to match "foo
1560
# bar" on a multivalue attribute's value, only accept
1561
# the literal value "foo bar"
1563
# XXX This is going to be pretty slow because we keep
1564
# splitting match_against. But it shouldn't come up
1566
return (whitespace_re.split(match_against) == markup)
1569
if self._matches(item, match_against):
1573
if match_against is True:
1574
# True matches any non-None value.
1575
return markup is not None
1577
if isinstance(match_against, collections.Callable):
1578
return match_against(markup)
1580
# Custom callables take the tag as an argument, but all
1581
# other ways of matching match the tag name as a string.
1582
if isinstance(markup, Tag):
1583
markup = markup.name
1585
# Ensure that `markup` is either a Unicode string, or None.
1586
markup = self._normalize_search_value(markup)
1589
# None matches None, False, an empty string, an empty list, and so on.
1590
return not match_against
1592
if isinstance(match_against, unicode):
1593
# Exact string match
1594
return markup == match_against
1596
if hasattr(match_against, 'match'):
1598
return match_against.search(markup)
1600
if hasattr(match_against, '__iter__'):
1601
# The markup must be an exact match against something
1603
return markup in match_against
1606
class ResultSet(list):
    """A ResultSet is just a list that keeps track of the SoupStrainer
    it was built from.

    The object passed as ``source`` is stored unchanged on the instance
    as the ``source`` attribute; ``result`` supplies the list items.
    """

    def __init__(self, source, result=()):
        """Fill the list from `result` and remember `source`.

        :param source: Stored as ``self.source``.
        :param result: Iterable of items used to initialise the list;
            defaults to an empty tuple, giving an empty list.
        """
        super(ResultSet, self).__init__(result)
        self.source = source