~ubuntu-branches/ubuntu/jaunty/beautifulsoup/jaunty : revision 9

11

structure. An ill-formed XML/HTML document yields a correspondingly

12

ill-formed data structure. If your document is only locally

13

well-formed, you can use this library to find and process the

14

well-formed part of it. The BeautifulSoup class

14

well-formed part of it.

15

16

Beautiful Soup works with Python 2.2 and up. It has no external

17

dependencies, but you'll have more success at converting data to UTF-8

24

http://cjkpython.i18n.org/

25

26

Beautiful Soup defines classes for two main parsing strategies:

27

28

* BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific

29

language that kind of looks like XML.

30

40

documentation:

41

http://www.crummy.com/software/BeautifulSoup/documentation.html

42

43

Here, have some legalese:

44

45

46

47

48

49

Redistribution and use in source and binary forms, with or without

50

modification, are permitted provided that the following conditions are

51

met:

52

53

* Redistributions of source code must retain the above copyright

54

notice, this list of conditions and the following disclaimer.

55

56

* Redistributions in binary form must reproduce the above

57

copyright notice, this list of conditions and the following

58

disclaimer in the documentation and/or other materials provided

59

with the distribution.

60

61

* Neither the name of the Beautiful Soup Consortium and All

62

Night Kosher Bakery nor the names of its contributors may be

63

used to endorse or promote products derived from this software

64

without specific prior written permission.

65

66

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

67

"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

68

LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR

69

A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR

70

CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,

71

EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,

72

PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR

73

PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF

74

LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING

75

NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS

76

SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.

77

43

78

"""

44

79

from __future__ import generators

45

80

46

81

__author__ = "Leonard Richardson (leonardr@segfault.org)"

47

__version__ = "3.0.4"

82

__version__ = "3.0.5"

48

83

49

__license__ = "PSF"

84

__license__ = "New-style BSD"

50

85

51

86

from sgmllib import SGMLParser, SGMLParseError

52

87

import codecs

71

106

72

107

def setup(self, parent=None, previous=None):

73

108

"""Sets up the initial relations between this element and

74

other elements."""

109

other elements."""

75

110

self.parent = parent

76

111

self.previous = previous

77

112

self.next = None

81

116

self.previousSibling = self.parent.contents[-1]

82

117

self.previousSibling.nextSibling = self

83

118

84

def replaceWith(self, replaceWith):

119

def replaceWith(self, replaceWith):

85

120

oldParent = self.parent

86

121

myIndex = self.parent.contents.index(self)

87

122

if hasattr(replaceWith, 'parent') and replaceWith.parent == self.parent:

92

127

# means that when we extract it, the index of this

93

128

# element will change.

94

129

myIndex = myIndex - 1

95

self.extract()

130

self.extract()

96

131

oldParent.insert(myIndex, replaceWith)

97

132

98

133

def extract(self):

99

"""Destructively rips this element out of the tree."""

134

"""Destructively rips this element out of the tree."""

100

135

if self.parent:

101

136

try:

102

137

self.parent.contents.remove(self)

105

140

106

141

#Find the two elements that would be next to each other if

107

142

#this element (and any children) hadn't been parsed. Connect

108

#the two.

143

#the two.

109

144

lastChild = self._lastRecursiveChild()

110

145

nextElement = lastChild.next

111

146

116

151

self.previous = None

117

152

lastChild.next = None

118

153

119

self.parent = None

154

self.parent = None

120

155

if self.previousSibling:

121

156

self.previousSibling.nextSibling = self.nextSibling

122

157

if self.nextSibling:

123

158

self.nextSibling.previousSibling = self.previousSibling

124

self.previousSibling = self.nextSibling = None

159

self.previousSibling = self.nextSibling = None

125

160

126

161

def _lastRecursiveChild(self):

127

162

"Finds the last element beneath this object to be parsed."

134

169

if (isinstance(newChild, basestring)

135

170

or isinstance(newChild, unicode)) \

136

171

and not isinstance(newChild, NavigableString):

137

newChild = NavigableString(newChild)

172

newChild = NavigableString(newChild)

138

173

139

174

position = min(position, len(self.contents))

140

175

if hasattr(newChild, 'parent') and newChild.parent != None:

141

176

# We're 'inserting' an element that's already one

142

# of this object's children.

177

# of this object's children.

143

178

if newChild.parent == self:

144

179

index = self.find(newChild)

145

180

if index and index < position:

149

184

# will jump down one.

150

185

position = position - 1

151

186

newChild.extract()

152

187

153

188

newChild.parent = self

154

189

previousChild = None

155

190

if position == 0:

161

196

newChild.previousSibling.nextSibling = newChild

162

197

newChild.previous = previousChild._lastRecursiveChild()

163

198

if newChild.previous:

164

newChild.previous.next = newChild

199

newChild.previous.next = newChild

165

200

166

201

newChildsLastElement = newChild._lastRecursiveChild()

167

202

168

203

if position >= len(self.contents):

169

204

newChild.nextSibling = None

170

205

171

206

parent = self

172

207

parentsNextSibling = None

173

208

while not parentsNextSibling:

180

215

else:

181

216

newChildsLastElement.next = None

182

217

else:

183

nextChild = self.contents[position]

184

newChild.nextSibling = nextChild

218

nextChild = self.contents[position]

219

newChild.nextSibling = nextChild

185

220

if newChild.nextSibling:

186

221

newChild.nextSibling.previousSibling = newChild

187

222

newChildsLastElement.next = nextChild

190

225

newChildsLastElement.next.previous = newChildsLastElement

191

226

self.contents.insert(position, newChild)

192

227

228

def append(self, tag):
    """Adds the given tag as the last child of this element.

    The work is delegated to insert(), using the current length of
    self.contents as the insertion position."""
    endPosition = len(self.contents)
    self.insert(endPosition, tag)

231

193

232

def findNext(self, name=None, attrs={}, text=None, **kwargs):

194

233

"""Returns the first item that matches the given criteria and

195

234

appears after this Tag in the document."""

269

308

if l:

270

309

r = l[0]

271

310

return r

272

311

273

312

def _findAll(self, name, attrs, text, limit, generator, **kwargs):

274

313

"Iterates over a generator looking for things that match."

275

314

294

333

return results

295

334

296

335

#These Generators can be used to navigate starting from both

297

#NavigableStrings and Tags.

336

#NavigableStrings and Tags.

298

337

def nextGenerator(self):

299

338

i = self

300

339

while i:

328

367

# Utility methods

329

368

def substituteEncoding(self, str, encoding=None):
    """Returns a copy of the given string in which every
    %SOUP-ENCODING% placeholder is replaced by the encoding name.

    A missing or empty encoding is treated as "utf-8"."""
    if not encoding:
        encoding = "utf-8"
    return str.replace("%SOUP-ENCODING%", encoding)

332

371

333

372

def toEncoding(self, s, encoding=None):

334

373

"""Encodes an object to a string in some encoding, or to Unicode.

350

389

351

390

class NavigableString(unicode, PageElement):

352

391

392

def __getnewargs__(self):
    """Returns the one-element argument tuple used to re-create this
    string: the result of NavigableString.__str__ with its default
    encoding."""
    encoded = NavigableString.__str__(self)
    return (encoded,)

394

353

395

def __getattr__(self, attr):

354

396

"""text.string gives you text. This is for backwards

355

397

compatibility for Navigable*String, but for CData* it lets you

360

402

raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)

361

403

362

404

def __unicode__(self):
    """Returns this string as a Unicode object.

    str(self) goes through __str__, whose default argument encodes
    the text with DEFAULT_OUTPUT_ENCODING. The resulting byte string
    therefore has to be decoded with that same encoding; calling
    unicode() on it without an encoding falls back on the
    interpreter's default codec and fails on text outside it."""
    encoded = str(self)
    if DEFAULT_OUTPUT_ENCODING:
        return unicode(encoded, DEFAULT_OUTPUT_ENCODING)
    # No output encoding configured: keep the plain conversion.
    return unicode(encoded)

364

406

365

407

def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
    """Returns this string encoded in the given encoding. If the
    encoding is None (or otherwise false), the object itself is
    returned without being encoded."""
    if not encoding:
        return self
    return self.encode(encoding)

370

412

371

413

class CData(NavigableString):

372

414

373

415

def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):

382

424

383

425

class Comment(NavigableString):
    """A NavigableString that holds the text of a markup comment."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Returns the comment text, encoded the way
        NavigableString.__str__ encodes it, wrapped in the "<!--" and
        "-->" comment delimiters.

        The template must contain a %s slot: formatting a value into
        an empty template raises TypeError."""
        return "<!--%s-->" % NavigableString.__str__(self, encoding)

386

428

387

429

class Declaration(NavigableString):
    """A NavigableString that renders itself between "<!" and ">"."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Returns the declaration text, encoded the way
        NavigableString.__str__ encodes it, with "<!" in front of it
        and ">" after it."""
        body = NavigableString.__str__(self, encoding)
        return "<!%s>" % body

390

432

391

433

class Tag(PageElement):

392

434

393

435

"""Represents a found HTML tag with its attributes and contents."""

394

436

395

XML_SPECIAL_CHARS_TO_ENTITIES = { "'" : "squot",

396

'"' : "quote",

397

"&" : "amp",

398

"<" : "lt",

399

">" : "gt" }

437

def _invert(h):

438

"Cheap function to invert a hash."

439

i = {}

440

for k,v in h.items():

441

i[v] = k

442

return i

443

444

XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'",

445

"quot" : '"',

446

"amp" : "&",

447

"lt" : "<",

448

"gt" : ">" }

449

450

XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)

451

452

def _convertEntities(self, match):

453

"""Used in a call to re.sub to replace HTML, XML, and numeric

454

entities with the appropriate Unicode characters. If HTML

455

entities are being converted, any unrecognized entities are

456

escaped."""

457

x = match.group(1)

458

if self.convertHTMLEntities and x in name2codepoint:

459

return unichr(name2codepoint[x])

460

elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS:

461

if self.convertXMLEntities:

462

return self.XML_ENTITIES_TO_SPECIAL_CHARS[x]

463

else:

464

return u'&%s;' % x

465

elif len(x) > 0 and x[0] == '#':

466

# Handle numeric entities

467

if len(x) > 1 and x[1] == 'x':

468

return unichr(int(x[2:], 16))

469

else:

470

return unichr(int(x[1:]))

471

472

elif self.escapeUnrecognizedEntities:

473

return u'&%s;' % x

474

else:

475

return u'&%s;' % x

400

476

401

477

def __init__(self, parser, name, attrs=None, parent=None,

402

478

previous=None):

414

490

self.setup(parent, previous)

415

491

self.hidden = False

416

492

self.containsSubstitutions = False

493

self.convertHTMLEntities = parser.convertHTMLEntities

494

self.convertXMLEntities = parser.convertXMLEntities

495

self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities

496

497

# Convert any HTML, XML, or numeric entities in the attribute values.

498

convert = lambda(k, val): (k,

499

re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",

500

self._convertEntities,

501

val))

502

self.attrs = map(convert, self.attrs)

417

503

418

504

def get(self, key, default=None):
    """Looks up the 'key' attribute of this tag in its attribute
    map. Returns the attribute's value, or 'default' when the tag
    has no such attribute."""
    attrMap = self._getAttrMap()
    return attrMap.get(key, default)

423

509

424

510

def has_key(self, key):
    """Returns true if this tag's attribute map contains the
    'key' attribute."""
    attrMap = self._getAttrMap()
    return key in attrMap

444

530

"A tag is non-None even if it has no contents."

445

531

return True

446

532

447

def __setitem__(self, key, value):

533

def __setitem__(self, key, value):

448

534

"""Setting tag[key] sets the value of the 'key' attribute for the

449

535

tag."""

450

536

self._getAttrMap()

481

567

return self.find(tag[:-3])

482

568

elif tag.find('__') != 0:

483

569

return self.find(tag)

570

raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag)

484

571

485

572

def __eq__(self, other):

486

573

"""Returns true iff this tag has the same name, the same attributes,

507

594

def __unicode__(self):

508

595

return self.__str__(None)

509

596

597

BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"

598

+ "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"

599

+ ")")

600

601

def _sub_entity(self, x):

602

"""Used with a regular expression to substitute the

603

appropriate XML entity for an XML special character."""

604

return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"

605

510

606

def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,

511

607

prettyPrint=False, indentLevel=0):

512

608

"""Returns a string or Unicode representation of this tag and

522

618

if self.attrs:

523

619

for key, val in self.attrs:

524

620

fmt = '%s="%s"'

525

if isString(val):

621

if isString(val):

526

622

if self.containsSubstitutions and '%SOUP-ENCODING%' in val:

527

623

val = self.substituteEncoding(val, encoding)

528

624

543

639

# embedded single quotes to XML entities.

544

640

if '"' in val:

545

641

fmt = "%s='%s'"

546

# This can't happen naturally, but it can happen

547

# if you modify an attribute value after parsing.

548

642

if "'" in val:

643

# TODO: replace with apos when

644

# appropriate.

549

645

val = val.replace("'", "&squot;")

550

646

551

647

# Now we're okay w/r/t quotes. But the attribute

552

648

# value might also contain angle brackets, or

553

649

# ampersands that aren't part of entities. We need

554

650

# to escape those to XML entities too.

555

val = re.sub("([<>]|&(?![^\s]+;))",

556

lambda x: "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";",

557

val)

558

651

val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)

652

559

653

attrs.append(fmt % (self.toEncoding(key, encoding),

560

654

self.toEncoding(val, encoding)))

561

655

close = ''

577

671

s = []

578

672

attributeString = ''

579

673

if attrs:

580

attributeString = ' ' + ' '.join(attrs)

674

attributeString = ' ' + ' '.join(attrs)

581

675

if prettyPrint:

582

676

s.append(space)

583

677

s.append('<%s%s%s>' % (encodedName, attributeString, close))

609

703

elif isinstance(c, Tag):

610

704

s.append(c.__str__(encoding, prettyPrint, indentLevel))

611

705

if text and prettyPrint:

612

text = text.strip()

706

text = text.strip()

613

707

if text:

614

708

if prettyPrint:

615

709

s.append(" " * (indentLevel-1))

616

710

s.append(text)

617

711

if prettyPrint:

618

712

s.append("\n")

619

return ''.join(s)

713

return ''.join(s)

620

714

621

715

#Soup methods

622

716

651

745

# Pre-3.x compatibility methods

652

746

first = find

653

747

fetch = findAll

654

748

655

749

def fetchText(self, text=None, recursive=True, limit=None):
    """Pre-3.x compatibility method: forwards text, recursive and
    limit, as keyword arguments, to findAll and returns its result."""
    criteria = {'text': text, 'recursive': recursive, 'limit': limit}
    return self.findAll(**criteria)

657

751

658

752

def firstText(self, text=None, recursive=True):
    """Pre-3.x compatibility method: forwards text and recursive,
    as keyword arguments, to find and returns its result."""
    criteria = {'text': text, 'recursive': recursive}
    return self.find(**criteria)

660

661

#Utility methods

662

663

def append(self, tag):

664

"""Appends the given tag to the contents of this tag."""

665

self.contents.append(tag)

666

754

667

755

#Private methods

668

756

672

760

if not getattr(self, 'attrMap'):

673

761

self.attrMap = {}

674

762

for (key, value) in self.attrs:

675

self.attrMap[key] = value

763

self.attrMap[key] = value

676

764

return self.attrMap

677

765

678

766

#Generator methods

680

768

for i in range(0, len(self.contents)):

681

769

yield self.contents[i]

682

770

raise StopIteration

683

771

684

772

def recursiveChildGenerator(self):

685

773

stack = [(self, 0)]

686

774

while stack:

687

775

tag, start = stack.pop()

688

if isinstance(tag, Tag):

776

if isinstance(tag, Tag):

689

777

for i in range(start, len(tag.contents)):

690

778

a = tag.contents[i]

691

779

yield a

720

808

return self.text

721

809

else:

722

810

return "%s|%s" % (self.name, self.attrs)

723

811

724

812

def searchTag(self, markupName=None, markupAttrs={}):

725

813

found = None

726

814

markup = None

737

825

if callFunctionWithTagData:

738

826

match = self.name(markupName, markupAttrs)

739

827

else:

740

match = True

828

match = True

741

829

markupAttrMap = None

742

830

for attr, matchAgainst in self.attrs.items():

743

831

if not markupAttrMap:

762

850

#print 'looking for %s in %s' % (self, markup)

763

851

found = None

764

852

# If given a list of items, scan it for a text element that

765

# matches.

853

# matches.

766

854

if isList(markup) and not isinstance(markup, Tag):

767

855

for element in markup:

768

856

if isinstance(element, NavigableString) \

783

871

raise Exception, "I don't know how to match against a %s" \

784

872

% markup.__class__

785

873

return found

786

787

def _matches(self, markup, matchAgainst):

874

875

def _matches(self, markup, matchAgainst):

788

876

#print "Matching %s against %s" % (markup, matchAgainst)

789

877

result = False

790

878

if matchAgainst == True and type(matchAgainst) == types.BooleanType:

835

923

"""Convenience method that works with all 2.x versions of Python

836

924

to determine whether or not something is stringlike."""

837

925

try:

838

return isinstance(s, unicode) or isintance(s, basestring)

926

return isinstance(s, unicode) or isinstance(s, basestring)

839

927

except NameError:

840

928

return isinstance(s, str)

841

929

865

953

"""This class contains the basic parser and search code. It defines

866

954

a parser that knows nothing about tag behavior except for the

867

955

following:

868

956

869

957

You can't close a tag without closing all the tags it encloses.

870

958

That is, "<foo><bar></foo>" actually means

871

959

"<foo><bar></bar></foo>".

878

966

or when BeautifulSoup makes an assumption counter to what you were

879

967

expecting."""

880

968

881

XML_ENTITY_LIST = {}

882

for i in Tag.XML_SPECIAL_CHARS_TO_ENTITIES.values():

883

XML_ENTITY_LIST[i] = True

884

885

969

SELF_CLOSING_TAGS = {}

886

970

NESTABLE_TAGS = {}

887

971

RESET_NESTING_TAGS = {}

897

981

898

982

HTML_ENTITIES = "html"

899

983

XML_ENTITIES = "xml"

984

XHTML_ENTITIES = "xhtml"

985

# TODO: This only exists for backwards-compatibility

986

ALL_ENTITIES = XHTML_ENTITIES

987

988

# Used when determining whether a text node is all whitespace and

989

# can be replaced with a single space. A text node that contains

990

# fancy Unicode spaces (usually non-breaking) should be left

991

# alone.

992

STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }

900

993

901

994

def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,

902

995

markupMassage=True, smartQuotesTo=XML_ENTITIES,

903

996

convertEntities=None, selfClosingTags=None):

904

997

"""The Soup object is initialized as the 'root tag', and the

905

998

provided markup (which can be a string or a file-like object)

906

is fed into the underlying parser.

999

is fed into the underlying parser.

907

1000

908

1001

sgmllib will process most bad HTML, and the BeautifulSoup

909

1002

class has some tricks for dealing with some HTML that kills

930

1023

self.fromEncoding = fromEncoding

931

1024

self.smartQuotesTo = smartQuotesTo

932

1025

self.convertEntities = convertEntities

1026

# Set the rules for how we'll deal with the entities we

1027

# encounter

933

1028

if self.convertEntities:

934

1029

# It doesn't make sense to convert encoded characters to

935

1030

# entities even while you're converting entities to Unicode.

936

1031

# Just convert it all to Unicode.

937

1032

self.smartQuotesTo = None

1033

if convertEntities == self.HTML_ENTITIES:

1034

self.convertXMLEntities = False

1035

self.convertHTMLEntities = True

1036

self.escapeUnrecognizedEntities = True

1037

elif convertEntities == self.XHTML_ENTITIES:

1038

self.convertXMLEntities = True

1039

self.convertHTMLEntities = True

1040

self.escapeUnrecognizedEntities = False

1041

elif convertEntities == self.XML_ENTITIES:

1042

self.convertXMLEntities = True

1043

self.convertHTMLEntities = False

1044

self.escapeUnrecognizedEntities = False

1045

else:

1046

self.convertXMLEntities = False

1047

self.convertHTMLEntities = False

1048

self.escapeUnrecognizedEntities = False

1049

938

1050

self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)

939

1051

SGMLParser.__init__(self)

940

1052

941

1053

if hasattr(markup, 'read'): # It's a file-type object.

942

1054

markup = markup.read()

943

1055

self.markup = markup

947

1059

except StopParsing:

948

1060

pass

949

1061

self.markup = None # The markup can now be GCed

950

1062

1063

def convert_charref(self, name):
    """This method fixes a bug in Python's SGMLParser.

    Returns self.convert_codepoint(n) when name is the decimal form
    of an integer n between 0 and 127 inclusive. Returns None when
    name is not an integer or lies outside that range."""
    try:
        codepoint = int(name)
    except ValueError:
        return None
    # ASCII ends at 127, not 255
    if codepoint < 0 or codepoint > 127:
        return None
    return self.convert_codepoint(codepoint)

1072

951

1073

def _feed(self, inDocumentEncoding=None):

952

1074

# Convert the document to Unicode.

953

1075

markup = self.markup

963

1085

if markup:

964

1086

if self.markupMassage:

965

1087

if not isList(self.markupMassage):

966

self.markupMassage = self.MARKUP_MASSAGE

1088

self.markupMassage = self.MARKUP_MASSAGE

967

1089

for fix, m in self.markupMassage:

968

1090

markup = fix.sub(m, markup)

1091

# TODO: We get rid of markupMassage so that the

1092

# soup object can be deepcopied later on. Some

1093

# Python installations can't copy regexes. If anyone

1094

# was relying on the existence of markupMassage, this

1095

# might cause problems.

1096

del(self.markupMassage)

969

1097

self.reset()

970

1098

971

1099

SGMLParser.feed(self, markup)

992

1120

self-closing tag according to this parser."""

993

1121

return self.SELF_CLOSING_TAGS.has_key(name) \

994

1122

or self.instanceSelfClosingTags.has_key(name)

995

1123

996

1124

def reset(self):

997

1125

Tag.__init__(self, self, self.ROOT_TAG_NAME)

998

1126

self.hidden = 1

1002

1130

self.tagStack = []

1003

1131

self.quoteStack = []

1004

1132

self.pushTag(self)

1005

1133

1006

1134

def popTag(self):

1007

1135

tag = self.tagStack.pop()

1008

1136

# Tags with just one string-owning child get the child as a

1020

1148

def pushTag(self, tag):
    """Makes the given tag the current tag: it is appended to the
    contents of the tag that is current so far (if there is one) and
    pushed onto the tag stack."""
    #print "Push", tag.name
    parent = self.currentTag
    if parent:
        parent.contents.append(tag)
    stack = self.tagStack
    stack.append(tag)
    self.currentTag = stack[-1]

1026

1154

1027

1155

def endData(self, containerClass=NavigableString):

1028

1156

if self.currentData:

1029

1157

currentData = ''.join(self.currentData)

1030

if not currentData.strip():

1158

if not currentData.translate(self.STRIP_ASCII_SPACES):

1031

1159

if '\n' in currentData:

1032

1160

currentData = '\n'

1033

1161

else:

1052

1180

the given tag."""

1053

1181

#print "Popping to %s" % name

1054

1182

if name == self.ROOT_TAG_NAME:

1055

return

1183

return

1056

1184

1057

1185

numPops = 0

1058

1186

mostRecentTag = None

1065

1193

1066

1194

for i in range(0, numPops):

1067

1195

mostRecentTag = self.popTag()

1068

return mostRecentTag

1196

return mostRecentTag

1069

1197

1070

1198

def _smartPop(self, name):

1071

1199

1076

1204

comes between this tag and the previous tag of this type.

1077

1205

1078

1206

Examples:

1079

<p>Foo<b>Bar<p> should pop to 'p', not 'b'.

1080

<p>Foo<table>Bar<p> should pop to 'table', not 'p'.

1081

<p>Foo<table><tr>Bar<p> should pop to 'tr', not 'p'.

1082

<p>Foo<b>Bar<p> should pop to 'p', not 'b'.

1207

<p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.

1208

<p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.

1209

<p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.

1083

1210

1084

1211

<li><ul><li> *<li>* should pop to 'ul', not the first 'li'.

1085

1212

<tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'

1102

1229

and p.name in nestingResetTriggers) \

1103

1230

or (nestingResetTriggers == None and isResetNesting

1104

1231

and self.RESET_NESTING_TAGS.has_key(p.name)):

1105

1232

1106

1233

#If we encounter one of the nesting reset triggers

1107

1234

#peculiar to this tag, or we encounter another tag

1108

1235

#that causes nesting to reset, pop up to but not

1121

1248

#print "<%s> is not real!" % name

1122

1249

attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs))

1123

1250

self.handle_data('<%s%s>' % (name, attrs))

1124

return

1251

return

1125

1252

self.endData()

1126

1253

1127

1254

if not self.isSelfClosingTag(name) and not selfClosing:

1137

1264

self.previous = tag

1138

1265

self.pushTag(tag)

1139

1266

if selfClosing or self.isSelfClosingTag(name):

1140

self.popTag()

1267

self.popTag()

1141

1268

if name in self.QUOTE_TAGS:

1142

1269

#print "Beginning quote (%s)" % name

1143

1270

self.quoteStack.append(name)

1172

1299

object, possibly one with a %SOUP-ENCODING% slot into which an

1173

1300

encoding will be plugged later."""

1174

1301

if text[:3] == "xml":

1175

text = "xml version='1.0' encoding='%SOUP-ENCODING%'"

1302

text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"

1176

1303

self._toStringSubclass(text, ProcessingInstruction)

1177

1304

1178

1305

def handle_comment(self, text):

1181

1308

1182

1309

def handle_charref(self, ref):

1183

1310

"Handle character references as data."

1184

if self.convertEntities in [self.HTML_ENTITIES,

1185

self.XML_ENTITIES]:

1311

if self.convertEntities:

1186

1312

data = unichr(int(ref))

1187

1313

else:

1188

1314

data = '&#%s;' % ref

1190

1316

1191

1317

def handle_entityref(self, ref):
    """Handle entity references as data, possibly converting known
    HTML and/or XML entity references to the corresponding Unicode
    characters.

    ref is the entity name without the leading "&" and without any
    trailing ";". The resulting text is passed to handle_data."""
    data = None
    if self.convertHTMLEntities:
        try:
            data = unichr(name2codepoint[ref])
        except KeyError:
            pass

    if not data and self.convertXMLEntities:
        data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)

    if not data and self.convertHTMLEntities and \
        not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
        # TODO: We've got a problem here. We're told this is
        # an entity reference, but it's not an XML entity
        # reference or an HTML entity reference. Nonetheless,
        # the logical thing to do is to pass it through as an
        # unrecognized entity reference.
        #
        # Except: when the input is "&carol;" this function
        # will be called with input "carol". When the input is
        # "AT&T", this function will be called with input
        # "T". We have no way of knowing whether a semicolon
        # was present originally, so we don't know whether
        # this is an unknown entity or just a misplaced
        # ampersand.
        #
        # The more common case is a misplaced ampersand, so I
        # escape the ampersand and omit the trailing semicolon.
        data = "&amp;%s" % ref
    if not data:
        # This case is different from the one above, because we
        # haven't already gone through a supposedly comprehensive
        # mapping of entities to Unicode characters. We might not
        # have gone through any mapping at all. So the chances are
        # very high that this is a real entity, and not a
        # misplaced ampersand.
        data = "&%s;" % ref
    self.handle_data(data)

1206

1359

1207

1360

def handle_decl(self, data):

1208

1361

"Handle DOCTYPEs and the like as Declaration objects."

1209

1362

self._toStringSubclass(data, Declaration)

1285

1438

['br' , 'hr', 'input', 'img', 'meta',

1286

1439

'spacer', 'link', 'frame', 'base'])

1287

1440

1288

QUOTE_TAGS = {'script': None}

1289

1441

QUOTE_TAGS = {'script' : None, 'textarea' : None}

1442

1290

1443

#According to the HTML standard, each of these inline tags can

1291

1444

#contain another tag of the same type. Furthermore, it's common

1292

1445

#to actually use these tags this way.

1298

1451

#to actually use these tags this way.

1299

1452

NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del']

1300

1453

1301

#Lists can contain other lists, but there are restrictions.

1454

#Lists can contain other lists, but there are restrictions.

1302

1455

NESTABLE_LIST_TAGS = { 'ol' : [],

1303

1456

'ul' : [],

1304

1457

'li' : ['ul', 'ol'],

1306

1459

'dd' : ['dl'],

1307

1460

'dt' : ['dl'] }

1308

1461

1309

#Tables can contain other tables, but there are restrictions.

1310

NESTABLE_TABLE_TAGS = {'table' : [],

1462

#Tables can contain other tables, but there are restrictions.

1463

NESTABLE_TABLE_TAGS = {'table' : [],

1311

1464

'tr' : ['table', 'tbody', 'tfoot', 'thead'],

1312

1465

'td' : ['tr'],

1313

1466

'th' : ['tr'],

1377

1530

1378

1531

class StopParsing(Exception):
    """Exception with no behavior of its own. The soup constructor
    catches it around its parsing step and ignores it."""

1380

1533

1381

1534

class ICantBelieveItsBeautifulSoup(BeautifulSoup):

1382

1535

1383

1536

"""The BeautifulSoup class is oriented towards skipping over

1423

1576

1424

1577

This also makes it better for subclassing than BeautifulStoneSoup

1425

1578

or BeautifulSoup."""

1426

1579

1427

1580

RESET_NESTING_TAGS = buildTagMap('noscript')

1428

1581

NESTABLE_TAGS = {}

1429

1582

1453

1606

parent = self.tagStack[-2]

1454

1607

parent._getAttrMap()

1455

1608

if (isinstance(tag, Tag) and len(tag.contents) == 1 and

1456

isinstance(tag.contents[0], NavigableString) and

1609

isinstance(tag.contents[0], NavigableString) and

1457

1610

not parent.attrMap.has_key(tag.name)):

1458

1611

parent[tag.name] = tag.contents[0]

1459

1612

BeautifulStoneSoup.popTag(self)

1463

1616

#and "unprofessional" for use in enterprise screen-scraping. We feel

1464

1617

#your pain! For such-minded folk, the Beautiful Soup Consortium And

1465

1618

#All-Night Kosher Bakery recommends renaming this file to

1466

#"RobustParser.py" (or, in cases of extreme enterprisitude,

1619

#"RobustParser.py" (or, in cases of extreme enterprisiness,

1467

1620

#"RobustParserBeanInterface.class") and using the following

1468

1621

#enterprise-friendly class aliases:

1469

1622

class RobustXMLParser(BeautifulStoneSoup):

1495

1648

# chardet.constants._debug = 1

1496

1649

except:

1497

1650

chardet = None

1498

chardet = None

1499

1651

1500

1652

# cjkcodecs and iconv_codec make Python know about more character encodings.

1501

1653

# Both are available from http://cjkpython.i18n.org/

1530

1682

self.triedEncodings = []

1531

1683

if markup == '' or isinstance(markup, unicode):

1532

1684

self.originalEncoding = None

1533

self.unicode = unicode(markup)

1685

self.unicode = unicode(markup)

1534

1686

return

1535

1687

1536

1688

u = None

1537

1689

for proposedEncoding in overrideEncodings:

1538

1690

u = self._convertFrom(proposedEncoding)

1541

1693

for proposedEncoding in (documentEncoding, sniffedEncoding):

1542

1694

u = self._convertFrom(proposedEncoding)

1543

1695

if u: break

1544

1696

1545

1697

# If no luck and we have auto-detection library, try that:

1546

1698

if not u and chardet and not isinstance(self.markup, unicode):

1547

1699

u = self._convertFrom(chardet.detect(self.markup)['encoding'])

1551

1703

for proposed_encoding in ("utf-8", "windows-1252"):

1552

1704

u = self._convertFrom(proposed_encoding)

1553

1705

if u: break

1706

else:

1707

self.originalEncoding = None

1708

u = self.markup

1554

1709

self.unicode = u

1555

if not u: self.originalEncoding = None

1556

1710

1557

1711

def _subMSChar(self, orig):

1558

1712

"""Changes a MS smart quote character to an XML or HTML

1563

1717

sub = '&#x%s;' % sub[1]

1564

1718

else:

1565

1719

sub = '&%s;' % sub[0]

1566

return sub

1720

return sub

1567

1721

1568

def _convertFrom(self, proposed):

1722

def _convertFrom(self, proposed):

1569

1723

proposed = self.find_codec(proposed)

1570

1724

if not proposed or proposed in self.triedEncodings:

1571

1725

return None

1584

1738

try:

1585

1739

# print "Trying to convert document to %s" % proposed

1586

1740

u = self._toUnicode(markup, proposed)

1587

self.markup = u

1741

self.markup = u

1588

1742

self.originalEncoding = proposed

1589

1743

except Exception, e:

1590

1744

# print "That didn't work!"

1591

1745

# print e

1592

return None

1746

return None

1593

1747

#print "Correct encoding: %s" % proposed

1594

1748

return self.markup

1595

1749

1617

1771

data = data[4:]

1618

1772

newdata = unicode(data, encoding)

1619

1773

return newdata

1620

1774

1621

1775

def _detectEncoding(self, xml_data):

1622

1776

"""Given a document, tries to detect its XML encoding."""

1623

1777

xml_encoding = sniffed_xml_encoding = None

1689

1843

or charset

1690

1844

1691

1845

def _codec(self, charset):

1692

if not charset: return charset

1846

if not charset: return charset

1693

1847

codec = None

1694

1848

try:

1695

1849

codecs.lookup(charset)

1696

1850

codec = charset

1697

except LookupError:

1851

except (LookupError, ValueError):

1698

1852

pass

1699

1853

return codec

1700

1854