~daker/ubuntu-html5-theme/i18n-pojson : revision 166

1

import collections

2

import re

3

import sys

4

import warnings

5

from bs4.dammit import EntitySubstitution

6

7

# Encoding used when a tree is rendered to a bytestring and the caller
# does not name one.
DEFAULT_OUTPUT_ENCODING = "utf-8"

# True when the interpreter's major version is greater than 2.
PY3K = (sys.version_info[0] > 2)

# Matches one or more whitespace characters. Written as a raw string so
# that "\s" reaches the regex engine untouched instead of relying on an
# unrecognised string escape.
whitespace_re = re.compile(r"\s+")

11

12

def _alias(attr):
    """Alias one attribute name to another for backward compatibility.

    Returns a property whose getter reads, and whose setter writes, the
    attribute named `attr` on the instance it is accessed through.
    """
    @property
    def alias(self):
        return getattr(self, attr)

    @alias.setter
    def alias(self, value):
        # A property setter is always invoked with the new value. The
        # previous signature omitted it (and called setattr with only
        # two arguments), so assigning through the alias raised
        # TypeError.
        setattr(self, attr, value)
    return alias

22

23

24

class NamespacedAttribute(unicode):
    """A string attribute name that remembers its namespace parts.

    The string value is "prefix:name" when both parts are given, and
    whichever part is present when the other is None. The arguments are
    stored unchanged on the `prefix`, `name` and `namespace` attributes.
    """

    def __new__(cls, prefix, name, namespace=None):
        if name is None:
            obj = unicode.__new__(cls, prefix)
        elif prefix is None:
            # Not really namespaced.
            obj = unicode.__new__(cls, name)
        else:
            obj = unicode.__new__(cls, prefix + ":" + name)
        obj.prefix = prefix
        obj.name = name
        obj.namespace = namespace
        return obj

38

39

class AttributeValueWithCharsetSubstitution(unicode):
    """A stand-in object for a character encoding specified in HTML."""

41

42

class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """A generic stand-in for the value of a meta tag's 'charset' attribute.

    When Beautiful Soup parses the markup '<meta charset="utf8">', the
    value of the 'charset' attribute will be one of these objects.
    """

    def __new__(cls, original_value):
        obj = unicode.__new__(cls, original_value)
        # Keep the value as parsed, separately from the string itself.
        obj.original_value = original_value
        return obj

    def encode(self, encoding):
        """Return `encoding` itself rather than an encoded copy of this
        string, so the attribute is rendered as the name of the
        encoding passed in."""
        return encoding

56

57

58

class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
    """A generic stand-in for the value of a meta tag's 'content' attribute.

    When Beautiful Soup parses markup such as:
     <meta http-equiv="content-type" content="text/html; charset=utf8">

    The value of the 'content' attribute will be one of these objects.
    """

    # Group 1 is everything up to and including "charset="; group 3 is
    # the charset value itself. Raw string so "\s" is passed to the
    # regex engine as written.
    CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)

    def __new__(cls, original_value):
        match = cls.CHARSET_RE.search(original_value)
        if match is None:
            # No substitution necessary.
            return unicode.__new__(unicode, original_value)

        obj = unicode.__new__(cls, original_value)
        obj.original_value = original_value
        return obj

    def encode(self, encoding):
        """Return the original value with every charset declaration
        rewritten to name `encoding`."""
        def rewrite(match):
            return match.group(1) + encoding
        return self.CHARSET_RE.sub(rewrite, self.original_value)

83

84

class HTMLAwareEntitySubstitution(EntitySubstitution):

    """Entity substitution rules that are aware of some HTML quirks.

    Specifically, the contents of <script> and <style> tags should not
    undergo entity substitution.

    Incoming NavigableString objects are checked to see if they're the
    direct children of a <script> or <style> tag.
    """

    cdata_containing_tags = set(["script", "style"])

    preformatted_tags = set(["pre"])

    @classmethod
    def _substitute_if_appropriate(cls, ns, f):
        """Return `f(ns)`, unless `ns` is a NavigableString whose parent
        tag is named in `cdata_containing_tags`, in which case `ns` is
        returned unchanged."""
        if (isinstance(ns, NavigableString)
            and ns.parent is not None
            and ns.parent.name in cls.cdata_containing_tags):
            # Do nothing.
            return ns
        # Substitute.
        return f(ns)

    @classmethod
    def substitute_html(cls, ns):
        """Apply EntitySubstitution.substitute_html where appropriate."""
        return cls._substitute_if_appropriate(
            ns, EntitySubstitution.substitute_html)

    @classmethod
    def substitute_xml(cls, ns):
        """Apply EntitySubstitution.substitute_xml where appropriate."""
        return cls._substitute_if_appropriate(
            ns, EntitySubstitution.substitute_xml)

118

119

class PageElement(object):
    """Contains the navigational information for some part of the page
    (either a tag or a piece of text)"""

    # There are four possible values for the "formatter" argument passed in
    # to methods like encode() and prettify():
    #
    # "html" - All Unicode characters with corresponding HTML entities
    #   are converted to those entities on output.
    # "minimal" - Bare ampersands and angle brackets are converted to
    #   XML entities: &amp; &lt; &gt;
    # None - The null formatter. Unicode characters are never
    #   converted to entities.  This is not recommended, but it's
    #   faster than "minimal".
    # A function - This function will be called on every string that
    #  needs to undergo entity substitution.
    #

    # In an HTML document, the default "html" and "minimal" functions
    # will leave the contents of <script> and <style> tags alone. For
    # an XML document, all tags will be given the same treatment.

    HTML_FORMATTERS = {
        "html" : HTMLAwareEntitySubstitution.substitute_html,
        "minimal" : HTMLAwareEntitySubstitution.substitute_xml,
        None : None
        }

    XML_FORMATTERS = {
        "html" : EntitySubstitution.substitute_html,
        "minimal" : EntitySubstitution.substitute_xml,
        None : None
        }

    def format_string(self, s, formatter='minimal'):
        """Format the given string using the given formatter."""
        if not callable(formatter):
            formatter = self._formatter_for_name(formatter)
        if formatter is None:
            output = s
        else:
            output = formatter(s)
        return output

    @property
    def _is_xml(self):
        """Is this element part of an XML tree or an HTML tree?

        This is used when mapping a formatter name ("minimal") to an
        appropriate function (one that performs entity-substitution on
        the contents of <script> and <style> tags, or not). It's
        inefficient, but it should be called very rarely.
        """
        if self.parent is None:
            # This is the top-level object. It should have .is_xml set
            # from tree creation. If not, take a guess--BS is usually
            # used on HTML markup.
            return getattr(self, 'is_xml', False)
        return self.parent._is_xml

    def _formatter_for_name(self, name):
        "Look up a formatter function based on its name and the tree."
        if self._is_xml:
            return self.XML_FORMATTERS.get(
                name, EntitySubstitution.substitute_xml)
        else:
            return self.HTML_FORMATTERS.get(
                name, HTMLAwareEntitySubstitution.substitute_xml)

    def setup(self, parent=None, previous_element=None):
        """Sets up the initial relations between this element and
        other elements."""
        self.parent = parent
        self.previous_element = previous_element
        if previous_element is not None:
            self.previous_element.next_element = self
        self.next_element = None
        self.previous_sibling = None
        self.next_sibling = None
        if self.parent is not None and self.parent.contents:
            self.previous_sibling = self.parent.contents[-1]
            self.previous_sibling.next_sibling = self

    nextSibling = _alias("next_sibling")  # BS3
    previousSibling = _alias("previous_sibling")  # BS3

    def replace_with(self, replace_with):
        """Put `replace_with` in this element's position in its parent.

        Returns this element, now extracted from the tree. Returns None
        without doing anything when asked to replace an element with
        itself. Raises ValueError when `replace_with` is this element's
        parent, or when this element has no parent.
        """
        if replace_with is self:
            return
        if replace_with is self.parent:
            raise ValueError("Cannot replace a Tag with its parent.")
        if self.parent is None:
            # Without a parent there is no position to replace into;
            # fail explicitly instead of with an AttributeError on None.
            raise ValueError(
                "Cannot replace an element that is not part of a tree.")
        old_parent = self.parent
        my_index = self.parent.index(self)
        self.extract()
        old_parent.insert(my_index, replace_with)
        return self
    replaceWith = replace_with  # BS3

    def unwrap(self):
        """Replace this element with its own children.

        Returns this element, now extracted from the tree. Raises
        ValueError when this element has no parent.
        """
        my_parent = self.parent
        if my_parent is None:
            # Fail explicitly instead of with an AttributeError on None.
            raise ValueError(
                "Cannot unwrap an element that is not part of a tree.")
        my_index = self.parent.index(self)
        self.extract()
        # Inserting the children last-to-first at one fixed index
        # leaves them in their original order.
        for child in reversed(self.contents[:]):
            my_parent.insert(my_index, child)
        return self
    replace_with_children = unwrap
    replaceWithChildren = unwrap  # BS3

    def wrap(self, wrap_inside):
        """Put `wrap_inside` where this element is, then append this
        element to it. Returns `wrap_inside`."""
        me = self.replace_with(wrap_inside)
        wrap_inside.append(me)
        return wrap_inside

    def extract(self):
        """Destructively rips this element out of the tree."""
        if self.parent is not None:
            del self.parent.contents[self.parent.index(self)]

        #Find the two elements that would be next to each other if
        #this element (and any children) hadn't been parsed. Connect
        #the two.
        last_child = self._last_descendant()
        next_element = last_child.next_element

        if self.previous_element is not None:
            self.previous_element.next_element = next_element
        if next_element is not None:
            next_element.previous_element = self.previous_element
        self.previous_element = None
        last_child.next_element = None

        self.parent = None
        if self.previous_sibling is not None:
            self.previous_sibling.next_sibling = self.next_sibling
        if self.next_sibling is not None:
            self.next_sibling.previous_sibling = self.previous_sibling
        self.previous_sibling = self.next_sibling = None
        return self

    def _last_descendant(self, is_initialized=True, accept_self=True):
        "Finds the last element beneath this object to be parsed."
        if is_initialized and self.next_sibling:
            last_child = self.next_sibling.previous_element
        else:
            last_child = self
            while isinstance(last_child, Tag) and last_child.contents:
                last_child = last_child.contents[-1]
        # Identity, not equality: the question is whether the walk
        # ended on this very object. Tag.__eq__ compares whole subtrees.
        if not accept_self and last_child is self:
            last_child = None
        return last_child
    # BS3: Not part of the API!
    _lastRecursiveChild = _last_descendant

    def insert(self, position, new_child):
        """Insert `new_child` into this element's contents at `position`.

        A plain string is converted to a NavigableString. An element
        that already has a parent is extracted from it first. Raises
        ValueError when `new_child` is this element.
        """
        if new_child is self:
            raise ValueError("Cannot insert a tag into itself.")
        if (isinstance(new_child, basestring)
            and not isinstance(new_child, NavigableString)):
            new_child = NavigableString(new_child)

        position = min(position, len(self.contents))
        if hasattr(new_child, 'parent') and new_child.parent is not None:
            # We're 'inserting' an element that's already one
            # of this object's children.
            if new_child.parent is self:
                current_index = self.index(new_child)
                if current_index < position:
                    # We're moving this element further down the list
                    # of this object's children. That means that when
                    # we extract this element, our target index will
                    # jump down one.
                    position -= 1
            new_child.extract()

        new_child.parent = self
        previous_child = None
        if position == 0:
            new_child.previous_sibling = None
            new_child.previous_element = self
        else:
            previous_child = self.contents[position - 1]
            new_child.previous_sibling = previous_child
            new_child.previous_sibling.next_sibling = new_child
            new_child.previous_element = previous_child._last_descendant(False)
        if new_child.previous_element is not None:
            new_child.previous_element.next_element = new_child

        new_childs_last_element = new_child._last_descendant(False)

        if position >= len(self.contents):
            new_child.next_sibling = None

            parent = self
            parents_next_sibling = None
            while parents_next_sibling is None and parent is not None:
                parents_next_sibling = parent.next_sibling
                parent = parent.parent
                if parents_next_sibling is not None:
                    # We found the element that comes next in the document.
                    break
            if parents_next_sibling is not None:
                new_childs_last_element.next_element = parents_next_sibling
            else:
                # The last element of this tag is the last element in
                # the document.
                new_childs_last_element.next_element = None
        else:
            next_child = self.contents[position]
            new_child.next_sibling = next_child
            if new_child.next_sibling is not None:
                new_child.next_sibling.previous_sibling = new_child
            new_childs_last_element.next_element = next_child

        if new_childs_last_element.next_element is not None:
            new_childs_last_element.next_element.previous_element = new_childs_last_element
        self.contents.insert(position, new_child)

    def append(self, tag):
        """Appends the given tag to the contents of this tag."""
        self.insert(len(self.contents), tag)

    def insert_before(self, predecessor):
        """Makes the given element the immediate predecessor of this one.

        The two elements will have the same parent, and the given element
        will be immediately before this one.
        """
        if self is predecessor:
            raise ValueError("Can't insert an element before itself.")
        parent = self.parent
        if parent is None:
            raise ValueError(
                "Element has no parent, so 'before' has no meaning.")
        # Extract first so that the index won't be screwed up if they
        # are siblings.
        if isinstance(predecessor, PageElement):
            predecessor.extract()
        index = parent.index(self)
        parent.insert(index, predecessor)

    def insert_after(self, successor):
        """Makes the given element the immediate successor of this one.

        The two elements will have the same parent, and the given element
        will be immediately after this one.
        """
        if self is successor:
            raise ValueError("Can't insert an element after itself.")
        parent = self.parent
        if parent is None:
            raise ValueError(
                "Element has no parent, so 'after' has no meaning.")
        # Extract first so that the index won't be screwed up if they
        # are siblings.
        if isinstance(successor, PageElement):
            successor.extract()
        index = parent.index(self)
        parent.insert(index+1, successor)

    def find_next(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears after this Tag in the document."""
        return self._find_one(self.find_all_next, name, attrs, text, **kwargs)
    findNext = find_next  # BS3

    def find_all_next(self, name=None, attrs={}, text=None, limit=None,
                    **kwargs):
        """Returns all items that match the given criteria and appear
        after this Tag in the document."""
        return self._find_all(name, attrs, text, limit, self.next_elements,
                             **kwargs)
    findAllNext = find_all_next  # BS3

    def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears after this Tag in the document."""
        return self._find_one(self.find_next_siblings, name, attrs, text,
                             **kwargs)
    findNextSibling = find_next_sibling  # BS3

    def find_next_siblings(self, name=None, attrs={}, text=None, limit=None,
                           **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear after this Tag in the document."""
        return self._find_all(name, attrs, text, limit,
                              self.next_siblings, **kwargs)
    findNextSiblings = find_next_siblings   # BS3
    fetchNextSiblings = find_next_siblings  # BS2

    def find_previous(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears before this Tag in the document."""
        return self._find_one(
            self.find_all_previous, name, attrs, text, **kwargs)
    findPrevious = find_previous  # BS3

    def find_all_previous(self, name=None, attrs={}, text=None, limit=None,
                        **kwargs):
        """Returns all items that match the given criteria and appear
        before this Tag in the document."""
        return self._find_all(name, attrs, text, limit, self.previous_elements,
                           **kwargs)
    findAllPrevious = find_all_previous  # BS3
    fetchPrevious = find_all_previous    # BS2

    def find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears before this Tag in the document."""
        return self._find_one(self.find_previous_siblings, name, attrs, text,
                             **kwargs)
    findPreviousSibling = find_previous_sibling  # BS3

    def find_previous_siblings(self, name=None, attrs={}, text=None,
                               limit=None, **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear before this Tag in the document."""
        return self._find_all(name, attrs, text, limit,
                              self.previous_siblings, **kwargs)
    findPreviousSiblings = find_previous_siblings   # BS3
    fetchPreviousSiblings = find_previous_siblings  # BS2

    def find_parent(self, name=None, attrs={}, **kwargs):
        """Returns the closest parent of this Tag that matches the given
        criteria."""
        # NOTE: We can't use _find_one because findParents takes a different
        # set of arguments.
        r = None
        l = self.find_parents(name, attrs, 1, **kwargs)
        if l:
            r = l[0]
        return r
    findParent = find_parent  # BS3

    def find_parents(self, name=None, attrs={}, limit=None, **kwargs):
        """Returns the parents of this Tag that match the given
        criteria."""

        return self._find_all(name, attrs, None, limit, self.parents,
                             **kwargs)
    findParents = find_parents   # BS3
    fetchParents = find_parents  # BS2

    @property
    def next(self):
        """The value of `next_element`."""
        return self.next_element

    @property
    def previous(self):
        """The value of `previous_element`."""
        return self.previous_element

    #These methods do the real heavy lifting.

    def _find_one(self, method, name, attrs, text, **kwargs):
        """Call `method` with a limit of 1 and return the first result,
        or None when the result is empty."""
        r = None
        l = method(name, attrs, text, 1, **kwargs)
        if l:
            r = l[0]
        return r

    def _find_all(self, name, attrs, text, limit, generator, **kwargs):
        "Iterates over a generator looking for things that match."

        if isinstance(name, SoupStrainer):
            strainer = name
        else:
            strainer = SoupStrainer(name, attrs, text, **kwargs)

        if text is None and not limit and not attrs and not kwargs:
            if name is True or name is None:
                # Optimization to find all tags.
                result = (element for element in generator
                          if isinstance(element, Tag))
                return ResultSet(strainer, result)
            elif isinstance(name, basestring):
                # Optimization to find all tags with a given name.
                result = (element for element in generator
                          if isinstance(element, Tag)
                            and element.name == name)
                return ResultSet(strainer, result)
        results = ResultSet(strainer)
        for i in generator:
            if i:
                found = strainer.search(i)
                if found:
                    results.append(found)
                    if limit and len(results) >= limit:
                        break
        return results

    #These generators can be used to navigate starting from both
    #NavigableStrings and Tags.
    @property
    def next_elements(self):
        """Yield each element reached by following `next_element`."""
        i = self.next_element
        while i is not None:
            yield i
            i = i.next_element

    @property
    def next_siblings(self):
        """Yield each element reached by following `next_sibling`."""
        i = self.next_sibling
        while i is not None:
            yield i
            i = i.next_sibling

    @property
    def previous_elements(self):
        """Yield each element reached by following `previous_element`."""
        i = self.previous_element
        while i is not None:
            yield i
            i = i.previous_element

    @property
    def previous_siblings(self):
        """Yield each element reached by following `previous_sibling`."""
        i = self.previous_sibling
        while i is not None:
            yield i
            i = i.previous_sibling

    @property
    def parents(self):
        """Yield each element reached by following `parent`."""
        i = self.parent
        while i is not None:
            yield i
            i = i.parent

    # Methods for supporting CSS selectors.

    tag_name_re = re.compile('^[a-z0-9]+$')

    # /^(\w+)\[(\w+)([=~\|\^\$\*]?)=?"?([^\]"]*)"?\]$/
    #   \---/  \---/\-------------/    \-------/
    #     |      |         |               |
    #     |      |         |           The value
    #     |      |    ~,|,^,$,* or =
    #     |   Attribute
    #    Tag
    attribselect_re = re.compile(
        r'^(?P<tag>\w+)?\[(?P<attribute>\w+)(?P<operator>[=~\|\^\$\*]?)' +
        r'=?"?(?P<value>[^\]"]*)"?\]$'
        )

    def _attr_value_as_string(self, value, default=None):
        """Force an attribute value into a string representation.

        A multi-valued attribute will be converted into a
        space-separated string.
        """
        value = self.get(value, default)
        if isinstance(value, (list, tuple)):
            value = " ".join(value)
        return value

    def _tag_name_matches_and(self, function, tag_name):
        """Return `function` unchanged when `tag_name` is empty;
        otherwise return a predicate that also requires the tag's name
        to equal `tag_name`."""
        if not tag_name:
            return function
        else:
            def _match(tag):
                return tag.name == tag_name and function(tag)
            return _match

    def _attribute_checker(self, operator, attribute, value=''):
        """Create a function that performs a CSS selector operation.

        Takes an operator, attribute and optional value. Returns a
        function that will return True for elements that match that
        combination.
        """
        if operator == '=':
            # string representation of `attribute` is equal to `value`
            return lambda el: el._attr_value_as_string(attribute) == value
        elif operator == '~':
            # space-separated list representation of `attribute`
            # contains `value`
            def _includes_value(element):
                attribute_value = element.get(attribute, [])
                if not isinstance(attribute_value, list):
                    attribute_value = attribute_value.split()
                return value in attribute_value
            return _includes_value
        elif operator == '^':
            # string representation of `attribute` starts with `value`
            return lambda el: el._attr_value_as_string(
                attribute, '').startswith(value)
        elif operator == '$':
            # string representation of `attribute` ends with `value`
            return lambda el: el._attr_value_as_string(
                attribute, '').endswith(value)
        elif operator == '*':
            # string representation of `attribute` contains `value`
            return lambda el: value in el._attr_value_as_string(attribute, '')
        elif operator == '|':
            # string representation of `attribute` is either exactly
            # `value` or starts with `value` and then a dash.
            def _is_or_starts_with_dash(element):
                attribute_value = element._attr_value_as_string(attribute, '')
                return (attribute_value == value or attribute_value.startswith(
                        value + '-'))
            return _is_or_starts_with_dash
        else:
            return lambda el: el.has_attr(attribute)

    # Old non-property versions of the generators, for backwards
    # compatibility with BS3.
    def nextGenerator(self):
        return self.next_elements

    def nextSiblingGenerator(self):
        return self.next_siblings

    def previousGenerator(self):
        return self.previous_elements

    def previousSiblingGenerator(self):
        return self.previous_siblings

    def parentGenerator(self):
        return self.parents

641

642

643

class NavigableString(unicode, PageElement):
    """A string that is also a PageElement."""

    # Text placed before and after the string by output_ready().
    PREFIX = ''
    SUFFIX = ''

    def __new__(cls, value):
        """Create a new NavigableString.

        When unpickling a NavigableString, this method is called with
        the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
        passed in to the superclass's __new__ or the superclass won't know
        how to handle non-ASCII characters.
        """
        if isinstance(value, unicode):
            return unicode.__new__(cls, value)
        return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)

    def __copy__(self):
        """Return this same object; no new string is created."""
        return self

    def __getnewargs__(self):
        """Return the plain unicode value as the sole argument for
        __new__."""
        return (unicode(self),)

    def __getattr__(self, attr):
        """text.string gives you text. This is for backwards
        compatibility for Navigable*String, but for CData* it lets you
        get the string without the CData wrapper."""
        if attr == 'string':
            return self
        else:
            raise AttributeError(
                "'%s' object has no attribute '%s'" % (
                    self.__class__.__name__, attr))

    def output_ready(self, formatter="minimal"):
        """Run this string through `formatter` and return the result
        wrapped in PREFIX and SUFFIX."""
        output = self.format_string(self, formatter)
        return self.PREFIX + output + self.SUFFIX

    @property
    def name(self):
        """Always None."""
        return None

    @name.setter
    def name(self, name):
        raise AttributeError("A NavigableString cannot be given a name.")

688

689

class PreformattedString(NavigableString):
    """A NavigableString not subject to the normal formatting rules.

    The string will be passed into the formatter (to trigger side effects),
    but the return value will be ignored.
    """

    def output_ready(self, formatter="minimal"):
        """CData strings are passed into the formatter.
        But the return value is ignored."""
        self.format_string(self, formatter)
        return self.PREFIX + self + self.SUFFIX

701

702

class CData(PreformattedString):
    """A PreformattedString wrapped in '<![CDATA[' and ']]>'."""

    PREFIX = u'<![CDATA['
    SUFFIX = u']]>'

706

707

class ProcessingInstruction(PreformattedString):
    """A PreformattedString wrapped in '<?' and '?>'."""

    PREFIX = u'<?'
    SUFFIX = u'?>'

711

712

class Comment(PreformattedString):
    """A PreformattedString wrapped in '<!--' and '-->'."""

    PREFIX = u'<!--'
    SUFFIX = u'-->'

716

717

718

class Declaration(PreformattedString):
    """A PreformattedString wrapped in '<!' and '!>'."""
    # NOTE(review): the '!>' suffix looks unusual for a declaration --
    # confirm it is intended before relying on the rendered form.
    PREFIX = u'<!'
    SUFFIX = u'!>'

721

722

723

class Doctype(PreformattedString):
    """A PreformattedString wrapped in '<!DOCTYPE ' and '>' plus a
    newline."""

    @classmethod
    def for_name_and_ids(cls, name, pub_id, system_id):
        """Build a Doctype from a name and optional identifiers.

        The value is `name` (or '' when it is falsy), followed by
        ' PUBLIC "<pub_id>"' and, if given, ' "<system_id>"'; or by
        ' SYSTEM "<system_id>"' when only a system id is given.
        """
        value = name or ''
        if pub_id is not None:
            value += ' PUBLIC "%s"' % pub_id
            if system_id is not None:
                value += ' "%s"' % system_id
        elif system_id is not None:
            value += ' SYSTEM "%s"' % system_id

        return Doctype(value)

    PREFIX = u'<!DOCTYPE '
    SUFFIX = u'>\n'

739

740

741

class Tag(PageElement):

742

743

"""Represents a found HTML tag with its attributes and contents."""

744

745

def __init__(self, parser=None, builder=None, name=None, namespace=None,
             prefix=None, attrs=None, parent=None, previous=None):
    """Basic constructor.

    Only the class of `parser` is stored. `attrs` is copied, or passed
    through the builder's cdata-list-attribute replacement when a
    builder that defines such attributes is supplied. `parent` and
    `previous` are handed to setup(). Raises ValueError when `name` is
    None.
    """

    if parser is None:
        self.parser_class = None
    else:
        # We don't actually store the parser object: that lets extracted
        # chunks be garbage-collected.
        self.parser_class = parser.__class__
    if name is None:
        raise ValueError("No value provided for new tag's name.")
    self.name = name
    self.namespace = namespace
    self.prefix = prefix
    if attrs is None:
        attrs = {}
    elif attrs and builder is not None and builder.cdata_list_attributes:
        # Only consult the builder when there is one; a tag built
        # without a builder but with attributes used to fail here with
        # an AttributeError on None.
        attrs = builder._replace_cdata_list_attribute_values(
            self.name, attrs)
    else:
        attrs = dict(attrs)
    self.attrs = attrs
    self.contents = []
    self.setup(parent, previous)
    self.hidden = False

    # Set up any substitutions, such as the charset in a META tag.
    if builder is not None:
        builder.set_up_substitutions(self)
        self.can_be_empty_element = builder.can_be_empty_element(name)
    else:
        self.can_be_empty_element = False

778

779

# Old camelCase name for the parser_class attribute.
parserClass = _alias("parser_class") # BS3

780

781

@property
def is_empty_element(self):
    """Is this tag an empty-element tag? (aka a self-closing tag)

    A tag that has contents is never an empty-element tag.

    A tag that has no contents may or may not be an empty-element
    tag. It depends on the builder used to create the tag. If the
    builder has a designated list of empty-element tags, then only
    a tag whose name shows up in that list is considered an
    empty-element tag.

    If the builder has no designated list of empty-element tags,
    then any tag with no contents is an empty-element tag.
    """
    return len(self.contents) == 0 and self.can_be_empty_element
isSelfClosing = is_empty_element  # BS3

798

799

@property
def string(self):
    """Convenience property to get the single string within this tag.

    :Return: If this tag has a single string child, return value
     is that string. If this tag has no children, or more than one
     child, return value is None. If this tag has one child tag,
     return value is the 'string' attribute of the child tag,
     recursively.
    """
    if len(self.contents) != 1:
        return None
    child = self.contents[0]
    if isinstance(child, NavigableString):
        return child
    return child.string

@string.setter
def string(self, string):
    # Replace all current children with one new object of the same
    # class as the value passed in.
    self.clear()
    self.append(string.__class__(string))

820

821

def _all_strings(self, strip=False, types=(NavigableString, CData)):
    """Yield all strings of certain classes, possibly stripping them.

    By default, yields only NavigableString and CData objects. So
    no comments, processing instructions, etc.

    When `types` is None, every NavigableString descendant is yielded.
    When `strip` is true, each string is stripped and strings that
    become empty are skipped.
    """
    for descendant in self.descendants:
        # The exact-type test is deliberate: an isinstance() check
        # would also admit subclasses of the listed types.
        if (
            (types is None and not isinstance(descendant, NavigableString))
            or
            (types is not None and type(descendant) not in types)):
            continue
        if strip:
            descendant = descendant.strip()
            if len(descendant) == 0:
                continue
        yield descendant

strings = property(_all_strings)

840

841

@property
def stripped_strings(self):
    """Yield the strings produced by _all_strings() with stripping
    turned on."""
    for string in self._all_strings(True):
        yield string

845

846

def get_text(self, separator=u"", strip=False,
             types=(NavigableString, CData)):
    """
    Get all child strings, concatenated using the given separator.

    `strip` and `types` are passed through to _all_strings().
    """
    # join() consumes the generator directly; no intermediate list is
    # needed.
    return separator.join(self._all_strings(strip, types=types))
getText = get_text
text = property(get_text)

855

856

def decompose(self):
    """Recursively destroys the contents of this tree."""
    self.extract()
    i = self
    while i is not None:
        # Read the link before clearing the instance dict, which is
        # where it is stored.
        following = i.next_element
        i.__dict__.clear()
        i.contents = []
        i = following

865

866

def clear(self, decompose=False):
    """
    Extract all children. If decompose is True, decompose instead.
    """
    # Iterate over a copy: extracting a child changes self.contents.
    if decompose:
        for element in self.contents[:]:
            if isinstance(element, Tag):
                element.decompose()
            else:
                element.extract()
    else:
        for element in self.contents[:]:
            element.extract()

879

880

def index(self, element):
    """
    Find the index of a child by identity, not value. Avoids issues with
    tag.contents.index(element) getting the index of equal elements.

    Raises ValueError when `element` is not one of the children.
    """
    for i, child in enumerate(self.contents):
        if child is element:
            return i
    raise ValueError("Tag.index: element not in tag")

889

890

def get(self, key, default=None):
    """Returns the value of the 'key' attribute for the tag, or
    the value given for 'default' if it doesn't have that
    attribute."""
    return self.attrs.get(key, default)

895

896

def has_attr(self, key):
    """Return True if the tag has an attribute named `key`."""
    return key in self.attrs

898

899

def __hash__(self):
    """Hash the tag by the hash of its str() rendering."""
    return hash(str(self))

901

902

def __getitem__(self, key):
    """tag[key] returns the value of the 'key' attribute for the tag,
    and throws an exception if it's not there."""
    return self.attrs[key]

906

907

def __iter__(self):
    "Iterating over a tag iterates over its contents."
    return iter(self.contents)

910

911

def __len__(self):
    "The length of a tag is the length of its list of contents."
    return len(self.contents)

914

915

def __contains__(self, x):
    """Return True if `x` is in this tag's list of contents."""
    return x in self.contents

917

918

def __nonzero__(self):
    "A tag is non-None even if it has no contents."
    return True

921

922

def __setitem__(self, key, value):
    """Setting tag[key] sets the value of the 'key' attribute for the
    tag."""
    self.attrs[key] = value

926

927

def __delitem__(self, key):
    "Deleting tag[key] deletes all 'key' attributes for the tag."
    # pop() with a default makes deleting a missing key a no-op.
    self.attrs.pop(key, None)

930

931

def __call__(self, *args, **kwargs):
    """Calling a tag like a function is the same as calling its
    find_all() method. Eg. tag('a') returns a list of all the A tags
    found within this tag."""
    return self.find_all(*args, **kwargs)

936

937

def __getattr__(self, tag):
    """Treat an unknown attribute name as a tag name to search for.

    A name ending in 'Tag' (and longer than three characters) triggers
    a deprecation warning and a find() on the name without that
    suffix. Any other name that does not start with '__' and is not
    'contents' is passed to find(). Everything else raises
    AttributeError.
    """
    if len(tag) > 3 and tag.endswith('Tag'):
        # BS3: soup.aTag -> "soup.find("a")
        tag_name = tag[:-3]
        warnings.warn(
            '.%sTag is deprecated, use .find("%s") instead.' % (
                tag_name, tag_name))
        return self.find(tag_name)
    # We special case contents to avoid recursion.
    elif not tag.startswith("__") and not tag=="contents":
        return self.find(tag)
    raise AttributeError(
        "'%s' object has no attribute '%s'" % (self.__class__, tag))

951

952

def __eq__(self, other):
    """Two tags are equal when they have the same name, the same
    attributes and, position by position, equal contents (which makes
    the comparison recursive for nested tags)."""
    if self is other:
        return True

    # Anything that does not look like a tag can never be equal.
    for required in ('name', 'attrs', 'contents'):
        if not hasattr(other, required):
            return False

    if self.name != other.name:
        return False
    if self.attrs != other.attrs:
        return False
    if len(self) != len(other):
        return False

    for position, mine in enumerate(self.contents):
        theirs = other.contents[position]
        if mine != theirs:
            return False
    return True

968

969

def __ne__(self, other):
    """The exact negation of the == comparison between this tag and
    'other'."""
    same = (self == other)
    return not same

973

974

def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
    """Render this tag by calling encode() with the given encoding."""
    rendered = self.encode(encoding)
    return rendered

977

978

def __unicode__(self):
    """Render this tag via decode() with its default arguments."""
    rendered = self.decode()
    return rendered

980

981

def __str__(self):
    """Render this tag via encode() with its default arguments."""
    rendered = self.encode()
    return rendered

983

984

if PY3K:
    # When the PY3K flag is set, both str() and repr() reuse the
    # decode()-based renderer instead of the encode()-based ones.
    __repr__ = __unicode__
    __str__ = __unicode__

986

987

def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
           indent_level=None, formatter="minimal",
           errors="xmlcharrefreplace"):
    """Render this tag with decode(), then encode the result.

    :param encoding: Target encoding; also handed to decode() as the
     eventual encoding.
    :param indent_level: Passed through to decode().
    :param formatter: Passed through to decode().
    :param errors: Error-handling scheme given to the final .encode()
     call.
    """
    # Build the Unicode form first; only the last step deals in bytes.
    as_unicode = self.decode(indent_level, encoding, formatter)
    return as_unicode.encode(encoding, errors)

994

995

def _should_pretty_print(self, indent_level):
    """Decide whether this tag gets pretty-printed.

    Never when indent_level is None. Otherwise yes, unless the tag's
    name is listed in HTMLAwareEntitySubstitution.preformatted_tags,
    in which case the answer is the value of self._is_xml.
    """
    if indent_level is None:
        return False
    if self.name not in HTMLAwareEntitySubstitution.preformatted_tags:
        return True
    return self._is_xml

1001

1002

def decode(self, indent_level=None,
           eventual_encoding=DEFAULT_OUTPUT_ENCODING,
           formatter="minimal"):
    """Returns a Unicode representation of this tag and its contents.

    :param indent_level: None for compact output; otherwise the
     indentation level used when laying out this tag.
    :param eventual_encoding: The tag is destined to be
     encoded into this encoding. This method is _not_
     responsible for performing that encoding. This information
     is passed in so that it can be substituted in if the
     document contains a <META> tag that mentions the document's
     encoding.
    :param formatter: A callable, or a name that is resolved once
     through self._formatter_for_name().
    """
    # Resolve a formatter name up front so that the lookup is not
    # repeated for every attribute and every child.
    if not callable(formatter):
        formatter = self._formatter_for_name(formatter)

    # Render each attribute as 'key' or 'key="value"', sorted by key.
    rendered_attrs = []
    if self.attrs:
        for key, val in sorted(self.attrs.items()):
            if val is None:
                # A valueless attribute is written as its bare name.
                rendered_attrs.append(key)
                continue
            if isinstance(val, (list, tuple)):
                val = ' '.join(val)
            elif not isinstance(val, basestring):
                val = unicode(val)
            elif (eventual_encoding is not None
                  and isinstance(val, AttributeValueWithCharsetSubstitution)):
                # Charset stand-ins substitute the eventual encoding.
                val = val.encode(eventual_encoding)

            quoted = EntitySubstitution.quoted_attribute_value(
                self.format_string(val, formatter))
            rendered_attrs.append(unicode(key) + '=' + quoted)

    if self.prefix:
        prefix = self.prefix + ":"
    else:
        prefix = ''

    # An empty-element tag gets a trailing slash and no closing tag.
    if self.is_empty_element:
        close, closeTag = '/', ''
    else:
        close, closeTag = '', '</%s%s>' % (prefix, self.name)

    pretty_print = self._should_pretty_print(indent_level)
    if indent_level is None:
        indent_space = ''
    else:
        indent_space = (' ' * (indent_level - 1))
    if pretty_print:
        indent_contents = indent_level + 1
    else:
        indent_contents = None
    contents = self.decode_contents(
        indent_contents, eventual_encoding, formatter)

    if self.hidden:
        # This is the 'document root' object: only its contents show.
        return contents

    pieces = []
    if indent_level is not None:
        # Even if this particular tag is not pretty-printed,
        # we should indent up to the start of the tag.
        pieces.append(indent_space)
    if rendered_attrs:
        attribute_string = ' ' + ' '.join(rendered_attrs)
    else:
        attribute_string = ''
    pieces.append('<%s%s%s%s>' % (
        prefix, self.name, attribute_string, close))
    if pretty_print:
        pieces.append("\n")
    pieces.append(contents)
    if pretty_print:
        if contents and contents[-1] != "\n":
            pieces.append("\n")
        if closeTag:
            pieces.append(indent_space)
    pieces.append(closeTag)
    if indent_level is not None and closeTag and self.next_sibling:
        # Even if this particular tag is not pretty-printed,
        # we're now done with the tag, and we should add a
        # newline if appropriate.
        pieces.append("\n")
    return ''.join(pieces)

1094

1095

def prettify(self, encoding=None, formatter="minimal"):
    """Render this tag with pretty-printing switched on.

    Without an encoding the result comes from decode(); with one it
    comes from encode(). Either way True is passed as the indent
    level.
    """
    if encoding is not None:
        return self.encode(encoding, True, formatter=formatter)
    return self.decode(True, formatter=formatter)

1100

1101

def decode_contents(self, indent_level=None,
                    eventual_encoding=DEFAULT_OUTPUT_ENCODING,
                    formatter="minimal"):
    """Renders the contents of this tag as a Unicode string.

    :param indent_level: None for compact output; otherwise the
     indentation level applied to each child.
    :param eventual_encoding: The tag is destined to be
     encoded into this encoding. This method is _not_
     responsible for performing that encoding. This information
     is passed in so that it can be substituted in if the
     document contains a <META> tag that mentions the document's
     encoding.
    :param formatter: A callable, or a name that is resolved once
     through self._formatter_for_name().
    """
    # Resolve a formatter name once rather than once per child.
    if not callable(formatter):
        formatter = self._formatter_for_name(formatter)

    pretty_print = indent_level is not None
    pieces = []
    for child in self:
        if isinstance(child, NavigableString):
            text = child.output_ready(formatter)
            # Outside <pre>, surrounding whitespace is dropped when a
            # non-zero indent level is in effect.
            if text and indent_level and not self.name == 'pre':
                text = text.strip()
            if not text:
                continue
            lay_out = pretty_print and not self.name == 'pre'
            if lay_out:
                pieces.append(" " * (indent_level - 1))
            pieces.append(text)
            if lay_out:
                pieces.append("\n")
        elif isinstance(child, Tag):
            pieces.append(child.decode(indent_level, eventual_encoding,
                                       formatter))
    return ''.join(pieces)

1136

1137

def encode_contents(
    self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
    formatter="minimal"):
    """Render this tag's contents with decode_contents(), then encode
    the result into 'encoding'."""
    as_unicode = self.decode_contents(indent_level, encoding, formatter)
    return as_unicode.encode(encoding)

1143

1144

# Old method for BS3 compatibility

1145

def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                   prettyPrint=False, indentLevel=0):
    """BS3-style wrapper around encode_contents().

    indentLevel is only honoured when prettyPrint is true; otherwise
    None is passed as the indent level.
    """
    if prettyPrint:
        level = indentLevel
    else:
        level = None
    return self.encode_contents(
        indent_level=level, encoding=encoding)

1151

1152

#Soup methods

1153

1154

def find(self, name=None, attrs={}, recursive=True, text=None,
         **kwargs):
    """Return the first match for the given criteria, or None.

    The search is delegated to find_all() with a limit of 1."""
    matches = self.find_all(name, attrs, recursive, text, 1, **kwargs)
    if matches:
        return matches[0]
    return None
findChild = find

1164

1165

def find_all(self, name=None, attrs={}, recursive=True, text=None,
             limit=None, **kwargs):
    """Search below this tag and return whatever _find_all() produces
    for the given criteria. You can specify the name of the Tag and
    any attributes you want the Tag to have.

    The value of a key-value pair in the 'attrs' map can be a
    string, a list of strings, a regular expression object, or a
    callable that takes a string and returns whether or not the
    string matches for some custom definition of 'matches'. The
    same is true of the tag name.

    With recursive=True the candidates are this tag's descendants;
    otherwise only its children."""
    if recursive:
        candidates = self.descendants
    else:
        candidates = self.children
    return self._find_all(name, attrs, text, limit, candidates, **kwargs)
findAll = find_all       # BS3
findChildren = find_all  # BS2

1183

1184

#Generator methods

1185

@property
def children(self):
    """An iterator over this tag's contents list."""
    # Hand back an iterator rather than the list itself so the
    # purpose of the property is clear. XXX This seems to be untested.
    direct_children = self.contents
    return iter(direct_children)

1189

1190

@property
def descendants(self):
    """Generate every node below this tag by following next_element
    links, starting at the first child and stopping just past the
    node reported by _last_descendant()."""
    if len(self.contents) == 0:
        return
    # The node after the last descendant marks the end of the walk.
    sentinel = self._last_descendant().next_element
    node = self.contents[0]
    while node is not sentinel:
        yield node
        node = node.next_element

1199

1200

# CSS selector code

1201

1202

_selector_combinators = ['>', '+', '~']
_select_debug = False
def select(self, selector, _candidate_generator=None):
    """Perform a CSS selection operation on the current element.

    :param selector: A CSS selector made of whitespace-separated
     tokens. Each token is an attribute selector (as recognised by
     self.attribselect_re), 'tag#id', 'tag.class', 'tag:nth-of-type(N)',
     '*', a bare tag name, or one of the combinators '>', '+', '~'.
    :param _candidate_generator: Internal. A callable that takes a tag
     and yields the candidates to test; it is supplied when select()
     calls itself to resolve a combinator.
    :return: A list of the matching tags, without duplicates.
    :raises ValueError: The selector ends in a combinator, contains a
     token that cannot be parsed, has a pseudo-class without a tag
     name, or asks for an nth-of-type below 1.
    :raises NotImplementedError: A pseudo-class other than nth-of-type
     is used, or nth-of-type is given a non-numeric argument.
    """
    tokens = selector.split()
    current_context = [self]

    if tokens[-1] in self._selector_combinators:
        raise ValueError(
            'Final combinator "%s" is missing an argument.' % tokens[-1])
    if self._select_debug:
        print('Running CSS selector "%s"' % selector)
    for index, token in enumerate(tokens):
        if self._select_debug:
            print(' Considering token "%s"' % token)
        recursive_candidate_generator = None
        tag_name = None
        # For index 0 this looks at the final token, which the check
        # above guarantees is not a combinator.
        if tokens[index-1] in self._selector_combinators:
            # This token was consumed by the previous combinator. Skip it.
            if self._select_debug:
                print(' Token was consumed by the previous combinator.')
            continue
        # Each operation corresponds to a checker function, a rule
        # for determining whether a candidate matches the
        # selector. Candidates are generated by the active
        # iterator.
        checker = None

        m = self.attribselect_re.match(token)
        if m is not None:
            # Attribute selector
            tag_name, attribute, operator, value = m.groups()
            checker = self._attribute_checker(operator, attribute, value)

        elif '#' in token:
            # ID selector
            tag_name, tag_id = token.split('#', 1)
            def id_matches(tag):
                return tag.get('id', None) == tag_id
            checker = id_matches

        elif '.' in token:
            # Class selector
            tag_name, klass = token.split('.', 1)
            classes = set(klass.split('.'))
            def classes_match(candidate):
                return classes.issubset(candidate.get('class', []))
            checker = classes_match

        elif ':' in token:
            # Pseudo-class
            tag_name, pseudo = token.split(':', 1)
            if tag_name == '':
                raise ValueError(
                    "A pseudo-class must be prefixed with a tag name.")
            # Matches "name(argument)". The parentheses must be
            # escaped: they are literal characters of the selector,
            # not regex grouping.
            pseudo_attributes = re.match(
                r'([a-zA-Z\d-]+)\(([a-zA-Z\d]+)\)', pseudo)
            if pseudo_attributes is not None:
                pseudo_type, pseudo_value = pseudo_attributes.groups()
                if pseudo_type == 'nth-of-type':
                    try:
                        pseudo_value = int(pseudo_value)
                    except ValueError:
                        raise NotImplementedError(
                            'Only numeric values are currently supported for the nth-of-type pseudo-class.')
                    if pseudo_value < 1:
                        raise ValueError(
                            'nth-of-type pseudo-class value must be at least 1.')
                    class Counter(object):
                        def __init__(self, destination):
                            self.count = 0
                            self.destination = destination

                        def nth_child_of_type(self, tag):
                            self.count += 1
                            if self.count == self.destination:
                                return True
                            if self.count > self.destination:
                                # Stop the generator that's sending us
                                # these things.
                                raise StopIteration()
                            return False
                    checker = Counter(pseudo_value).nth_child_of_type
                else:
                    raise NotImplementedError(
                        'Only the following pseudo-classes are implemented: nth-of-type.')

        elif token == '*':
            # Star selector -- matches everything
            pass
        elif token == '>':
            # Run the next token as a CSS selector against the
            # direct children of each tag in the current context.
            recursive_candidate_generator = lambda tag: tag.children
        elif token == '~':
            # Run the next token as a CSS selector against the
            # siblings of each tag in the current context.
            recursive_candidate_generator = lambda tag: tag.next_siblings
        elif token == '+':
            # For each tag in the current context, run the next
            # token as a CSS selector against the tag's next
            # sibling that's a tag.
            def next_tag_sibling(tag):
                yield tag.find_next_sibling(True)
            recursive_candidate_generator = next_tag_sibling

        elif self.tag_name_re.match(token):
            # Just a tag name.
            tag_name = token
        else:
            raise ValueError(
                'Unsupported or invalid CSS selector: "%s"' % token)

        if recursive_candidate_generator:
            # This happens when the selector looks like "> foo".
            #
            # The generator calls select() recursively on every
            # member of the current context, passing in a different
            # candidate generator and a different selector.
            #
            # In the case of "> foo", the candidate generator is
            # one that yields a tag's direct children (">"), and
            # the selector is "foo".
            next_token = tokens[index+1]
            def recursive_select(tag):
                if self._select_debug:
                    print(' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs))
                    print('-' * 40)
                for i in tag.select(next_token, recursive_candidate_generator):
                    if self._select_debug:
                        print('(Recursive select picked up candidate %s %s)' % (i.name, i.attrs))
                    yield i
                if self._select_debug:
                    print('-' * 40)
            _use_candidate_generator = recursive_select
        elif _candidate_generator is None:
            # By default, a tag's candidates are all of its
            # descendants. If tag_name is defined, only yield tags
            # with that name.
            if self._select_debug:
                if tag_name:
                    check = tag_name
                else:
                    check = "[any]"
                print(' Default candidate generator, tag name="%s"' % check)
            if self._select_debug:
                # This is redundant with later code, but it stops
                # a bunch of bogus tags from cluttering up the
                # debug log.
                def default_candidate_generator(tag):
                    for child in tag.descendants:
                        if not isinstance(child, Tag):
                            continue
                        if tag_name and not child.name == tag_name:
                            continue
                        yield child
                _use_candidate_generator = default_candidate_generator
            else:
                _use_candidate_generator = lambda tag: tag.descendants
        else:
            _use_candidate_generator = _candidate_generator

        new_context = []
        new_context_ids = set([])
        for tag in current_context:
            if self._select_debug:
                print(" Running candidate generator on %s %s" % (
                    tag.name, repr(tag.attrs)))
            for candidate in _use_candidate_generator(tag):
                if not isinstance(candidate, Tag):
                    continue
                if tag_name and candidate.name != tag_name:
                    continue
                if checker is not None:
                    try:
                        result = checker(candidate)
                    except StopIteration:
                        # The checker has decided we should no longer
                        # run the generator.
                        break
                if checker is None or result:
                    if self._select_debug:
                        print(" SUCCESS %s %s" % (candidate.name, repr(candidate.attrs)))
                    if id(candidate) not in new_context_ids:
                        # If a tag matches a selector more than once,
                        # don't include it in the context more than once.
                        new_context.append(candidate)
                        new_context_ids.add(id(candidate))
                elif self._select_debug:
                    print(" FAILURE %s %s" % (candidate.name, repr(candidate.attrs)))

        current_context = new_context

    if self._select_debug:
        print("Final verdict:")
        for i in current_context:
            print(" %s %s" % (i.name, i.attrs))
    return current_context

1400

1401

# Old names for backwards compatibility

1402

def childGenerator(self):
    """Old-style method spelling of the 'children' property."""
    direct_children = self.children
    return direct_children

1404

1405

def recursiveChildGenerator(self):
    """Old-style method spelling of the 'descendants' property."""
    everything_below = self.descendants
    return everything_below

1407

1408

def has_key(self, key):
    """Deprecated spelling of has_attr().

    This was kind of misleading because has_key() (attributes)
    was different from __in__ (contents). has_key() is gone in
    Python 3, anyway. Emits a warning, then answers via has_attr()."""
    message = 'has_key is deprecated. Use has_attr("%s") instead.' % (
        key)
    warnings.warn(message)
    return self.has_attr(key)

1415

1416

# Next, a couple classes to represent queries and their results.

1417

class SoupStrainer(object):
    """Encapsulates a number of ways of matching a markup element (tag or
    text)."""

    def __init__(self, name=None, attrs={}, text=None, **kwargs):
        """Store normalized search criteria.

        :param name: Criterion for the tag name.
        :param attrs: A dict of attribute criteria. Any non-dict value
         is treated as a criterion for the 'class' attribute.
        :param text: Criterion for text.
        :param kwargs: Further attribute criteria. 'class_' is accepted
         as a spelling of 'class'.

        Every criterion is passed through _normalize_search_value().
        """
        self.name = self._normalize_search_value(name)
        if not isinstance(attrs, dict):
            # Treat a non-dict value for attrs as a search for the 'class'
            # attribute.
            kwargs['class'] = attrs
            attrs = None

        if 'class_' in kwargs:
            # Treat class_="foo" as a search for the 'class'
            # attribute, overriding any non-dict value for attrs.
            kwargs['class'] = kwargs['class_']
            del kwargs['class_']

        if kwargs:
            if attrs:
                # Copy first so the caller's dict (or the shared
                # default) is never modified.
                attrs = attrs.copy()
                attrs.update(kwargs)
            else:
                attrs = kwargs
        normalized_attrs = {}
        for key, value in attrs.items():
            normalized_attrs[key] = self._normalize_search_value(value)

        self.attrs = normalized_attrs
        self.text = self._normalize_search_value(text)

    def _normalize_search_value(self, value):
        """Convert a search criterion into a form _matches() can use.

        Unicode strings, callables, objects with a 'match' attribute,
        booleans and None pass through untouched; bytestrings are
        decoded as UTF-8; other iterables become lists of normalized
        items; anything else becomes a Unicode string.
        """
        # Leave it alone if it's a Unicode string, a callable, a
        # regular expression, a boolean, or None.
        if (isinstance(value, unicode) or callable(value) or hasattr(value, 'match')
            or isinstance(value, bool) or value is None):
            return value

        # If it's a bytestring, convert it to Unicode, treating it as UTF-8.
        if isinstance(value, bytes):
            return value.decode("utf8")

        # If it's listlike, convert it into a list of strings.
        if hasattr(value, '__iter__'):
            new_value = []
            for v in value:
                if (hasattr(v, '__iter__') and not isinstance(v, bytes)
                    and not isinstance(v, unicode)):
                    # This is almost certainly the user's mistake. In the
                    # interests of avoiding infinite loops, we'll let
                    # it through as-is rather than doing a recursive call.
                    new_value.append(v)
                else:
                    new_value.append(self._normalize_search_value(v))
            return new_value

        # Otherwise, convert it into a Unicode string.
        # The unicode(str()) thing is so this will do the same thing on Python 2
        # and Python 3.
        return unicode(str(value))

    def __str__(self):
        """Describe the strainer: its text criterion when one is set,
        otherwise 'name|attrs'."""
        if self.text:
            if isinstance(self.text, basestring):
                return self.text
            # A regular expression, list, callable or True is not a
            # string; returning it as-is would make str() raise
            # TypeError.
            return str(self.text)
        else:
            return "%s|%s" % (self.name, self.attrs)

    def search_tag(self, markup_name=None, markup_attrs={}):
        """Check a tag against the name, attribute and text criteria.

        :param markup_name: Either a Tag object or a tag name.
        :param markup_attrs: The tag's attributes, as an object with a
         'get' method or as an iterable of (key, value) pairs. Ignored
         when markup_name is a Tag; the Tag itself is used instead.
        :return: The Tag (or the name that was passed in) on a match,
         otherwise None.
        """
        found = None
        markup = None
        if isinstance(markup_name, Tag):
            markup = markup_name
            markup_attrs = markup
        # A callable name criterion receives the raw name and
        # attributes, but only when we were not handed a Tag.
        call_function_with_tag_data = (
            callable(self.name)
            and not isinstance(markup_name, Tag))

        if ((not self.name)
            or call_function_with_tag_data
            or (markup and self._matches(markup, self.name))
            or (not markup and self._matches(markup_name, self.name))):
            if call_function_with_tag_data:
                match = self.name(markup_name, markup_attrs)
            else:
                match = True
                markup_attr_map = None
                for attr, match_against in list(self.attrs.items()):
                    if not markup_attr_map:
                        if hasattr(markup_attrs, 'get'):
                            markup_attr_map = markup_attrs
                        else:
                            markup_attr_map = {}
                            for k, v in markup_attrs:
                                markup_attr_map[k] = v
                    attr_value = markup_attr_map.get(attr)
                    if not self._matches(attr_value, match_against):
                        match = False
                        break
            if match:
                if markup:
                    found = markup
                else:
                    found = markup_name
        if found and self.text and not self._matches(found.string, self.text):
            found = None
        return found
    searchTag = search_tag

    def search(self, markup):
        """Match a piece of markup against this strainer.

        :param markup: A Tag, a string, or an iterable of elements.
        :return: The matching element, or None.
        :raises Exception: markup is of a kind this method does not
         know how to match.
        """
        # print 'looking for %s in %s' % (self, markup)
        found = None
        # If given a list of items, scan it for a text element that
        # matches.
        if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, basestring)):
            for element in markup:
                if isinstance(element, NavigableString) \
                       and self.search(element):
                    found = element
                    break
        # If it's a Tag, make sure its name or attributes match.
        # Don't bother with Tags if we're searching for text.
        elif isinstance(markup, Tag):
            if not self.text or self.name or self.attrs:
                found = self.search_tag(markup)
        # If it's text, make sure the text matches.
        elif isinstance(markup, NavigableString) or \
                 isinstance(markup, basestring):
            if not self.name and not self.attrs and self._matches(markup, self.text):
                found = markup
        else:
            raise Exception(
                "I don't know how to match against a %s" % markup.__class__)
        return found

    def _matches(self, markup, match_against):
        """Test one value against one normalized criterion.

        :param markup: A Tag, a string, None, or a list/tuple of
         strings (a multi-valued attribute).
        :param match_against: True, a callable, a Unicode string, an
         object with 'match' and 'search' methods, or an iterable of
         acceptable values.
        :return: A true value on a match and a false value otherwise;
         not necessarily a bool.
        """
        # print u"Matching %s against %s" % (markup, match_against)
        if isinstance(markup, list) or isinstance(markup, tuple):
            # This should only happen when searching a multi-valued attribute
            # like 'class'.
            if (isinstance(match_against, unicode)
                and ' ' in match_against):
                # A bit of a special case. If they try to match "foo
                # bar" on a multivalue attribute's value, only accept
                # the literal value "foo bar"
                #
                # XXX This is going to be pretty slow because we keep
                # splitting match_against. But it shouldn't come up
                # too often.
                return (whitespace_re.split(match_against) == markup)
            else:
                for item in markup:
                    if self._matches(item, match_against):
                        return True
                return False

        if match_against is True:
            # True matches any non-None value.
            return markup is not None

        if callable(match_against):
            return match_against(markup)

        # Custom callables take the tag as an argument, but all
        # other ways of matching match the tag name as a string.
        if isinstance(markup, Tag):
            markup = markup.name

        # Ensure that `markup` is either a Unicode string, or None.
        markup = self._normalize_search_value(markup)

        if markup is None:
            # None matches None, False, an empty string, an empty list, and so on.
            return not match_against

        if isinstance(match_against, unicode):
            # Exact string match
            return markup == match_against

        if hasattr(match_against, 'match'):
            # Regexp match
            return match_against.search(markup)

        if hasattr(match_against, '__iter__'):
            # The markup must be an exact match against something
            # in the iterable.
            return markup in match_against

        # No matching strategy applies to this kind of criterion.
        return False

1604

1605

1606

class ResultSet(list):
    """A plain list of results that also remembers, in its 'source'
    attribute, the SoupStrainer that produced it."""
    def __init__(self, source, result=()):
        # Fill the list first, then record where the results came from.
        list.__init__(self, result)
        self.source = source