3
"The Screen-Scraper's Friend"
4
http://www.crummy.com/software/BeautifulSoup/
6
Beautiful Soup parses a (possibly invalid) XML or HTML document into a
7
tree representation. It provides methods and Pythonic idioms that make
8
it easy to navigate, search, and modify the tree.
10
A well-formed XML/HTML document yields a well-formed data
11
structure. An ill-formed XML/HTML document yields a correspondingly
12
ill-formed data structure. If your document is only locally
13
well-formed, you can use this library to find and process the
14
well-formed part of it.
16
Beautiful Soup works with Python 2.2 and up. It has no external
17
dependencies, but you'll have more success at converting data to UTF-8
18
if you also install these three packages:
20
* chardet, for auto-detecting character encodings
21
http://chardet.feedparser.org/
22
* cjkcodecs and iconv_codec, which add more encodings to the ones supported
24
http://cjkpython.i18n.org/
26
Beautiful Soup defines classes for two main parsing strategies:
28
* BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific
29
language that kind of looks like XML.
31
* BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid
32
or invalid. This class has web browser-like heuristics for
33
obtaining a sensible parse tree in the face of common HTML errors.
35
Beautiful Soup also defines a class (UnicodeDammit) for autodetecting
36
the encoding of an HTML or XML document, and converting it to
37
Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser.
39
For more than you ever wanted to know about Beautiful Soup, see the
41
http://www.crummy.com/software/BeautifulSoup/documentation.html
43
Here, have some legalese:
45
Copyright (c) 2004-2007, Leonard Richardson
49
Redistribution and use in source and binary forms, with or without
50
modification, are permitted provided that the following conditions are
53
* Redistributions of source code must retain the above copyright
54
notice, this list of conditions and the following disclaimer.
56
* Redistributions in binary form must reproduce the above
57
copyright notice, this list of conditions and the following
58
disclaimer in the documentation and/or other materials provided
59
with the distribution.
61
* Neither the name of the Beautiful Soup Consortium and All
62
Night Kosher Bakery nor the names of its contributors may be
63
used to endorse or promote products derived from this software
64
without specific prior written permission.
66
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
67
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
68
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
69
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
70
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
71
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
72
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
73
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
74
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
75
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
76
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT.
79
from __future__ import generators
81
__author__ = "Leonard Richardson (leonardr@segfault.org)"
83
__copyright__ = "Copyright (c) 2004-2007 Leonard Richardson"
84
__license__ = "New-style BSD"
86
from sgmllib import SGMLParser, SGMLParseError
92
from htmlentitydefs import name2codepoint
96
#This hack makes Beautiful Soup able to parse XML with namespaces
97
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
99
DEFAULT_OUTPUT_ENCODING = "utf-8"
101
# First, the classes that represent markup elements.
104
"""Contains the navigational information for some part of the page
105
(either a tag or a piece of text)"""
107
def setup(self, parent=None, previous=None):
108
"""Sets up the initial relations between this element and
111
self.previous = previous
113
self.previousSibling = None
114
self.nextSibling = None
115
if self.parent and self.parent.contents:
116
self.previousSibling = self.parent.contents[-1]
117
self.previousSibling.nextSibling = self
119
def replaceWith(self, replaceWith):
120
oldParent = self.parent
121
myIndex = self.parent.contents.index(self)
122
if hasattr(replaceWith, 'parent') and replaceWith.parent == self.parent:
123
# We're replacing this element with one of its siblings.
124
index = self.parent.contents.index(replaceWith)
125
if index and index < myIndex:
126
# Furthermore, it comes before this element. That
127
# means that when we extract it, the index of this
128
# element will change.
129
myIndex = myIndex - 1
131
oldParent.insert(myIndex, replaceWith)
134
"""Destructively rips this element out of the tree."""
137
self.parent.contents.remove(self)
141
#Find the two elements that would be next to each other if
142
#this element (and any children) hadn't been parsed. Connect
144
lastChild = self._lastRecursiveChild()
145
nextElement = lastChild.next
148
self.previous.next = nextElement
150
nextElement.previous = self.previous
152
lastChild.next = None
155
if self.previousSibling:
156
self.previousSibling.nextSibling = self.nextSibling
158
self.nextSibling.previousSibling = self.previousSibling
159
self.previousSibling = self.nextSibling = None
161
def _lastRecursiveChild(self):
162
"Finds the last element beneath this object to be parsed."
164
while hasattr(lastChild, 'contents') and lastChild.contents:
165
lastChild = lastChild.contents[-1]
168
def insert(self, position, newChild):
169
if (isinstance(newChild, basestring)
170
or isinstance(newChild, unicode)) \
171
and not isinstance(newChild, NavigableString):
172
newChild = NavigableString(newChild)
174
position = min(position, len(self.contents))
175
if hasattr(newChild, 'parent') and newChild.parent != None:
176
# We're 'inserting' an element that's already one
177
# of this object's children.
178
if newChild.parent == self:
179
index = self.find(newChild)
180
if index and index < position:
181
# Furthermore we're moving it further down the
182
# list of this object's children. That means that
183
# when we extract this element, our target index
184
# will jump down one.
185
position = position - 1
188
newChild.parent = self
191
newChild.previousSibling = None
192
newChild.previous = self
194
previousChild = self.contents[position-1]
195
newChild.previousSibling = previousChild
196
newChild.previousSibling.nextSibling = newChild
197
newChild.previous = previousChild._lastRecursiveChild()
198
if newChild.previous:
199
newChild.previous.next = newChild
201
newChildsLastElement = newChild._lastRecursiveChild()
203
if position >= len(self.contents):
204
newChild.nextSibling = None
207
parentsNextSibling = None
208
while not parentsNextSibling:
209
parentsNextSibling = parent.nextSibling
210
parent = parent.parent
211
if not parent: # This is the last element in the document.
213
if parentsNextSibling:
214
newChildsLastElement.next = parentsNextSibling
216
newChildsLastElement.next = None
218
nextChild = self.contents[position]
219
newChild.nextSibling = nextChild
220
if newChild.nextSibling:
221
newChild.nextSibling.previousSibling = newChild
222
newChildsLastElement.next = nextChild
224
if newChildsLastElement.next:
225
newChildsLastElement.next.previous = newChildsLastElement
226
self.contents.insert(position, newChild)
228
def append(self, tag):
    """Adds the given tag as the last child of this tag."""
    position = len(self.contents)
    self.insert(position, tag)
232
def findNext(self, name=None, attrs={}, text=None, **kwargs):
    """Return the first element after this Tag in the document
    that matches the given criteria."""
    match = self._findOne(self.findAllNext, name, attrs, text, **kwargs)
    return match
237
def findAllNext(self, name=None, attrs={}, text=None, limit=None,
239
"""Returns all items that match the given criteria and appear
240
before after Tag in the document."""
241
return self._findAll(name, attrs, text, limit, self.nextGenerator)
243
def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
244
"""Returns the closest sibling to this Tag that matches the
245
given criteria and appears after this Tag in the document."""
246
return self._findOne(self.findNextSiblings, name, attrs, text,
249
def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
251
"""Returns the siblings of this Tag that match the given
252
criteria and appear after this Tag in the document."""
253
return self._findAll(name, attrs, text, limit,
254
self.nextSiblingGenerator, **kwargs)
255
fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x
257
def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
    """Return the closest preceding element in the document that
    matches the given criteria."""
    match = self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)
    return match
262
def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
264
"""Returns all items that match the given criteria and appear
265
before this Tag in the document."""
266
return self._findAll(name, attrs, text, limit, self.previousGenerator,
268
fetchPrevious = findAllPrevious # Compatibility with pre-3.x
270
def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
271
"""Returns the closest sibling to this Tag that matches the
272
given criteria and appears before this Tag in the document."""
273
return self._findOne(self.findPreviousSiblings, name, attrs, text,
276
def findPreviousSiblings(self, name=None, attrs={}, text=None,
                         limit=None, **kwargs):
    """Return the siblings of this Tag that precede it in the
    document and match the given criteria."""
    return self._findAll(
        name, attrs, text, limit, self.previousSiblingGenerator, **kwargs)
282
fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x
284
def findParent(self, name=None, attrs={}, **kwargs):
285
"""Returns the closest parent of this Tag that matches the given
287
# NOTE: We can't use _findOne because findParents takes a different
290
l = self.findParents(name, attrs, 1)
295
def findParents(self, name=None, attrs={}, limit=None, **kwargs):
296
"""Returns the parents of this Tag that match the given
299
return self._findAll(name, attrs, None, limit, self.parentGenerator,
301
fetchParents = findParents # Compatibility with pre-3.x
303
#These methods do the real heavy lifting.
305
def _findOne(self, method, name, attrs, text, **kwargs):
307
l = method(name, attrs, text, 1, **kwargs)
312
def _findAll(self, name, attrs, text, limit, generator, **kwargs):
313
"Iterates over a generator looking for things that match."
315
if isinstance(name, SoupStrainer):
318
# Build a SoupStrainer
319
strainer = SoupStrainer(name, attrs, text, **kwargs)
320
results = ResultSet(strainer)
325
except StopIteration:
328
found = strainer.search(i)
330
results.append(found)
331
if limit and len(results) >= limit:
335
#These Generators can be used to navigate starting from both
336
#NavigableStrings and Tags.
337
def nextGenerator(self):
343
def nextSiblingGenerator(self):
349
def previousGenerator(self):
355
def previousSiblingGenerator(self):
358
i = i.previousSibling
361
def parentGenerator(self):
368
def substituteEncoding(self, str, encoding=None):
    """Replace the %SOUP-ENCODING% placeholder in 'str' with the
    given encoding name (defaulting to utf-8)."""
    return str.replace("%SOUP-ENCODING%", encoding or "utf-8")
372
def toEncoding(self, s, encoding=None):
373
"""Encodes an object to a string in some encoding, or to Unicode.
375
if isinstance(s, unicode):
377
s = s.encode(encoding)
378
elif isinstance(s, str):
380
s = s.encode(encoding)
385
s = self.toEncoding(str(s), encoding)
390
class NavigableString(unicode, PageElement):
392
def __getnewargs__(self):
    """Used by copy/pickle: recreate this NavigableString from its
    encoded string value."""
    return (NavigableString.__str__(self),)
395
def __getattr__(self, attr):
396
"""text.string gives you text. This is for backwards
397
compatibility for Navigable*String, but for CData* it lets you
398
get the string without the CData wrapper."""
402
raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)
404
def __unicode__(self):
    # Python 2 unicode() protocol: round-trip through the encoded
    # string form produced by __str__.
    return unicode(str(self))
407
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
409
return self.encode(encoding)
413
class CData(NavigableString):
    """A CDATA section found in the document."""
    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        # Re-wrap the string content in CDATA delimiters when rendering.
        return "<![CDATA[%s]]>" % NavigableString.__str__(self, encoding)
418
class ProcessingInstruction(NavigableString):
419
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
421
if "%SOUP-ENCODING%" in output:
422
output = self.substituteEncoding(output, encoding)
423
return "<?%s?>" % self.toEncoding(output, encoding)
425
class Comment(NavigableString):
    """An HTML or XML comment found in the document."""
    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        # Render with the comment delimiters restored.
        return "<!--%s-->" % NavigableString.__str__(self, encoding)
429
class Declaration(NavigableString):
    """A declaration (e.g. a doctype) found in the document."""
    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        # Render with the "<!" and ">" declaration delimiters restored.
        return "<!%s>" % NavigableString.__str__(self, encoding)
433
class Tag(PageElement):
435
"""Represents a found HTML tag with its attributes and contents."""
438
"Cheap function to invert a hash."
440
for k,v in h.items():
444
XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'",
450
XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)
452
def _convertEntities(self, match):
453
"""Used in a call to re.sub to replace HTML, XML, and numeric
454
entities with the appropriate Unicode characters. If HTML
455
entities are being converted, any unrecognized entities are
458
if self.convertHTMLEntities and x in name2codepoint:
459
return unichr(name2codepoint[x])
460
elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS:
461
if self.convertXMLEntities:
462
return self.XML_ENTITIES_TO_SPECIAL_CHARS[x]
465
elif len(x) > 0 and x[0] == '#':
466
# Handle numeric entities
467
if len(x) > 1 and x[1] == 'x':
468
return unichr(int(x[2:], 16))
470
return unichr(int(x[1:]))
472
elif self.escapeUnrecognizedEntities:
473
return u'&%s;' % x
477
def __init__(self, parser, name, attrs=None, parent=None,
481
# We don't actually store the parser object: that lets extracted
482
# chunks be garbage-collected
483
self.parserClass = parser.__class__
484
self.isSelfClosing = parser.isSelfClosingTag(name)
490
self.setup(parent, previous)
492
self.containsSubstitutions = False
493
self.convertHTMLEntities = parser.convertHTMLEntities
494
self.convertXMLEntities = parser.convertXMLEntities
495
self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities
497
# Convert any HTML, XML, or numeric entities in the attribute values.
498
convert = lambda(k, val): (k,
499
re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",
500
self._convertEntities,
502
self.attrs = map(convert, self.attrs)
504
def get(self, key, default=None):
505
"""Returns the value of the 'key' attribute for the tag, or
506
the value given for 'default' if it doesn't have that
508
return self._getAttrMap().get(key, default)
510
def has_key(self, key):
    """Returns true if the tag has an attribute named 'key'."""
    # 'in' instead of dict.has_key(): identical result, but has_key()
    # is deprecated (and removed in Python 3).
    return key in self._getAttrMap()
513
def __getitem__(self, key):
    """tag[key] gives the value of the tag's 'key' attribute, raising
    an exception when no such attribute exists."""
    attr_map = self._getAttrMap()
    return attr_map[key]
519
"Iterating over a tag iterates over its contents."
520
return iter(self.contents)
523
"The length of a tag is the length of its list of contents."
524
return len(self.contents)
526
def __contains__(self, x):
    # 'x in tag' is true iff x is one of the tag's direct children.
    return x in self.contents
529
def __nonzero__(self):
530
"A tag is non-None even if it has no contents."
533
def __setitem__(self, key, value):
534
"""Setting tag[key] sets the value of the 'key' attribute for the
537
self.attrMap[key] = value
539
for i in range(0, len(self.attrs)):
540
if self.attrs[i][0] == key:
541
self.attrs[i] = (key, value)
544
self.attrs.append((key, value))
545
self._getAttrMap()[key] = value
547
def __delitem__(self, key):
548
"Deleting tag[key] deletes all 'key' attributes for the tag."
549
for item in self.attrs:
551
self.attrs.remove(item)
552
#We don't break because bad HTML can define the same
553
#attribute multiple times.
555
if self.attrMap.has_key(key):
556
del self.attrMap[key]
558
def __call__(self, *args, **kwargs):
    """Calling a tag like a function is the same as calling its
    findAll() method. Eg. tag('a') returns a list of all the A tags
    found within this tag."""
    # Direct call instead of the deprecated apply() builtin (removed
    # in Python 3); behavior is identical.
    return self.findAll(*args, **kwargs)
564
def __getattr__(self, tag):
    """soup.fooTag (or soup.foo) is shorthand for soup.find('foo').
    Names starting with '__' are never treated as tag searches, so
    internal protocol lookups still raise AttributeError."""
    #print "Getattr %s.%s" % (self.__class__, tag)
    if len(tag) > 3 and tag.endswith('Tag'):
        # 'xTag' form: strip the suffix and search for that tag name.
        # (endswith() replaces the equivalent rfind() == len-3 test.)
        return self.find(tag[:-3])
    elif not tag.startswith('__'):
        return self.find(tag)
    # Parenthesized raise instead of the Python-2-only
    # "raise X, msg" statement form; valid in both 2 and 3.
    raise AttributeError("'%s' object has no attribute '%s'" % (self.__class__, tag))
572
def __eq__(self, other):
573
"""Returns true iff this tag has the same name, the same attributes,
574
and the same contents (recursively) as the given tag.
576
NOTE: right now this will return false if two tags have the
577
same attributes in a different order. Should this be fixed?"""
578
if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other):
580
for i in range(0, len(self.contents)):
581
if self.contents[i] != other.contents[i]:
585
def __ne__(self, other):
    """The inverse of __eq__: true when the two tags differ in name,
    attributes, or contents."""
    return not (self == other)
590
def __repr__(self):
    """Renders this tag as a string."""
    # repr() of a tag is simply its rendered markup in the default
    # output encoding.
    return self.__str__(DEFAULT_OUTPUT_ENCODING)
594
def __unicode__(self):
    # Passing None as the encoding makes __str__ return Unicode
    # rather than an encoded byte string.
    return self.__str__(None)
597
BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
598
+ "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
601
def _sub_entity(self, x):
    """Regex-substitution callback: maps an XML special character
    (the first character of the match) to its named XML entity."""
    char = x.group(0)[0]
    return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[char] + ";"
606
def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
607
prettyPrint=False, indentLevel=0):
608
"""Returns a string or Unicode representation of this tag and
609
its contents. To get Unicode, pass None for encoding.
611
NOTE: since Python's HTML parser consumes whitespace, this
612
method is not certain to reproduce the whitespace present in
613
the original string."""
615
encodedName = self.toEncoding(self.name, encoding)
619
for key, val in self.attrs:
622
if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
623
val = self.substituteEncoding(val, encoding)
625
# The attribute value either:
627
# * Contains no embedded double quotes or single quotes.
628
# No problem: we enclose it in double quotes.
629
# * Contains embedded single quotes. No problem:
630
# double quotes work here too.
631
# * Contains embedded double quotes. No problem:
632
# we enclose it in single quotes.
633
# * Embeds both single _and_ double quotes. This
634
# can't happen naturally, but it can happen if
635
# you modify an attribute value after parsing
636
# the document. Now we have a bit of a
637
# problem. We solve it by enclosing the
638
# attribute in single quotes, and escaping any
639
# embedded single quotes to XML entities.
643
# TODO: replace with apos when
645
val = val.replace("'", "&squot;")
647
# Now we're okay w/r/t quotes. But the attribute
648
# value might also contain angle brackets, or
649
# ampersands that aren't part of entities. We need
650
# to escape those to XML entities too.
651
val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)
653
attrs.append(fmt % (self.toEncoding(key, encoding),
654
self.toEncoding(val, encoding)))
657
if self.isSelfClosing:
660
closeTag = '</%s>' % encodedName
662
indentTag, indentContents = 0, 0
664
indentTag = indentLevel
665
space = (' ' * (indentTag-1))
666
indentContents = indentTag + 1
667
contents = self.renderContents(encoding, prettyPrint, indentContents)
674
attributeString = ' ' + ' '.join(attrs)
677
s.append('<%s%s%s>' % (encodedName, attributeString, close))
681
if prettyPrint and contents and contents[-1] != "\n":
683
if prettyPrint and closeTag:
686
if prettyPrint and closeTag and self.nextSibling:
691
def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING):
    """Renders this tag as an indented, pretty-printed string."""
    return self.__str__(encoding, prettyPrint=True)
694
def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
695
prettyPrint=False, indentLevel=0):
696
"""Renders the contents of this tag as a string in the given
697
encoding. If encoding is None, returns a Unicode string.."""
701
if isinstance(c, NavigableString):
702
text = c.__str__(encoding)
703
elif isinstance(c, Tag):
704
s.append(c.__str__(encoding, prettyPrint, indentLevel))
705
if text and prettyPrint:
709
s.append(" " * (indentLevel-1))
717
def find(self, name=None, attrs={}, recursive=True, text=None,
719
"""Return only the first child of this Tag matching the given
722
l = self.findAll(name, attrs, recursive, text, 1, **kwargs)
728
def findAll(self, name=None, attrs={}, recursive=True, text=None,
729
limit=None, **kwargs):
730
"""Extracts a list of Tag objects that match the given
731
criteria. You can specify the name of the Tag and any
732
attributes you want the Tag to have.
734
The value of a key-value pair in the 'attrs' map can be a
735
string, a list of strings, a regular expression object, or a
736
callable that takes a string and returns whether or not the
737
string matches for some custom definition of 'matches'. The
738
same is true of the tag name."""
739
generator = self.recursiveChildGenerator
741
generator = self.childGenerator
742
return self._findAll(name, attrs, text, limit, generator, **kwargs)
743
findChildren = findAll
745
# Pre-3.x compatibility methods
749
def fetchText(self, text=None, recursive=True, limit=None):
    """Pre-3.x compatibility alias for findAll(text=...)."""
    return self.findAll(text=text, recursive=recursive, limit=limit)
752
def firstText(self, text=None, recursive=True):
    """Pre-3.x compatibility alias for find(text=...)."""
    return self.find(text=text, recursive=recursive)
757
def _getAttrMap(self):
758
"""Initializes a map representation of this tag's attributes,
759
if not already initialized."""
760
if not getattr(self, 'attrMap'):
762
for (key, value) in self.attrs:
763
self.attrMap[key] = value
767
def childGenerator(self):
    """Yields this tag's direct children, in document order."""
    # Iterate the list directly instead of indexing over
    # range(len(...)); same elements, clearer and no index bookkeeping.
    for child in self.contents:
        yield child
772
def recursiveChildGenerator(self):
775
tag, start = stack.pop()
776
if isinstance(tag, Tag):
777
for i in range(start, len(tag.contents)):
780
if isinstance(a, Tag) and tag.contents:
781
if i < len(tag.contents) - 1:
782
stack.append((tag, i+1))
787
# Next, a couple classes to represent queries and their results.
789
"""Encapsulates a number of ways of matching a markup element (tag or
792
def __init__(self, name=None, attrs={}, text=None, **kwargs):
795
kwargs['class'] = attrs
810
return "%s|%s" % (self.name, self.attrs)
812
def searchTag(self, markupName=None, markupAttrs={}):
815
if isinstance(markupName, Tag):
818
callFunctionWithTagData = callable(self.name) \
819
and not isinstance(markupName, Tag)
822
or callFunctionWithTagData \
823
or (markup and self._matches(markup, self.name)) \
824
or (not markup and self._matches(markupName, self.name)):
825
if callFunctionWithTagData:
826
match = self.name(markupName, markupAttrs)
830
for attr, matchAgainst in self.attrs.items():
831
if not markupAttrMap:
832
if hasattr(markupAttrs, 'get'):
833
markupAttrMap = markupAttrs
836
for k,v in markupAttrs:
838
attrValue = markupAttrMap.get(attr)
839
if not self._matches(attrValue, matchAgainst):
849
def search(self, markup):
850
#print 'looking for %s in %s' % (self, markup)
852
# If given a list of items, scan it for a text element that
854
if isList(markup) and not isinstance(markup, Tag):
855
for element in markup:
856
if isinstance(element, NavigableString) \
857
and self.search(element):
860
# If it's a Tag, make sure its name or attributes match.
861
# Don't bother with Tags if we're searching for text.
862
elif isinstance(markup, Tag):
864
found = self.searchTag(markup)
865
# If it's text, make sure the text matches.
866
elif isinstance(markup, NavigableString) or \
868
if self._matches(markup, self.text):
871
raise Exception, "I don't know how to match against a %s" \
875
def _matches(self, markup, matchAgainst):
876
#print "Matching %s against %s" % (markup, matchAgainst)
878
if matchAgainst == True and type(matchAgainst) == types.BooleanType:
879
result = markup != None
880
elif callable(matchAgainst):
881
result = matchAgainst(markup)
883
#Custom match methods take the tag as an argument, but all
884
#other ways of matching match the tag name as a string.
885
if isinstance(markup, Tag):
887
if markup and not isString(markup):
888
markup = unicode(markup)
889
#Now we know that chunk is either a string, or None.
890
if hasattr(matchAgainst, 'match'):
891
# It's a regexp object.
892
result = markup and matchAgainst.search(markup)
893
elif isList(matchAgainst):
894
result = markup in matchAgainst
895
elif hasattr(matchAgainst, 'items'):
896
result = markup.has_key(matchAgainst)
897
elif matchAgainst and isString(markup):
898
if isinstance(markup, unicode):
899
matchAgainst = unicode(matchAgainst)
901
matchAgainst = str(matchAgainst)
904
result = matchAgainst == markup
907
class ResultSet(list):
908
"""A ResultSet is just a list that keeps track of the SoupStrainer
910
def __init__(self, source):
914
# Now, some helper functions.
917
"""Convenience method that works with all 2.x versions of Python
918
to determine whether or not something is listlike."""
919
return hasattr(l, '__iter__') \
920
or (type(l) in (types.ListType, types.TupleType))
923
"""Convenience method that works with all 2.x versions of Python
924
to determine whether or not something is stringlike."""
926
return isinstance(s, unicode) or isinstance(s, basestring)
928
return isinstance(s, str)
930
def buildTagMap(default, *args):
931
"""Turns a list of maps, lists, or scalars into a single map.
932
Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS, and
933
NESTING_RESET_TAGS maps out of lists and partial maps."""
936
if hasattr(portion, 'items'):
937
#It's a map. Merge it.
938
for k,v in portion.items():
940
elif isList(portion):
941
#It's a list. Map each item to the default.
945
#It's a scalar. Map it to the default.
946
built[portion] = default
949
# Now, the parser classes.
951
class BeautifulStoneSoup(Tag, SGMLParser):
953
"""This class contains the basic parser and search code. It defines
954
a parser that knows nothing about tag behavior except for the
957
You can't close a tag without closing all the tags it encloses.
958
That is, "<foo><bar></foo>" actually means
959
"<foo><bar></bar></foo>".
961
[Another possible explanation is "<foo><bar /></foo>", but since
962
this class defines no SELF_CLOSING_TAGS, it will never use that
965
This class is useful for parsing XML or made-up markup languages,
966
or when BeautifulSoup makes an assumption counter to what you were
969
SELF_CLOSING_TAGS = {}
971
RESET_NESTING_TAGS = {}
974
MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
975
lambda x: x.group(1) + ' />'),
976
(re.compile('<!\s+([^<>]*)>'),
977
lambda x: '<!' + x.group(1) + '>')
980
ROOT_TAG_NAME = u'[document]'
982
HTML_ENTITIES = "html"
984
XHTML_ENTITIES = "xhtml"
985
# TODO: This only exists for backwards-compatibility
986
ALL_ENTITIES = XHTML_ENTITIES
988
# Used when determining whether a text node is all whitespace and
989
# can be replaced with a single space. A text node that contains
990
# fancy Unicode spaces (usually non-breaking) should be left
992
STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }
994
def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
995
markupMassage=True, smartQuotesTo=XML_ENTITIES,
996
convertEntities=None, selfClosingTags=None):
997
"""The Soup object is initialized as the 'root tag', and the
998
provided markup (which can be a string or a file-like object)
999
is fed into the underlying parser.
1001
sgmllib will process most bad HTML, and the BeautifulSoup
1002
class has some tricks for dealing with some HTML that kills
1003
sgmllib, but Beautiful Soup can nonetheless choke or lose data
1004
if your data uses self-closing tags or declarations
1007
By default, Beautiful Soup uses regexes to sanitize input,
1008
avoiding the vast majority of these problems. If the problems
1009
don't apply to you, pass in False for markupMassage, and
1010
you'll get better performance.
1012
The default parser massage techniques fix the two most common
1013
instances of invalid HTML that choke sgmllib:
1015
<br/> (No space between name of closing tag and tag close)
1016
<! --Comment--> (Extraneous whitespace in declaration)
1018
You can pass in a custom list of (RE object, replace method)
1019
tuples to get Beautiful Soup to scrub your input the way you
1022
self.parseOnlyThese = parseOnlyThese
1023
self.fromEncoding = fromEncoding
1024
self.smartQuotesTo = smartQuotesTo
1025
self.convertEntities = convertEntities
1026
# Set the rules for how we'll deal with the entities we
1028
if self.convertEntities:
1029
# It doesn't make sense to convert encoded characters to
1030
# entities even while you're converting entities to Unicode.
1031
# Just convert it all to Unicode.
1032
self.smartQuotesTo = None
1033
if convertEntities == self.HTML_ENTITIES:
1034
self.convertXMLEntities = False
1035
self.convertHTMLEntities = True
1036
self.escapeUnrecognizedEntities = True
1037
elif convertEntities == self.XHTML_ENTITIES:
1038
self.convertXMLEntities = True
1039
self.convertHTMLEntities = True
1040
self.escapeUnrecognizedEntities = False
1041
elif convertEntities == self.XML_ENTITIES:
1042
self.convertXMLEntities = True
1043
self.convertHTMLEntities = False
1044
self.escapeUnrecognizedEntities = False
1046
self.convertXMLEntities = False
1047
self.convertHTMLEntities = False
1048
self.escapeUnrecognizedEntities = False
1050
self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
1051
SGMLParser.__init__(self)
1053
if hasattr(markup, 'read'): # It's a file-type object.
1054
markup = markup.read()
1055
self.markup = markup
1056
self.markupMassage = markupMassage
1061
self.markup = None # The markup can now be GCed
1063
def convert_charref(self, name):
1064
"""This method fixes a bug in Python's SGMLParser."""
1069
if not 0 <= n <= 127 : # ASCII ends at 127, not 255
1071
return self.convert_codepoint(n)
1073
def _feed(self, inDocumentEncoding=None):
1074
# Convert the document to Unicode.
1075
markup = self.markup
1076
if isinstance(markup, unicode):
1077
if not hasattr(self, 'originalEncoding'):
1078
self.originalEncoding = None
1080
dammit = UnicodeDammit\
1081
(markup, [self.fromEncoding, inDocumentEncoding],
1082
smartQuotesTo=self.smartQuotesTo)
1083
markup = dammit.unicode
1084
self.originalEncoding = dammit.originalEncoding
1086
if self.markupMassage:
1087
if not isList(self.markupMassage):
1088
self.markupMassage = self.MARKUP_MASSAGE
1089
for fix, m in self.markupMassage:
1090
markup = fix.sub(m, markup)
1091
# TODO: We get rid of markupMassage so that the
1092
# soup object can be deepcopied later on. Some
1093
# Python installations can't copy regexes. If anyone
1094
# was relying on the existence of markupMassage, this
1095
# might cause problems.
1096
del(self.markupMassage)
1099
SGMLParser.feed(self, markup)
1100
# Close out any unfinished strings and close all the open tags.
1102
while self.currentTag.name != self.ROOT_TAG_NAME:
1105
def __getattr__(self, methodName):
1106
"""This method routes method call requests to either the SGMLParser
1107
superclass or the Tag superclass, depending on the method name."""
1108
#print "__getattr__ called on %s.%s" % (self.__class__, methodName)
1110
if methodName.find('start_') == 0 or methodName.find('end_') == 0 \
1111
or methodName.find('do_') == 0:
1112
return SGMLParser.__getattr__(self, methodName)
1113
elif methodName.find('__') != 0:
1114
return Tag.__getattr__(self, methodName)
1116
raise AttributeError
1118
def isSelfClosingTag(self, name):
1119
"""Returns true iff the given string is the name of a
1120
self-closing tag according to this parser."""
1121
return self.SELF_CLOSING_TAGS.has_key(name) \
1122
or self.instanceSelfClosingTags.has_key(name)
1125
Tag.__init__(self, self, self.ROOT_TAG_NAME)
1127
SGMLParser.reset(self)
1128
self.currentData = []
1129
self.currentTag = None
1131
self.quoteStack = []
1135
tag = self.tagStack.pop()
1136
# Tags with just one string-owning child get the child as a
1137
# 'string' property, so that soup.tag.string is shorthand for
1138
# soup.tag.contents[0]
1139
if len(self.currentTag.contents) == 1 and \
1140
isinstance(self.currentTag.contents[0], NavigableString):
1141
self.currentTag.string = self.currentTag.contents[0]
1143
#print "Pop", tag.name
1145
self.currentTag = self.tagStack[-1]
1146
return self.currentTag
1148
def pushTag(self, tag):
1149
#print "Push", tag.name
1151
self.currentTag.contents.append(tag)
1152
self.tagStack.append(tag)
1153
self.currentTag = self.tagStack[-1]
1155
def endData(self, containerClass=NavigableString):
1156
if self.currentData:
1157
currentData = ''.join(self.currentData)
1158
if not currentData.translate(self.STRIP_ASCII_SPACES):
1159
if '\n' in currentData:
1163
self.currentData = []
1164
if self.parseOnlyThese and len(self.tagStack) <= 1 and \
1165
(not self.parseOnlyThese.text or \
1166
not self.parseOnlyThese.search(currentData)):
1168
o = containerClass(currentData)
1169
o.setup(self.currentTag, self.previous)
1171
self.previous.next = o
1173
self.currentTag.contents.append(o)
1176
def _popToTag(self, name, inclusivePop=True):
1177
"""Pops the tag stack up to and including the most recent
1178
instance of the given tag. If inclusivePop is false, pops the tag
1179
stack up to but *not* including the most recent instqance of
1181
#print "Popping to %s" % name
1182
if name == self.ROOT_TAG_NAME:
1186
mostRecentTag = None
1187
for i in range(len(self.tagStack)-1, 0, -1):
1188
if name == self.tagStack[i].name:
1189
numPops = len(self.tagStack)-i
1191
if not inclusivePop:
1192
numPops = numPops - 1
1194
for i in range(0, numPops):
1195
mostRecentTag = self.popTag()
1196
return mostRecentTag
1198
def _smartPop(self, name):
1200
"""We need to pop up to the previous tag of this type, unless
1201
one of this tag's nesting reset triggers comes between this
1202
tag and the previous tag of this type, OR unless this tag is a
1203
generic nesting trigger and another generic nesting trigger
1204
comes between this tag and the previous tag of this type.
1207
<p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
1208
<p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.
1209
<p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.
1211
<li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
1212
<tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
1213
<td><tr><td> *<td>* should pop to 'tr', not the first 'td'
1216
nestingResetTriggers = self.NESTABLE_TAGS.get(name)
1217
isNestable = nestingResetTriggers != None
1218
isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
1221
for i in range(len(self.tagStack)-1, 0, -1):
1222
p = self.tagStack[i]
1223
if (not p or p.name == name) and not isNestable:
1224
#Non-nestable tags get popped to the top or to their
1228
if (nestingResetTriggers != None
1229
and p.name in nestingResetTriggers) \
1230
or (nestingResetTriggers == None and isResetNesting
1231
and self.RESET_NESTING_TAGS.has_key(p.name)):
1233
#If we encounter one of the nesting reset triggers
1234
#peculiar to this tag, or we encounter another tag
1235
#that causes nesting to reset, pop up to but not
1236
#including that tag.
1242
self._popToTag(popTo, inclusive)
1244
def unknown_starttag(self, name, attrs, selfClosing=0):
1245
#print "Start tag %s: %s" % (name, attrs)
1247
#This is not a real tag.
1248
#print "<%s> is not real!" % name
1249
attrs = ''.join(map(lambda(x, y): ' %s="%s"' % (x, y), attrs))
1250
self.handle_data('<%s%s>' % (name, attrs))
1254
if not self.isSelfClosingTag(name) and not selfClosing:
1255
self._smartPop(name)
1257
if self.parseOnlyThese and len(self.tagStack) <= 1 \
1258
and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
1261
tag = Tag(self, name, attrs, self.currentTag, self.previous)
1263
self.previous.next = tag
1266
if selfClosing or self.isSelfClosingTag(name):
1268
if name in self.QUOTE_TAGS:
1269
#print "Beginning quote (%s)" % name
1270
self.quoteStack.append(name)
1274
def unknown_endtag(self, name):
1275
#print "End tag %s" % name
1276
if self.quoteStack and self.quoteStack[-1] != name:
1277
#This is not a real end tag.
1278
#print "</%s> is not real!" % name
1279
self.handle_data('</%s>' % name)
1282
self._popToTag(name)
1283
if self.quoteStack and self.quoteStack[-1] == name:
1284
self.quoteStack.pop()
1285
self.literal = (len(self.quoteStack) > 0)
1287
def handle_data(self, data):
1288
self.currentData.append(data)
1290
def _toStringSubclass(self, text, subclass):
1291
"""Adds a certain piece of text to the tree as a NavigableString
1294
self.handle_data(text)
1295
self.endData(subclass)
1297
def handle_pi(self, text):
1298
"""Handle a processing instruction as a ProcessingInstruction
1299
object, possibly one with a %SOUP-ENCODING% slot into which an
1300
encoding will be plugged later."""
1301
if text[:3] == "xml":
1302
text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
1303
self._toStringSubclass(text, ProcessingInstruction)
1305
def handle_comment(self, text):
1306
"Handle comments as Comment objects."
1307
self._toStringSubclass(text, Comment)
1309
def handle_charref(self, ref):
1310
"Handle character references as data."
1311
if self.convertEntities:
1312
data = unichr(int(ref))
1314
data = '&#%s;' % ref
1315
self.handle_data(data)
1317
def handle_entityref(self, ref):
1318
"""Handle entity references as data, possibly converting known
1319
HTML and/or XML entity references to the corresponding Unicode
1322
if self.convertHTMLEntities:
1324
data = unichr(name2codepoint[ref])
1328
if not data and self.convertXMLEntities:
1329
data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)
1331
if not data and self.convertHTMLEntities and \
1332
not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
1333
# TODO: We've got a problem here. We're told this is
1334
# an entity reference, but it's not an XML entity
1335
# reference or an HTML entity reference. Nonetheless,
1336
# the logical thing to do is to pass it through as an
1337
# unrecognized entity reference.
1339
# Except: when the input is "&carol;" this function
1340
# will be called with input "carol". When the input is
1341
# "AT&T", this function will be called with input
1342
# "T". We have no way of knowing whether a semicolon
1343
# was present originally, so we don't know whether
1344
# this is an unknown entity or just a misplaced
1347
# The more common case is a misplaced ampersand, so I
1348
# escape the ampersand and omit the trailing semicolon.
1349
data = "&%s" % ref
1351
# This case is different from the one above, because we
1352
# haven't already gone through a supposedly comprehensive
1353
# mapping of entities to Unicode characters. We might not
1354
# have gone through any mapping at all. So the chances are
1355
# very high that this is a real entity, and not a
1356
# misplaced ampersand.
1358
self.handle_data(data)
1360
def handle_decl(self, data):
1361
"Handle DOCTYPEs and the like as Declaration objects."
1362
self._toStringSubclass(data, Declaration)
1364
def parse_declaration(self, i):
1365
"""Treat a bogus SGML declaration as raw data. Treat a CDATA
1366
declaration as a CData object."""
1368
if self.rawdata[i:i+9] == '<![CDATA[':
1369
k = self.rawdata.find(']]>', i)
1371
k = len(self.rawdata)
1372
data = self.rawdata[i+9:k]
1374
self._toStringSubclass(data, CData)
1377
j = SGMLParser.parse_declaration(self, i)
1378
except SGMLParseError:
1379
toHandle = self.rawdata[i:]
1380
self.handle_data(toHandle)
1381
j = i + len(toHandle)
1384
class BeautifulSoup(BeautifulStoneSoup):
1386
"""This parser knows the following facts about HTML:
1388
* Some tags have no closing tag and should be interpreted as being
1389
closed as soon as they are encountered.
1391
* The text inside some tags (ie. 'script') may contain tags which
1392
are not really part of the document and which should be parsed
1393
as text, not tags. If you want to parse the text as tags, you can
1394
always fetch it and parse it explicitly.
1396
* Tag nesting rules:
1398
Most tags can't be nested at all. For instance, the occurance of
1399
a <p> tag should implicitly close the previous <p> tag.
1402
should be transformed into:
1403
<p>Para1</p><p>Para2
1405
Some tags can be nested arbitrarily. For instance, the occurance
1406
of a <blockquote> tag should _not_ implicitly close the previous
1409
Alice said: <blockquote>Bob said: <blockquote>Blah
1410
should NOT be transformed into:
1411
Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah
1413
Some tags can be nested, but the nesting is reset by the
1414
interposition of other tags. For instance, a <tr> tag should
1415
implicitly close the previous <tr> tag within the same <table>,
1416
but not close a <tr> tag in another table.
1418
<table><tr>Blah<tr>Blah
1419
should be transformed into:
1420
<table><tr>Blah</tr><tr>Blah
1422
<tr>Blah<table><tr>Blah
1423
should NOT be transformed into
1424
<tr>Blah<table></tr><tr>Blah
1426
Differing assumptions about tag nesting rules are a major source
1427
of problems with the BeautifulSoup class. If BeautifulSoup is not
1428
treating as nestable a tag your page author treats as nestable,
1429
try ICantBelieveItsBeautifulSoup, MinimalSoup, or
1430
BeautifulStoneSoup before writing your own subclass."""
1432
def __init__(self, *args, **kwargs):
1433
if not kwargs.has_key('smartQuotesTo'):
1434
kwargs['smartQuotesTo'] = self.HTML_ENTITIES
1435
BeautifulStoneSoup.__init__(self, *args, **kwargs)
1437
SELF_CLOSING_TAGS = buildTagMap(None,
1438
['br' , 'hr', 'input', 'img', 'meta',
1439
'spacer', 'link', 'frame', 'base'])
1441
QUOTE_TAGS = {'script' : None, 'textarea' : None}
1443
#According to the HTML standard, each of these inline tags can
1444
#contain another tag of the same type. Furthermore, it's common
1445
#to actually use these tags this way.
1446
NESTABLE_INLINE_TAGS = ['span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
1449
#According to the HTML standard, these block tags can contain
1450
#another tag of the same type. Furthermore, it's common
1451
#to actually use these tags this way.
1452
NESTABLE_BLOCK_TAGS = ['blockquote', 'div', 'fieldset', 'ins', 'del']
1454
#Lists can contain other lists, but there are restrictions.
1455
NESTABLE_LIST_TAGS = { 'ol' : [],
1457
'li' : ['ul', 'ol'],
1462
#Tables can contain other tables, but there are restrictions.
1463
NESTABLE_TABLE_TAGS = {'table' : [],
1464
'tr' : ['table', 'tbody', 'tfoot', 'thead'],
1467
'thead' : ['table'],
1468
'tbody' : ['table'],
1469
'tfoot' : ['table'],
1472
NON_NESTABLE_BLOCK_TAGS = ['address', 'form', 'p', 'pre']
1474
#If one of these tags is encountered, all tags up to the next tag of
1475
#this type are popped.
1476
RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
1477
NON_NESTABLE_BLOCK_TAGS,
1479
NESTABLE_TABLE_TAGS)
1481
NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
1482
NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)
1484
# Used to detect the charset in a META tag; see start_meta
1485
CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)")
1487
def start_meta(self, attrs):
1488
"""Beautiful Soup can detect a charset included in a META tag,
1489
try to convert the document to that charset, and re-parse the
1490
document from the beginning."""
1493
contentTypeIndex = None
1494
tagNeedsEncodingSubstitution = False
1496
for i in range(0, len(attrs)):
1497
key, value = attrs[i]
1499
if key == 'http-equiv':
1501
elif key == 'content':
1503
contentTypeIndex = i
1505
if httpEquiv and contentType: # It's an interesting meta tag.
1506
match = self.CHARSET_RE.search(contentType)
1508
if getattr(self, 'declaredHTMLEncoding') or \
1509
(self.originalEncoding == self.fromEncoding):
1510
# This is our second pass through the document, or
1511
# else an encoding was specified explicitly and it
1512
# worked. Rewrite the meta tag.
1513
newAttr = self.CHARSET_RE.sub\
1514
(lambda(match):match.group(1) +
1515
"%SOUP-ENCODING%", value)
1516
attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
1518
tagNeedsEncodingSubstitution = True
1520
# This is our first pass through the document.
1521
# Go through it again with the new information.
1522
newCharset = match.group(3)
1523
if newCharset and newCharset != self.originalEncoding:
1524
self.declaredHTMLEncoding = newCharset
1525
self._feed(self.declaredHTMLEncoding)
1527
tag = self.unknown_starttag("meta", attrs)
1528
if tag and tagNeedsEncodingSubstitution:
1529
tag.containsSubstitutions = True
1531
class StopParsing(Exception):
    """Raised internally to abort the current parse (e.g. when a META
    tag declares a new encoding and the document must be re-parsed)."""
    pass
class ICantBelieveItsBeautifulSoup(BeautifulSoup):

    """The BeautifulSoup class is oriented towards skipping over
    common HTML errors like unclosed tags. However, sometimes it makes
    errors of its own. For instance, consider this fragment:

     <b>Foo<b>Bar</b></b>

    This is perfectly valid (if bizarre) HTML. However, the
    BeautifulSoup class will implicitly close the first b tag when it
    encounters the second 'b'. It will think the author wrote
    "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
    there's no real-world reason to bold something that's already
    bold. When it encounters '</b></b>' it will close two more 'b'
    tags, for a grand total of three tags closed instead of two. This
    can throw off the rest of your document structure. The same is
    true of a number of other tags, listed below.

    It's much more common for someone to forget to close a 'b' tag
    than to actually use nested 'b' tags, and the BeautifulSoup class
    handles the common case. This class handles the not-so-common
    case: where you can't believe someone wrote what they did, but
    it's valid HTML and BeautifulSoup screwed up by assuming it
    wouldn't be."""

    I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
     ['em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
      'cite', 'code', 'dfn', 'kbd', 'samp', 'strong', 'var', 'b',
      'big']

    I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ['noscript']

    NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
class MinimalSoup(BeautifulSoup):
    """The MinimalSoup class is for parsing HTML that contains
    pathologically bad markup. It makes no assumptions about tag
    nesting, but it does know which tags are self-closing, that
    <script> tags contain Javascript and should not be parsed, that
    META tags may contain encoding information, and so on.

    This also makes it better for subclassing than BeautifulStoneSoup
    or BeautifulSoup."""

    RESET_NESTING_TAGS = buildTagMap('noscript')
    # NOTE(review): the empty NESTABLE_TAGS override was dropped from
    # this copy; restored -- verify against upstream.
    NESTABLE_TAGS = buildTagMap([])
class BeautifulSOAP(BeautifulStoneSoup):
    """This class will push a tag with only a single string child into
    the tag's parent as an attribute. The attribute's name is the tag
    name, and the value is the string child. An example should give
    the flavor of the change:

    <foo><bar>baz</bar></foo>
     =>
    <foo bar="baz"><bar>baz</bar></foo>

    You can then access fooTag['bar'] instead of fooTag.barTag.string.

    This is, of course, useful for scraping structures that tend to
    use subelements instead of attributes, such as SOAP messages. Note
    that it modifies its input, so don't print the modified version
    out.

    I'm not sure how many people really want to use this class; let me
    know if you do. Mainly I like the name."""

    # NOTE(review): the 'def popTag' line was dropped from this copy;
    # restored -- verify against upstream.
    def popTag(self):
        if len(self.tagStack) > 1:
            tag = self.tagStack[-1]
            parent = self.tagStack[-2]
            parent._getAttrMap()
            # Only promote a single-string child, and never clobber an
            # existing attribute of the same name.
            if (isinstance(tag, Tag) and len(tag.contents) == 1 and
                isinstance(tag.contents[0], NavigableString) and
                not parent.attrMap.has_key(tag.name)):
                parent[tag.name] = tag.contents[0]
        BeautifulStoneSoup.popTag(self)
#Enterprise class names! It has come to our attention that some people
#think the names of the Beautiful Soup parser classes are too silly
#and "unprofessional" for use in enterprise screen-scraping. We feel
#your pain! For such-minded folk, the Beautiful Soup Consortium And
#All-Night Kosher Bakery recommends renaming this file to
#"RobustParser.py" (or, in cases of extreme enterprisiness,
#"RobustParserBeanInterface.class") and using the following
#enterprise-friendly class aliases:
# NOTE(review): the 'pass' bodies were dropped from this copy; restored.
class RobustXMLParser(BeautifulStoneSoup):
    pass
class RobustHTMLParser(BeautifulSoup):
    pass
class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
    pass
class RobustInsanelyWackAssHTMLParser(MinimalSoup):
    pass
class SimplifyingSOAPParser(BeautifulSOAP):
    pass
######################################################
#
# Bonus library: Unicode, Dammit
#
# This class forces XML data into a standard format (usually to UTF-8
# or Unicode). It is heavily based on code from Mark Pilgrim's
# Universal Feed Parser. It does not rewrite the XML or HTML to
# reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi
# (XML) and BeautifulSoup.start_meta (HTML).

# NOTE(review): the try/except ImportError wrappers were dropped from
# this copy; restored -- both libraries are optional at runtime.

# Autodetects character encodings.
# Download from http://chardet.feedparser.org/
try:
    import chardet
#    import chardet.constants
#    chardet.constants._debug = 1
except ImportError:
    chardet = None

# cjkcodecs and iconv_codec make Python know about more character encodings.
# Both are available from http://cjkpython.i18n.org/
# They're built in if you use Python 2.4.
try:
    import cjkcodecs.aliases
    import iconv_codec
except ImportError:
    pass
class UnicodeDammit:
1665
"""A class for detecting the encoding of a *ML document and
1666
converting it to a Unicode string. If the source encoding is
1667
windows-1252, can replace MS smart quotes with their HTML or XML
1670
# This dictionary maps commonly seen values for "charset" in HTML
1671
# meta tags to the corresponding Python codec names. It only covers
1672
# values that aren't in Python's aliases and can't be determined
1673
# by the heuristics in find_codec.
1674
CHARSET_ALIASES = { "macintosh" : "mac-roman",
1675
"x-sjis" : "shift-jis" }
1677
def __init__(self, markup, overrideEncodings=[],
1678
smartQuotesTo='xml'):
1679
self.markup, documentEncoding, sniffedEncoding = \
1680
self._detectEncoding(markup)
1681
self.smartQuotesTo = smartQuotesTo
1682
self.triedEncodings = []
1683
if markup == '' or isinstance(markup, unicode):
1684
self.originalEncoding = None
1685
self.unicode = unicode(markup)
1689
for proposedEncoding in overrideEncodings:
1690
u = self._convertFrom(proposedEncoding)
1693
for proposedEncoding in (documentEncoding, sniffedEncoding):
1694
u = self._convertFrom(proposedEncoding)
1697
# If no luck and we have auto-detection library, try that:
1698
if not u and chardet and not isinstance(self.markup, unicode):
1699
u = self._convertFrom(chardet.detect(self.markup)['encoding'])
1701
# As a last resort, try utf-8 and windows-1252:
1703
for proposed_encoding in ("utf-8", "windows-1252"):
1704
u = self._convertFrom(proposed_encoding)
1707
if not u: self.originalEncoding = None
1709
def _subMSChar(self, orig):
1710
"""Changes a MS smart quote character to an XML or HTML
1712
sub = self.MS_CHARS.get(orig)
1713
if type(sub) == types.TupleType:
1714
if self.smartQuotesTo == 'xml':
1715
sub = '&#x%s;' % sub[1]
1717
sub = '&%s;' % sub[0]
1720
def _convertFrom(self, proposed):
1721
proposed = self.find_codec(proposed)
1722
if not proposed or proposed in self.triedEncodings:
1724
self.triedEncodings.append(proposed)
1725
markup = self.markup
1727
# Convert smart quotes to HTML if coming from an encoding
1728
# that might have them.
1729
if self.smartQuotesTo and proposed.lower() in("windows-1252",
1732
markup = re.compile("([\x80-\x9f])").sub \
1733
(lambda(x): self._subMSChar(x.group(1)),
1737
# print "Trying to convert document to %s" % proposed
1738
u = self._toUnicode(markup, proposed)
1740
self.originalEncoding = proposed
1741
except Exception, e:
1742
# print "That didn't work!"
1745
#print "Correct encoding: %s" % proposed
1748
def _toUnicode(self, data, encoding):
1749
'''Given a string and its encoding, decodes the string into Unicode.
1750
%encoding is a string recognized by encodings.aliases'''
1752
# strip Byte Order Mark (if present)
1753
if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
1754
and (data[2:4] != '\x00\x00'):
1755
encoding = 'utf-16be'
1757
elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
1758
and (data[2:4] != '\x00\x00'):
1759
encoding = 'utf-16le'
1761
elif data[:3] == '\xef\xbb\xbf':
1764
elif data[:4] == '\x00\x00\xfe\xff':
1765
encoding = 'utf-32be'
1767
elif data[:4] == '\xff\xfe\x00\x00':
1768
encoding = 'utf-32le'
1770
newdata = unicode(data, encoding)
1773
def _detectEncoding(self, xml_data):
1774
"""Given a document, tries to detect its XML encoding."""
1775
xml_encoding = sniffed_xml_encoding = None
1777
if xml_data[:4] == '\x4c\x6f\xa7\x94':
1779
xml_data = self._ebcdic_to_ascii(xml_data)
1780
elif xml_data[:4] == '\x00\x3c\x00\x3f':
1782
sniffed_xml_encoding = 'utf-16be'
1783
xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
1784
elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
1785
and (xml_data[2:4] != '\x00\x00'):
1787
sniffed_xml_encoding = 'utf-16be'
1788
xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
1789
elif xml_data[:4] == '\x3c\x00\x3f\x00':
1791
sniffed_xml_encoding = 'utf-16le'
1792
xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
1793
elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
1794
(xml_data[2:4] != '\x00\x00'):
1796
sniffed_xml_encoding = 'utf-16le'
1797
xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
1798
elif xml_data[:4] == '\x00\x00\x00\x3c':
1800
sniffed_xml_encoding = 'utf-32be'
1801
xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
1802
elif xml_data[:4] == '\x3c\x00\x00\x00':
1804
sniffed_xml_encoding = 'utf-32le'
1805
xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
1806
elif xml_data[:4] == '\x00\x00\xfe\xff':
1808
sniffed_xml_encoding = 'utf-32be'
1809
xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
1810
elif xml_data[:4] == '\xff\xfe\x00\x00':
1812
sniffed_xml_encoding = 'utf-32le'
1813
xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
1814
elif xml_data[:3] == '\xef\xbb\xbf':
1816
sniffed_xml_encoding = 'utf-8'
1817
xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
1819
sniffed_xml_encoding = 'ascii'
1821
xml_encoding_match = re.compile \
1822
('^<\?.*encoding=[\'"](.*?)[\'"].*\?>')\
1825
xml_encoding_match = None
1826
if xml_encoding_match:
1827
xml_encoding = xml_encoding_match.groups()[0].lower()
1828
if sniffed_xml_encoding and \
1829
(xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
1830
'iso-10646-ucs-4', 'ucs-4', 'csucs4',
1831
'utf-16', 'utf-32', 'utf_16', 'utf_32',
1833
xml_encoding = sniffed_xml_encoding
1834
return xml_data, xml_encoding, sniffed_xml_encoding
1837
def find_codec(self, charset):
1838
return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
1839
or (charset and self._codec(charset.replace("-", ""))) \
1840
or (charset and self._codec(charset.replace("-", "_"))) \
1843
def _codec(self, charset):
1844
if not charset: return charset
1847
codecs.lookup(charset)
1849
except (LookupError, ValueError):
1853
EBCDIC_TO_ASCII_MAP = None
1854
def _ebcdic_to_ascii(self, s):
1856
if not c.EBCDIC_TO_ASCII_MAP:
1857
emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
1858
16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
1859
128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
1860
144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
1861
32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
1862
38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
1863
45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
1864
186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
1865
195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
1866
201,202,106,107,108,109,110,111,112,113,114,203,204,205,
1867
206,207,208,209,126,115,116,117,118,119,120,121,122,210,
1868
211,212,213,214,215,216,217,218,219,220,221,222,223,224,
1869
225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
1870
73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
1871
82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
1872
90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
1873
250,251,252,253,254,255)
1875
c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
1876
''.join(map(chr, range(256))), ''.join(map(chr, emap)))
1877
return s.translate(c.EBCDIC_TO_ASCII_MAP)
1879
MS_CHARS = { '\x80' : ('euro', '20AC'),
1881
'\x82' : ('sbquo', '201A'),
1882
'\x83' : ('fnof', '192'),
1883
'\x84' : ('bdquo', '201E'),
1884
'\x85' : ('hellip', '2026'),
1885
'\x86' : ('dagger', '2020'),
1886
'\x87' : ('Dagger', '2021'),
1887
'\x88' : ('circ', '2C6'),
1888
'\x89' : ('permil', '2030'),
1889
'\x8A' : ('Scaron', '160'),
1890
'\x8B' : ('lsaquo', '2039'),
1891
'\x8C' : ('OElig', '152'),
1893
'\x8E' : ('#x17D', '17D'),
1896
'\x91' : ('lsquo', '2018'),
1897
'\x92' : ('rsquo', '2019'),
1898
'\x93' : ('ldquo', '201C'),
1899
'\x94' : ('rdquo', '201D'),
1900
'\x95' : ('bull', '2022'),
1901
'\x96' : ('ndash', '2013'),
1902
'\x97' : ('mdash', '2014'),
1903
'\x98' : ('tilde', '2DC'),
1904
'\x99' : ('trade', '2122'),
1905
'\x9a' : ('scaron', '161'),
1906
'\x9b' : ('rsaquo', '203A'),
1907
'\x9c' : ('oelig', '153'),
1909
'\x9e' : ('#x17E', '17E'),
1910
'\x9f' : ('Yuml', ''),}
1912
#######################################################################
1915
#By default, act as an HTML pretty-printer.
1916
if __name__ == '__main__':
1918
soup = BeautifulSoup(sys.stdin.read())
1919
print soup.prettify()