#!/usr/bin/env python
"""Universal feed parser

Handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds

Visit http://feedparser.org/ for the latest version
Visit http://feedparser.org/docs/ for the latest documentation

Required: Python 2.1 or later
Recommended: Python 2.3 or later
Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
"""

__version__ = "4.1"# + "$Revision: 1.92 $"[11:15] + "-cvs"
__license__ = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice,
  this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE."""
__author__ = "Mark Pilgrim <http://diveintomark.org/>"
__contributors__ = ["Jason Diamond <http://injektilo.org/>",
                    "John Beimler <http://john.beimler.org/>",
                    "Fazal Majid <http://www.majid.info/mylos/weblog/>",
                    "Aaron Swartz <http://aaronsw.com/>",
                    "Kevin Marks <http://epeus.blogspot.com/>"]
_debug = 0
# HTTP "User-Agent" header to send to servers when downloading feeds.
# If you are embedding feedparser in a larger application, you should
# change this to your application name and URL.
USER_AGENT = "UniversalFeedParser/%s +http://feedparser.org/" % __version__
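
# Illustrative sketch (not part of the original module): an embedding
# application would typically override USER_AGENT before parsing; the
# application name and URL below are hypothetical.
#
#   import feedparser
#   feedparser.USER_AGENT = 'MyFeedReader/1.0 +http://example.com/myfeedreader/'
#   d = feedparser.parse('http://feedparser.org/docs/examples/atom10.xml')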

# HTTP "Accept" header to send to servers when downloading feeds. If you don't
# want to send an Accept header, set this to None.
ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"

# List of preferred XML parsers, by SAX driver name. These will be tried first,
# but if they're not installed, Python will keep searching through its own list
# of pre-installed parsers until it finds one that supports everything we need.
PREFERRED_XML_PARSERS = ["drv_libxml2"]

# If you want feedparser to automatically run HTML markup through HTML Tidy, set
# this to 1. Requires mxTidy <http://www.egenix.com/files/python/mxTidy.html>
# or utidylib <http://utidylib.berlios.de/>.
TIDY_MARKUP = 0

# List of Python interfaces for HTML Tidy, in order of preference. Only useful
# if TIDY_MARKUP = 1
PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"]

# ---------- required modules (should come with any Python distribution) ----------
import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi, urllib, urllib2
try:
    from cStringIO import StringIO as _StringIO
except:
    from StringIO import StringIO as _StringIO

# ---------- optional modules (feedparser will work without these, but with reduced functionality) ----------

# gzip is included with most Python distributions, but may not be available if you compiled your own
try:
    import gzip
except:
    gzip = None
try:
    import zlib
except:
    zlib = None

# If a real XML parser is available, feedparser will attempt to use it. feedparser has
# been tested with the built-in SAX parser, PyXML, and libxml2. On platforms where the
# Python distribution does not come with an XML parser (such as Mac OS X 10.2 and some
# versions of FreeBSD), feedparser will quietly fall back on regex-based parsing.
try:
    import xml.sax
    xml.sax.make_parser(PREFERRED_XML_PARSERS) # test for valid parsers
    from xml.sax.saxutils import escape as _xmlescape
    _XML_AVAILABLE = 1
except:
    _XML_AVAILABLE = 0
    def _xmlescape(data):
        data = data.replace('&', '&amp;')
        data = data.replace('>', '&gt;')
        data = data.replace('<', '&lt;')
        return data

# base64 support for Atom feeds that contain embedded binary data
try:
    import base64, binascii
except:
    base64 = binascii = None

# cjkcodecs and iconv_codec provide support for more character encodings.
# Both are available from http://cjkpython.i18n.org/
try:
    import cjkcodecs.aliases
except:
    pass

# chardet library auto-detects character encodings
# Download from http://chardet.feedparser.org/
try:
    import chardet
    if _debug:
        import chardet.constants
        chardet.constants._debug = 1
except:
    chardet = None

# ---------- don't touch these ----------
class ThingsNobodyCaresAboutButMe(Exception): pass
class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe): pass
class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe): pass
class NonXMLContentType(ThingsNobodyCaresAboutButMe): pass
class UndeclaredNamespace(Exception): pass

sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
sgmllib.special = re.compile('<!')
sgmllib.charref = re.compile('&#(x?[0-9A-Fa-f]+)[^0-9A-Fa-f]')

SUPPORTED_VERSIONS = {'': 'unknown',
                      'rss090': 'RSS 0.90',
                      'rss091n': 'RSS 0.91 (Netscape)',
                      'rss091u': 'RSS 0.91 (Userland)',
                      'rss092': 'RSS 0.92',
                      'rss093': 'RSS 0.93',
                      'rss094': 'RSS 0.94',
                      'rss20': 'RSS 2.0',
                      'rss10': 'RSS 1.0',
                      'rss': 'RSS (unknown version)',
                      'atom01': 'Atom 0.1',
                      'atom02': 'Atom 0.2',
                      'atom03': 'Atom 0.3',
                      'atom10': 'Atom 1.0',
                      'atom': 'Atom (unknown version)',
                      'cdf': 'CDF',
                      'hotrss': 'Hot RSS'
                      }

try:
    UserDict = dict
except NameError:
    # Python 2.1 does not have dict
    from UserDict import UserDict

class FeedParserDict(UserDict):
    keymap = {'channel': 'feed',
              'date_parsed': 'updated_parsed',
              'description': ['subtitle', 'summary'],
              'modified': 'updated',
              'modified_parsed': 'updated_parsed',
              'issued': 'published',
              'issued_parsed': 'published_parsed',
              'copyright': 'rights',
              'copyright_detail': 'rights_detail',
              'tagline': 'subtitle',
              'tagline_detail': 'subtitle_detail'}

    def __getitem__(self, key):
        if key == 'category':
            return UserDict.__getitem__(self, 'tags')[0]['term']
        if key == 'categories':
            return [(tag['scheme'], tag['term']) for tag in UserDict.__getitem__(self, 'tags')]
        realkey = self.keymap.get(key, key)
        if type(realkey) == types.ListType:
            for k in realkey:
                if UserDict.has_key(self, k):
                    return UserDict.__getitem__(self, k)
        if UserDict.has_key(self, key):
            return UserDict.__getitem__(self, key)
        return UserDict.__getitem__(self, realkey)

    def __setitem__(self, key, value):
        for k in self.keymap.keys():
            if key == k:
                key = self.keymap[k]
                if type(key) == types.ListType:
                    key = key[0]
        return UserDict.__setitem__(self, key, value)

    def get(self, key, default=None):
        if self.has_key(key):
            return self[key]
        else:
            return default

    def setdefault(self, key, value):
        if not self.has_key(key):
            self[key] = value
        return self[key]

    def has_key(self, key):
        try:
            return hasattr(self, key) or UserDict.has_key(self, key)
        except AttributeError:
            return False

    def __getattr__(self, key):
        try:
            return self.__dict__[key]
        except KeyError:
            pass
        try:
            assert not key.startswith('_')
            return self.__getitem__(key)
        except:
            raise AttributeError, "object has no attribute '%s'" % key

    def __setattr__(self, key, value):
        if key.startswith('_') or key == 'data':
            self.__dict__[key] = value
        else:
            return self.__setitem__(key, value)

    def __contains__(self, key):
        return self.has_key(key)
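
# Illustrative sketch (not part of the original module): FeedParserDict maps
# old key names onto their current equivalents, so pre-4.0 client code keeps
# working, and attribute access is routed through the same lookup.
#
#   d = FeedParserDict()
#   d['copyright'] = 'Copyright 2006'    # stored under the new key, 'rights'
#   assert d['rights'] == 'Copyright 2006'
#   assert d.rights == 'Copyright 2006'  # attribute-style access works too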

def zopeCompatibilityHack():
    global FeedParserDict
    del FeedParserDict
    def FeedParserDict(aDict=None):
        rc = {}
        if aDict:
            rc.update(aDict)
        return rc

_ebcdic_to_ascii_map = None
def _ebcdic_to_ascii(s):
    global _ebcdic_to_ascii_map
    if not _ebcdic_to_ascii_map:
        emap = (
            0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
            16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
            128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
            144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
            32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
            38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
            45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
            186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
            195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,201,
            202,106,107,108,109,110,111,112,113,114,203,204,205,206,207,208,
            209,126,115,116,117,118,119,120,121,122,210,211,212,213,214,215,
            216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,
            123,65,66,67,68,69,70,71,72,73,232,233,234,235,236,237,
            125,74,75,76,77,78,79,80,81,82,238,239,240,241,242,243,
            92,159,83,84,85,86,87,88,89,90,244,245,246,247,248,249,
            48,49,50,51,52,53,54,55,56,57,250,251,252,253,254,255
            )
        import string
        _ebcdic_to_ascii_map = string.maketrans( \
            ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
    return s.translate(_ebcdic_to_ascii_map)

_urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)$')
def _urljoin(base, uri):
    uri = _urifixer.sub(r'\1\3', uri)
    return urlparse.urljoin(base, uri)
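
# Illustrative sketch (not part of the original module): _urljoin resolves a
# possibly-relative URI against a base, after normalizing extra slashes.
#
#   >>> _urljoin('http://example.com/feeds/atom.xml', '/images/logo.png')
#   'http://example.com/images/logo.png'
#   >>> _urljoin('http://example.com/feeds/', 'entry1.html')
#   'http://example.com/feeds/entry1.html'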

class _FeedParserMixin:
    namespaces = {'': '',
                  'http://backend.userland.com/rss': '',
                  'http://blogs.law.harvard.edu/tech/rss': '',
                  'http://purl.org/rss/1.0/': '',
                  'http://my.netscape.com/rdf/simple/0.9/': '',
                  'http://example.com/newformat#': '',
                  'http://example.com/necho': '',
                  'http://purl.org/echo/': '',
                  'uri/of/echo/namespace#': '',
                  'http://purl.org/pie/': '',
                  'http://purl.org/atom/ns#': '',
                  'http://www.w3.org/2005/Atom': '',
                  'http://purl.org/rss/1.0/modules/rss091#': '',

                  'http://webns.net/mvcb/': 'admin',
                  'http://purl.org/rss/1.0/modules/aggregation/': 'ag',
                  'http://purl.org/rss/1.0/modules/annotate/': 'annotate',
                  'http://media.tangent.org/rss/1.0/': 'audio',
                  'http://backend.userland.com/blogChannelModule': 'blogChannel',
                  'http://web.resource.org/cc/': 'cc',
                  'http://backend.userland.com/creativeCommonsRssModule': 'creativeCommons',
                  'http://purl.org/rss/1.0/modules/company': 'co',
                  'http://purl.org/rss/1.0/modules/content/': 'content',
                  'http://my.theinfo.org/changed/1.0/rss/': 'cp',
                  'http://purl.org/dc/elements/1.1/': 'dc',
                  'http://purl.org/dc/terms/': 'dcterms',
                  'http://purl.org/rss/1.0/modules/email/': 'email',
                  'http://purl.org/rss/1.0/modules/event/': 'ev',
                  'http://rssnamespace.org/feedburner/ext/1.0': 'feedburner',
                  'http://freshmeat.net/rss/fm/': 'fm',
                  'http://xmlns.com/foaf/0.1/': 'foaf',
                  'http://www.w3.org/2003/01/geo/wgs84_pos#': 'geo',
                  'http://postneo.com/icbm/': 'icbm',
                  'http://purl.org/rss/1.0/modules/image/': 'image',
                  'http://www.itunes.com/DTDs/PodCast-1.0.dtd': 'itunes',
                  'http://example.com/DTDs/PodCast-1.0.dtd': 'itunes',
                  'http://purl.org/rss/1.0/modules/link/': 'l',
                  'http://search.yahoo.com/mrss': 'media',
                  'http://madskills.com/public/xml/rss/module/pingback/': 'pingback',
                  'http://prismstandard.org/namespaces/1.2/basic/': 'prism',
                  'http://www.w3.org/1999/02/22-rdf-syntax-ns#': 'rdf',
                  'http://www.w3.org/2000/01/rdf-schema#': 'rdfs',
                  'http://purl.org/rss/1.0/modules/reference/': 'ref',
                  'http://purl.org/rss/1.0/modules/richequiv/': 'reqv',
                  'http://purl.org/rss/1.0/modules/search/': 'search',
                  'http://purl.org/rss/1.0/modules/slash/': 'slash',
                  'http://schemas.xmlsoap.org/soap/envelope/': 'soap',
                  'http://purl.org/rss/1.0/modules/servicestatus/': 'ss',
                  'http://hacks.benhammersley.com/rss/streaming/': 'str',
                  'http://purl.org/rss/1.0/modules/subscription/': 'sub',
                  'http://purl.org/rss/1.0/modules/syndication/': 'sy',
                  'http://purl.org/rss/1.0/modules/taxonomy/': 'taxo',
                  'http://purl.org/rss/1.0/modules/threading/': 'thr',
                  'http://purl.org/rss/1.0/modules/textinput/': 'ti',
                  'http://madskills.com/public/xml/rss/module/trackback/': 'trackback',
                  'http://wellformedweb.org/commentAPI/': 'wfw',
                  'http://purl.org/rss/1.0/modules/wiki/': 'wiki',
                  'http://www.w3.org/1999/xhtml': 'xhtml',
                  'http://www.w3.org/XML/1998/namespace': 'xml',
                  'http://schemas.pocketsoap.com/rss/myDescModule/': 'szf'
                  }
    _matchnamespaces = {}

    can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'license', 'icon', 'logo']
    can_contain_relative_uris = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
    can_contain_dangerous_markup = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
    html_types = ['text/html', 'application/xhtml+xml']

    def __init__(self, baseuri=None, baselang=None, encoding='utf-8'):
        if _debug: sys.stderr.write('initializing FeedParser\n')
        if not self._matchnamespaces:
            for k, v in self.namespaces.items():
                self._matchnamespaces[k.lower()] = v
        self.feeddata = FeedParserDict() # feed-level data
        self.encoding = encoding # character encoding
        self.entries = [] # list of entry-level data
        self.version = '' # feed type/version, see SUPPORTED_VERSIONS
        self.namespacesInUse = {} # dictionary of namespaces defined by the feed

        # the following are used internally to track state;
        # this is really out of control and should be refactored
        self.infeed = 0
        self.inentry = 0
        self.incontent = 0
        self.intextinput = 0
        self.inimage = 0
        self.inauthor = 0
        self.incontributor = 0
        self.inpublisher = 0
        self.insource = 0
        self.sourcedata = FeedParserDict()
        self.contentparams = FeedParserDict()
        self._summaryKey = None
        self.namespacemap = {}
        self.elementstack = []
        self.basestack = []
        self.langstack = []
        self.baseuri = baseuri or ''
        self.lang = baselang or None
        if baselang:
            self.feeddata['language'] = baselang

    def unknown_starttag(self, tag, attrs):
        if _debug: sys.stderr.write('start %s with %s\n' % (tag, attrs))
        # normalize attrs
        attrs = [(k.lower(), v) for k, v in attrs]
        attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]

        # track xml:base and xml:lang
        attrsD = dict(attrs)
        baseuri = attrsD.get('xml:base', attrsD.get('base')) or self.baseuri
        self.baseuri = _urljoin(self.baseuri, baseuri)
        lang = attrsD.get('xml:lang', attrsD.get('lang'))
        if lang == '':
            # xml:lang could be explicitly set to '', we need to capture that
            lang = None
        elif lang is None:
            # if no xml:lang is specified, use parent lang
            lang = self.lang
        if lang:
            if tag in ('feed', 'rss', 'rdf:RDF'):
                self.feeddata['language'] = lang
        self.lang = lang
        self.basestack.append(self.baseuri)
        self.langstack.append(lang)

        # track namespaces
        for prefix, uri in attrs:
            if prefix.startswith('xmlns:'):
                self.trackNamespace(prefix[6:], uri)
            elif prefix == 'xmlns':
                self.trackNamespace(None, uri)

        # track inline content
        if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
            # element declared itself as escaped markup, but it isn't really
            self.contentparams['type'] = 'application/xhtml+xml'
        if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
            # Note: probably shouldn't simply recreate localname here, but
            # our namespace handling isn't actually 100% correct in cases where
            # the feed redefines the default namespace (which is actually
            # the usual case for inline content, thanks Sam), so here we
            # cheat and just reconstruct the element based on localname
            # because that compensates for the bugs in our namespace handling.
            # This will horribly munge inline content with non-empty qnames,
            # but nobody actually does that, so I'm not fixing it.
            tag = tag.split(':')[-1]
            return self.handle_data('<%s%s>' % (tag, ''.join([' %s="%s"' % t for t in attrs])), escape=0)

        # match namespaces
        if tag.find(':') != -1:
            prefix, suffix = tag.split(':', 1)
        else:
            prefix, suffix = '', tag
        prefix = self.namespacemap.get(prefix, prefix)
        if prefix:
            prefix = prefix + '_'

        # special hack for better tracking of empty textinput/image elements in illformed feeds
        if (not prefix) and tag not in ('title', 'link', 'description', 'name'):
            self.intextinput = 0
        if (not prefix) and tag not in ('title', 'link', 'description', 'url', 'href', 'width', 'height'):
            self.inimage = 0

        # call special handler (if defined) or default handler
        methodname = '_start_' + prefix + suffix
        try:
            method = getattr(self, methodname)
            return method(attrsD)
        except AttributeError:
            return self.push(prefix + suffix, 1)

    def unknown_endtag(self, tag):
        if _debug: sys.stderr.write('end %s\n' % tag)
        # match namespaces
        if tag.find(':') != -1:
            prefix, suffix = tag.split(':', 1)
        else:
            prefix, suffix = '', tag
        prefix = self.namespacemap.get(prefix, prefix)
        if prefix:
            prefix = prefix + '_'

        # call special handler (if defined) or default handler
        methodname = '_end_' + prefix + suffix
        try:
            method = getattr(self, methodname)
            method()
        except AttributeError:
            self.pop(prefix + suffix)

        # track inline content
        if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
            # element declared itself as escaped markup, but it isn't really
            self.contentparams['type'] = 'application/xhtml+xml'
        if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
            tag = tag.split(':')[-1]
            self.handle_data('</%s>' % tag, escape=0)

        # track xml:base and xml:lang going out of scope
        if self.basestack:
            self.basestack.pop()
            if self.basestack and self.basestack[-1]:
                self.baseuri = self.basestack[-1]
        if self.langstack:
            self.langstack.pop()
            if self.langstack: # and (self.langstack[-1] is not None):
                self.lang = self.langstack[-1]

    def handle_charref(self, ref):
        # called for each character reference, e.g. for '&#160;', ref will be '160'
        if not self.elementstack: return
        ref = ref.lower()
        if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'):
            text = '&#%s;' % ref
        else:
            if ref[0] == 'x':
                c = int(ref[1:], 16)
            else:
                c = int(ref)
            text = unichr(c).encode('utf-8')
        self.elementstack[-1][2].append(text)

    def handle_entityref(self, ref):
        # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
        if not self.elementstack: return
        if _debug: sys.stderr.write('entering handle_entityref with %s\n' % ref)
        if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
            text = '&%s;' % ref
        else:
            # entity resolution graciously donated by Aaron Swartz
            def name2cp(k):
                import htmlentitydefs
                if hasattr(htmlentitydefs, 'name2codepoint'): # requires Python 2.3
                    return htmlentitydefs.name2codepoint[k]
                k = htmlentitydefs.entitydefs[k]
                if k.startswith('&#') and k.endswith(';'):
                    return int(k[2:-1]) # not in latin-1
                return ord(k)
            try: name2cp(ref)
            except KeyError: text = '&%s;' % ref
            else: text = unichr(name2cp(ref)).encode('utf-8')
        self.elementstack[-1][2].append(text)

    def handle_data(self, text, escape=1):
        # called for each block of plain text, i.e. outside of any tag and
        # not containing any character or entity references
        if not self.elementstack: return
        if escape and self.contentparams.get('type') == 'application/xhtml+xml':
            text = _xmlescape(text)
        self.elementstack[-1][2].append(text)

    def handle_comment(self, text):
        # called for each comment, e.g. <!-- insert message here -->
        pass

    def handle_pi(self, text):
        # called for each processing instruction, e.g. <?instruction>
        pass

    def handle_decl(self, text):
        pass

    def parse_declaration(self, i):
        # override internal declaration handler to handle CDATA blocks
        if _debug: sys.stderr.write('entering parse_declaration\n')
        if self.rawdata[i:i+9] == '<![CDATA[':
            k = self.rawdata.find(']]>', i)
            if k == -1: k = len(self.rawdata)
            self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0)
            return k+3
        else:
            k = self.rawdata.find('>', i)
            return k+1

    def mapContentType(self, contentType):
        contentType = contentType.lower()
        if contentType == 'text':
            contentType = 'text/plain'
        elif contentType == 'html':
            contentType = 'text/html'
        elif contentType == 'xhtml':
            contentType = 'application/xhtml+xml'
        return contentType

    def trackNamespace(self, prefix, uri):
        loweruri = uri.lower()
        if (prefix, loweruri) == (None, 'http://my.netscape.com/rdf/simple/0.9/') and not self.version:
            self.version = 'rss090'
        if loweruri == 'http://purl.org/rss/1.0/' and not self.version:
            self.version = 'rss10'
        if loweruri == 'http://www.w3.org/2005/atom' and not self.version:
            self.version = 'atom10'
        if loweruri.find('backend.userland.com/rss') != -1:
            # match any backend.userland.com namespace
            uri = 'http://backend.userland.com/rss'
            loweruri = uri
        if self._matchnamespaces.has_key(loweruri):
            self.namespacemap[prefix] = self._matchnamespaces[loweruri]
            self.namespacesInUse[self._matchnamespaces[loweruri]] = uri
        else:
            self.namespacesInUse[prefix or ''] = uri

    def resolveURI(self, uri):
        return _urljoin(self.baseuri or '', uri)

    def decodeEntities(self, element, data):
        return data

    def push(self, element, expectingText):
        self.elementstack.append([element, expectingText, []])

    def pop(self, element, stripWhitespace=1):
        if not self.elementstack: return
        if self.elementstack[-1][0] != element: return

        element, expectingText, pieces = self.elementstack.pop()
        output = ''.join(pieces)
        if stripWhitespace:
            output = output.strip()
        if not expectingText: return output

        # decode base64 content
        if base64 and self.contentparams.get('base64', 0):
            try:
                output = base64.decodestring(output)
            except binascii.Error:
                pass
            except binascii.Incomplete:
                pass

        # resolve relative URIs
        if (element in self.can_be_relative_uri) and output:
            output = self.resolveURI(output)

        # decode entities within embedded markup
        if not self.contentparams.get('base64', 0):
            output = self.decodeEntities(element, output)

        # remove temporary cruft from contentparams
        try:
            del self.contentparams['mode']
        except KeyError:
            pass
        try:
            del self.contentparams['base64']
        except KeyError:
            pass

        # resolve relative URIs within embedded markup
        if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types:
            if element in self.can_contain_relative_uris:
                output = _resolveRelativeURIs(output, self.baseuri, self.encoding)

        # sanitize embedded markup
        if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types:
            if element in self.can_contain_dangerous_markup:
                output = _sanitizeHTML(output, self.encoding)

        if self.encoding and type(output) != type(u''):
            try:
                output = unicode(output, self.encoding)
            except:
                pass

        # categories/tags/keywords/whatever are handled in _end_category
        if element == 'category':
            return output

        # store output in appropriate place(s)
        if self.inentry and not self.insource:
            if element == 'content':
                self.entries[-1].setdefault(element, [])
                contentparams = copy.deepcopy(self.contentparams)
                contentparams['value'] = output
                self.entries[-1][element].append(contentparams)
            elif element == 'link':
                self.entries[-1][element] = output
                if output:
                    self.entries[-1]['links'][-1]['href'] = output
            else:
                if element == 'description':
                    element = 'summary'
                self.entries[-1][element] = output
                if self.incontent:
                    contentparams = copy.deepcopy(self.contentparams)
                    contentparams['value'] = output
                    self.entries[-1][element + '_detail'] = contentparams
        elif (self.infeed or self.insource) and (not self.intextinput) and (not self.inimage):
            context = self._getContext()
            if element == 'description':
                element = 'subtitle'
            context[element] = output
            if element == 'link':
                context['links'][-1]['href'] = output
            elif self.incontent:
                contentparams = copy.deepcopy(self.contentparams)
                contentparams['value'] = output
                context[element + '_detail'] = contentparams
        return output

    def pushContent(self, tag, attrsD, defaultContentType, expectingText):
        self.incontent += 1
        self.contentparams = FeedParserDict({
            'type': self.mapContentType(attrsD.get('type', defaultContentType)),
            'language': self.lang,
            'base': self.baseuri})
        self.contentparams['base64'] = self._isBase64(attrsD, self.contentparams)
        self.push(tag, expectingText)

    def popContent(self, tag):
        value = self.pop(tag)
        self.incontent -= 1
        self.contentparams.clear()
        return value

    def _mapToStandardPrefix(self, name):
        colonpos = name.find(':')
        if colonpos != -1:
            prefix = name[:colonpos]
            suffix = name[colonpos+1:]
            prefix = self.namespacemap.get(prefix, prefix)
            name = prefix + ':' + suffix
        return name

    def _getAttribute(self, attrsD, name):
        return attrsD.get(self._mapToStandardPrefix(name))

    def _isBase64(self, attrsD, contentparams):
        if attrsD.get('mode', '') == 'base64':
            return 1
        if self.contentparams['type'].startswith('text/'):
            return 0
        if self.contentparams['type'].endswith('+xml'):
            return 0
        if self.contentparams['type'].endswith('/xml'):
            return 0
        return 1

    def _itsAnHrefDamnIt(self, attrsD):
        href = attrsD.get('url', attrsD.get('uri', attrsD.get('href', None)))
        if href:
            try: del attrsD['url']
            except KeyError: pass
            try: del attrsD['uri']
            except KeyError: pass
            attrsD['href'] = href
        return attrsD

    def _save(self, key, value):
        context = self._getContext()
        context.setdefault(key, value)

    def _start_rss(self, attrsD):
        versionmap = {'0.91': 'rss091u',
                      '0.92': 'rss092',
                      '0.93': 'rss093',
                      '0.94': 'rss094'}
        if not self.version:
            attr_version = attrsD.get('version', '')
            version = versionmap.get(attr_version)
            if version:
                self.version = version
            elif attr_version.startswith('2.'):
                self.version = 'rss20'
            else:
                self.version = 'rss'

    def _start_dlhottitles(self, attrsD):
        self.version = 'hotrss'

    def _start_channel(self, attrsD):
        self.infeed = 1
        self._cdf_common(attrsD)
    _start_feedinfo = _start_channel

    def _cdf_common(self, attrsD):
        if attrsD.has_key('lastmod'):
            self._start_modified({})
            self.elementstack[-1][-1] = attrsD['lastmod']
            self._end_modified()
        if attrsD.has_key('href'):
            self._start_link({})
            self.elementstack[-1][-1] = attrsD['href']
            self._end_link()

    def _start_feed(self, attrsD):
        self.infeed = 1
        versionmap = {'0.1': 'atom01',
                      '0.2': 'atom02',
                      '0.3': 'atom03'}
        if not self.version:
            attr_version = attrsD.get('version')
            version = versionmap.get(attr_version)
            if version:
                self.version = version
            else:
                self.version = 'atom'

    def _end_channel(self):
        self.infeed = 0
    _end_feed = _end_channel

    def _start_image(self, attrsD):
        self.inimage = 1
        self.push('image', 0)
        context = self._getContext()
        context.setdefault('image', FeedParserDict())

    def _end_image(self):
        self.pop('image')
        self.inimage = 0

    def _start_textinput(self, attrsD):
        self.intextinput = 1
        self.push('textinput', 0)
        context = self._getContext()
        context.setdefault('textinput', FeedParserDict())
    _start_textInput = _start_textinput

    def _end_textinput(self):
        self.pop('textinput')
        self.intextinput = 0
    _end_textInput = _end_textinput

    def _start_author(self, attrsD):
        self.inauthor = 1
        self.push('author', 1)
    _start_managingeditor = _start_author
    _start_dc_author = _start_author
    _start_dc_creator = _start_author
    _start_itunes_author = _start_author

    def _end_author(self):
        self.pop('author')
        self.inauthor = 0
        self._sync_author_detail()
    _end_managingeditor = _end_author
    _end_dc_author = _end_author
    _end_dc_creator = _end_author
    _end_itunes_author = _end_author

    def _start_itunes_owner(self, attrsD):
        self.inpublisher = 1
        self.push('publisher', 0)

    def _end_itunes_owner(self):
        self.pop('publisher')
        self.inpublisher = 0
        self._sync_author_detail('publisher')

    def _start_contributor(self, attrsD):
        self.incontributor = 1
        context = self._getContext()
        context.setdefault('contributors', [])
        context['contributors'].append(FeedParserDict())
        self.push('contributor', 0)

    def _end_contributor(self):
        self.pop('contributor')
        self.incontributor = 0

    def _start_dc_contributor(self, attrsD):
        self.incontributor = 1
        context = self._getContext()
        context.setdefault('contributors', [])
        context['contributors'].append(FeedParserDict())
        self.push('name', 0)

    def _end_dc_contributor(self):
        self._end_name()
        self.incontributor = 0

    def _start_name(self, attrsD):
        self.push('name', 0)
    _start_itunes_name = _start_name

    def _end_name(self):
        value = self.pop('name')
        if self.inpublisher:
            self._save_author('name', value, 'publisher')
        elif self.inauthor:
            self._save_author('name', value)
        elif self.incontributor:
            self._save_contributor('name', value)
        elif self.intextinput:
            context = self._getContext()
            context['textinput']['name'] = value
    _end_itunes_name = _end_name

    def _start_width(self, attrsD):
        self.push('width', 0)

    def _end_width(self):
        value = self.pop('width')
        try:
            value = int(value)
        except:
            value = 0
        if self.inimage:
            context = self._getContext()
            context['image']['width'] = value

    def _start_height(self, attrsD):
        self.push('height', 0)

    def _end_height(self):
        value = self.pop('height')
        try:
            value = int(value)
        except:
            value = 0
        if self.inimage:
            context = self._getContext()
            context['image']['height'] = value

    def _start_url(self, attrsD):
        self.push('href', 1)
    _start_homepage = _start_url
    _start_uri = _start_url

    def _end_url(self):
        value = self.pop('href')
        if self.inauthor:
            self._save_author('href', value)
        elif self.incontributor:
            self._save_contributor('href', value)
        elif self.inimage:
            context = self._getContext()
            context['image']['href'] = value
        elif self.intextinput:
            context = self._getContext()
            context['textinput']['link'] = value
    _end_homepage = _end_url
    _end_uri = _end_url

    def _start_email(self, attrsD):
        self.push('email', 0)
    _start_itunes_email = _start_email

    def _end_email(self):
        value = self.pop('email')
        if self.inpublisher:
            self._save_author('email', value, 'publisher')
        elif self.inauthor:
            self._save_author('email', value)
        elif self.incontributor:
            self._save_contributor('email', value)
    _end_itunes_email = _end_email

    def _getContext(self):
        if self.insource:
            context = self.sourcedata
        elif self.inentry:
            context = self.entries[-1]
        else:
            context = self.feeddata
        return context

    def _save_author(self, key, value, prefix='author'):
        context = self._getContext()
        context.setdefault(prefix + '_detail', FeedParserDict())
        context[prefix + '_detail'][key] = value
        self._sync_author_detail()

    def _save_contributor(self, key, value):
        context = self._getContext()
        context.setdefault('contributors', [FeedParserDict()])
        context['contributors'][-1][key] = value

    def _sync_author_detail(self, key='author'):
        context = self._getContext()
        detail = context.get('%s_detail' % key)
        if detail:
            name = detail.get('name')
            email = detail.get('email')
            if name and email:
                context[key] = '%s (%s)' % (name, email)
            elif name:
                context[key] = name
            elif email:
                context[key] = email
        else:
            author = context.get(key)
            if not author: return
            emailmatch = re.search(r'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))''', author)
            if not emailmatch: return
            email = emailmatch.group(0)
            # probably a better way to do the following, but it passes all the tests
            author = author.replace(email, '')
            author = author.replace('()', '')
            author = author.strip()
            if author and (author[0] == '('):
                author = author[1:]
            if author and (author[-1] == ')'):
                author = author[:-1]
            author = author.strip()
            context.setdefault('%s_detail' % key, FeedParserDict())
            context['%s_detail' % key]['name'] = author
            context['%s_detail' % key]['email'] = email

    def _start_subtitle(self, attrsD):
        self.pushContent('subtitle', attrsD, 'text/plain', 1)
    _start_tagline = _start_subtitle
    _start_itunes_subtitle = _start_subtitle

    def _end_subtitle(self):
        self.popContent('subtitle')
    _end_tagline = _end_subtitle
    _end_itunes_subtitle = _end_subtitle

    def _start_rights(self, attrsD):
        self.pushContent('rights', attrsD, 'text/plain', 1)
    _start_dc_rights = _start_rights
    _start_copyright = _start_rights

    def _end_rights(self):
        self.popContent('rights')
    _end_dc_rights = _end_rights
    _end_copyright = _end_rights

    def _start_item(self, attrsD):
        self.entries.append(FeedParserDict())
        self.push('item', 0)
        self.inentry = 1
        self.guidislink = 0
        id = self._getAttribute(attrsD, 'rdf:about')
        if id:
            context = self._getContext()
            context['id'] = id
        self._cdf_common(attrsD)
    _start_entry = _start_item
    _start_product = _start_item

    def _end_item(self):
        self.pop('item')
        self.inentry = 0
    _end_entry = _end_item

    def _start_dc_language(self, attrsD):
        self.push('language', 1)
    _start_language = _start_dc_language

    def _end_dc_language(self):
        self.lang = self.pop('language')
    _end_language = _end_dc_language

    def _start_dc_publisher(self, attrsD):
        self.push('publisher', 1)
    _start_webmaster = _start_dc_publisher

    def _end_dc_publisher(self):
        self.pop('publisher')
        self._sync_author_detail('publisher')
    _end_webmaster = _end_dc_publisher

    def _start_published(self, attrsD):
        self.push('published', 1)
    _start_dcterms_issued = _start_published
    _start_issued = _start_published

    def _end_published(self):
        value = self.pop('published')
        self._save('published_parsed', _parse_date(value))
    _end_dcterms_issued = _end_published
    _end_issued = _end_published

    def _start_updated(self, attrsD):
        self.push('updated', 1)
    _start_modified = _start_updated
    _start_dcterms_modified = _start_updated
    _start_pubdate = _start_updated
    _start_dc_date = _start_updated

    def _end_updated(self):
        value = self.pop('updated')
        parsed_value = _parse_date(value)
        self._save('updated_parsed', parsed_value)
    _end_modified = _end_updated
    _end_dcterms_modified = _end_updated
    _end_pubdate = _end_updated
    _end_dc_date = _end_updated

    def _start_created(self, attrsD):
        self.push('created', 1)
    _start_dcterms_created = _start_created

    def _end_created(self):
        value = self.pop('created')
        self._save('created_parsed', _parse_date(value))
    _end_dcterms_created = _end_created

    def _start_expirationdate(self, attrsD):
        self.push('expired', 1)

    def _end_expirationdate(self):
        self._save('expired_parsed', _parse_date(self.pop('expired')))

    def _start_cc_license(self, attrsD):
        self.push('license', 1)
        value = self._getAttribute(attrsD, 'rdf:resource')
        if value:
            self.elementstack[-1][2].append(value)
        self.pop('license')

    def _start_creativecommons_license(self, attrsD):
        self.push('license', 1)

    def _end_creativecommons_license(self):
        self.pop('license')

    def _addTag(self, term, scheme, label):
        context = self._getContext()
        tags = context.setdefault('tags', [])
        if (not term) and (not scheme) and (not label): return
        value = FeedParserDict({'term': term, 'scheme': scheme, 'label': label})
        if value not in tags:
            tags.append(FeedParserDict({'term': term, 'scheme': scheme, 'label': label}))

    def _start_category(self, attrsD):
        if _debug: sys.stderr.write('entering _start_category with %s\n' % repr(attrsD))
        term = attrsD.get('term')
        scheme = attrsD.get('scheme', attrsD.get('domain'))
        label = attrsD.get('label')
        self._addTag(term, scheme, label)
        self.push('category', 1)
    _start_dc_subject = _start_category
    _start_keywords = _start_category

    def _end_itunes_keywords(self):
        for term in self.pop('itunes_keywords').split():
            self._addTag(term, 'http://www.itunes.com/', None)

    def _start_itunes_category(self, attrsD):
        self._addTag(attrsD.get('text'), 'http://www.itunes.com/', None)
        self.push('category', 1)

    def _end_category(self):
        value = self.pop('category')
        if not value: return
        context = self._getContext()
        tags = context['tags']
        if value and len(tags) and not tags[-1]['term']:
            tags[-1]['term'] = value
        else:
            self._addTag(value, None, None)
    _end_dc_subject = _end_category
    _end_keywords = _end_category
    _end_itunes_category = _end_category

    def _start_cloud(self, attrsD):
        self._getContext()['cloud'] = FeedParserDict(attrsD)

    def _start_link(self, attrsD):
        attrsD.setdefault('rel', 'alternate')
        attrsD.setdefault('type', 'text/html')
        attrsD = self._itsAnHrefDamnIt(attrsD)
        if attrsD.has_key('href'):
            attrsD['href'] = self.resolveURI(attrsD['href'])
        expectingText = self.infeed or self.inentry or self.insource
        context = self._getContext()
        context.setdefault('links', [])
        context['links'].append(FeedParserDict(attrsD))
        if attrsD['rel'] == 'enclosure':
            self._start_enclosure(attrsD)
        if attrsD.has_key('href'):
            expectingText = 0
            if (attrsD.get('rel') == 'alternate') and (self.mapContentType(attrsD.get('type')) in self.html_types):
                context['link'] = attrsD['href']
        else:
            self.push('link', expectingText)
    _start_producturl = _start_link

    def _end_link(self):
        value = self.pop('link')
        context = self._getContext()
        if self.intextinput:
            context['textinput']['link'] = value
        if self.inimage:
            context['image']['link'] = value
    _end_producturl = _end_link

    def _start_guid(self, attrsD):
        self.guidislink = (attrsD.get('ispermalink', 'true') == 'true')
        self.push('id', 1)

    def _end_guid(self):
        value = self.pop('id')
        self._save('guidislink', self.guidislink and not self._getContext().has_key('link'))
        self._save('id', value)
        if self.guidislink:
            # guid acts as link, but only if 'ispermalink' is not present or is 'true',
            # and only if the item doesn't already have a link element
            self._save('link', value)

    def _start_title(self, attrsD):
        self.pushContent('title', attrsD, 'text/plain', self.infeed or self.inentry or self.insource)
    _start_dc_title = _start_title
    _start_media_title = _start_title

    def _end_title(self):
        value = self.popContent('title')
        context = self._getContext()
        if self.intextinput:
            context['textinput']['title'] = value
        elif self.inimage:
            context['image']['title'] = value
    _end_dc_title = _end_title
    _end_media_title = _end_title

    def _start_description(self, attrsD):
        context = self._getContext()
        if context.has_key('summary'):
            self._summaryKey = 'content'
            self._start_content(attrsD)
        else:
            self.pushContent('description', attrsD, 'text/html', self.infeed or self.inentry or self.insource)

    def _start_abstract(self, attrsD):
        self.pushContent('description', attrsD, 'text/plain', self.infeed or self.inentry or self.insource)

    def _end_description(self):
        if self._summaryKey == 'content':
            self._end_content()
        else:
            value = self.popContent('description')
            context = self._getContext()
            if self.intextinput:
                context['textinput']['description'] = value
            elif self.inimage:
                context['image']['description'] = value
        self._summaryKey = None
    _end_abstract = _end_description

    def _start_info(self, attrsD):
        self.pushContent('info', attrsD, 'text/plain', 1)
    _start_feedburner_browserfriendly = _start_info

    def _end_info(self):
        self.popContent('info')
    _end_feedburner_browserfriendly = _end_info

    def _start_generator(self, attrsD):
        if attrsD:
            attrsD = self._itsAnHrefDamnIt(attrsD)
            if attrsD.has_key('href'):
                attrsD['href'] = self.resolveURI(attrsD['href'])
        self._getContext()['generator_detail'] = FeedParserDict(attrsD)
        self.push('generator', 1)

    def _end_generator(self):
        value = self.pop('generator')
        context = self._getContext()
        if context.has_key('generator_detail'):
            context['generator_detail']['name'] = value

    def _start_admin_generatoragent(self, attrsD):
        self.push('generator', 1)
        value = self._getAttribute(attrsD, 'rdf:resource')
        if value:
            self.elementstack[-1][2].append(value)
        self.pop('generator')
        self._getContext()['generator_detail'] = FeedParserDict({'href': value})

    def _start_admin_errorreportsto(self, attrsD):
        self.push('errorreportsto', 1)
        value = self._getAttribute(attrsD, 'rdf:resource')
        if value:
            self.elementstack[-1][2].append(value)
        self.pop('errorreportsto')

    def _start_summary(self, attrsD):
        context = self._getContext()
        if context.has_key('summary'):
            self._summaryKey = 'content'
            self._start_content(attrsD)
        else:
            self._summaryKey = 'summary'
            self.pushContent(self._summaryKey, attrsD, 'text/plain', 1)
    _start_itunes_summary = _start_summary

    def _end_summary(self):
        if self._summaryKey == 'content':
            self._end_content()
        else:
            self.popContent(self._summaryKey or 'summary')
        self._summaryKey = None
    _end_itunes_summary = _end_summary

    def _start_enclosure(self, attrsD):
        attrsD = self._itsAnHrefDamnIt(attrsD)
        self._getContext().setdefault('enclosures', []).append(FeedParserDict(attrsD))
        href = attrsD.get('href')
        if href:
            context = self._getContext()
            if not context.get('id'):
                context['id'] = href

    def _start_source(self, attrsD):
        self.insource = 1

    def _end_source(self):
        self.insource = 0
        self._getContext()['source'] = copy.deepcopy(self.sourcedata)
        self.sourcedata.clear()

    def _start_content(self, attrsD):
        self.pushContent('content', attrsD, 'text/plain', 1)
        src = attrsD.get('src')
        if src:
            self.contentparams['src'] = src
        self.push('content', 1)

    def _start_prodlink(self, attrsD):
        self.pushContent('content', attrsD, 'text/html', 1)

    def _start_body(self, attrsD):
        self.pushContent('content', attrsD, 'application/xhtml+xml', 1)
    _start_xhtml_body = _start_body

    def _start_content_encoded(self, attrsD):
        self.pushContent('content', attrsD, 'text/html', 1)
    _start_fullitem = _start_content_encoded

    def _end_content(self):
        copyToDescription = self.mapContentType(self.contentparams.get('type')) in (['text/plain'] + self.html_types)
        value = self.popContent('content')
        if copyToDescription:
            self._save('description', value)
    _end_body = _end_content
    _end_xhtml_body = _end_content
    _end_content_encoded = _end_content
    _end_fullitem = _end_content
    _end_prodlink = _end_content

    def _start_itunes_image(self, attrsD):
        self.push('itunes_image', 0)
        self._getContext()['image'] = FeedParserDict({'href': attrsD.get('href')})
    _start_itunes_link = _start_itunes_image

    def _end_itunes_block(self):
        value = self.pop('itunes_block', 0)
        self._getContext()['itunes_block'] = (value == 'yes') and 1 or 0

    def _end_itunes_explicit(self):
        value = self.pop('itunes_explicit', 0)
        self._getContext()['itunes_explicit'] = (value == 'yes') and 1 or 0

class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler):
    def __init__(self, baseuri, baselang, encoding):
        if _debug: sys.stderr.write('trying StrictFeedParser\n')
        xml.sax.handler.ContentHandler.__init__(self)
        _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
        self.bozo = 0
        self.exc = None

    def startPrefixMapping(self, prefix, uri):
        self.trackNamespace(prefix, uri)

    def startElementNS(self, name, qname, attrs):
        namespace, localname = name
        lowernamespace = str(namespace or '').lower()
        if lowernamespace.find('backend.userland.com/rss') != -1:
            # match any backend.userland.com namespace
            namespace = 'http://backend.userland.com/rss'
            lowernamespace = namespace
        if qname and qname.find(':') > 0:
            givenprefix = qname.split(':')[0]
        else:
            givenprefix = None
        prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
        if givenprefix and (prefix == None or (prefix == '' and lowernamespace == '')) and not self.namespacesInUse.has_key(givenprefix):
            raise UndeclaredNamespace, "'%s' is not associated with a namespace" % givenprefix
        if prefix:
            localname = prefix + ':' + localname
        localname = str(localname).lower()
        if _debug: sys.stderr.write('startElementNS: qname = %s, namespace = %s, givenprefix = %s, prefix = %s, attrs = %s, localname = %s\n' % (qname, namespace, givenprefix, prefix, attrs.items(), localname))

        # qname implementation is horribly broken in Python 2.1 (it
        # doesn't report any), and slightly broken in Python 2.2 (it
        # doesn't report the xml: namespace). So we match up namespaces
        # with a known list first, and then possibly override them with
        # the qnames the SAX parser gives us (if indeed it gives us any
        # at all). Thanks to MatejC for helping me test this and
        # tirelessly telling me that it didn't work yet.
        attrsD = {}
        for (namespace, attrlocalname), attrvalue in attrs._attrs.items():
            lowernamespace = (namespace or '').lower()
            prefix = self._matchnamespaces.get(lowernamespace, '')
            if prefix:
                attrlocalname = prefix + ':' + attrlocalname
            attrsD[str(attrlocalname).lower()] = attrvalue
        for qname in attrs.getQNames():
            attrsD[str(qname).lower()] = attrs.getValueByQName(qname)
        self.unknown_starttag(localname, attrsD.items())

    def characters(self, text):
        self.handle_data(text)

    def endElementNS(self, name, qname):
        namespace, localname = name
        lowernamespace = str(namespace or '').lower()
        if qname and qname.find(':') > 0:
            givenprefix = qname.split(':')[0]
        else:
            givenprefix = ''
        prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
        if prefix:
            localname = prefix + ':' + localname
        localname = str(localname).lower()
        self.unknown_endtag(localname)

    def error(self, exc):
        self.bozo = 1
        self.exc = exc

    def fatalError(self, exc):
        self.error(exc)
        raise exc

class _BaseHTMLProcessor(sgmllib.SGMLParser):
    elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
      'img', 'input', 'isindex', 'link', 'meta', 'param']

    def __init__(self, encoding):
        self.encoding = encoding
        if _debug: sys.stderr.write('entering BaseHTMLProcessor, encoding=%s\n' % self.encoding)
        sgmllib.SGMLParser.__init__(self)

    def reset(self):
        self.pieces = []
        sgmllib.SGMLParser.reset(self)

    def _shorttag_replace(self, match):
        tag = match.group(1)
        if tag in self.elements_no_end_tag:
            return '<' + tag + ' />'
        else:
            return '<' + tag + '></' + tag + '>'

    def feed(self, data):
        data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'&lt;!\1', data)
        #data = re.sub(r'<(\S+?)\s*?/>', self._shorttag_replace, data) # bug [ 1399464 ] Bad regexp for _shorttag_replace
        data = re.sub(r'<([^<\s]+?)\s*/>', self._shorttag_replace, data)
        data = data.replace('&#39;', "'")
        data = data.replace('&#34;', '"')
        if self.encoding and type(data) == type(u''):
            data = data.encode(self.encoding)
        sgmllib.SGMLParser.feed(self, data)

    def normalize_attrs(self, attrs):
        # utility method to be called by descendants
        attrs = [(k.lower(), v) for k, v in attrs]
        attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
        return attrs

    def unknown_starttag(self, tag, attrs):
        # called for each start tag
        # attrs is a list of (attr, value) tuples
        # e.g. for <pre class='screen'>, tag='pre', attrs=[('class', 'screen')]
        if _debug: sys.stderr.write('_BaseHTMLProcessor, unknown_starttag, tag=%s\n' % tag)
        uattrs = []
        # thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds
        for key, value in attrs:
            if type(value) != type(u''):
                value = unicode(value, self.encoding)
            uattrs.append((unicode(key, self.encoding), value))
        strattrs = u''.join([u' %s="%s"' % (key, value) for key, value in uattrs]).encode(self.encoding)
        if tag in self.elements_no_end_tag:
            self.pieces.append('<%(tag)s%(strattrs)s />' % locals())
        else:
            self.pieces.append('<%(tag)s%(strattrs)s>' % locals())

    def unknown_endtag(self, tag):
        # called for each end tag, e.g. for </pre>, tag will be 'pre'
        # Reconstruct the original end tag.
        if tag not in self.elements_no_end_tag:
            self.pieces.append("</%(tag)s>" % locals())

    def handle_charref(self, ref):
        # called for each character reference, e.g. for '&#160;', ref will be '160'
        # Reconstruct the original character reference.
        self.pieces.append('&#%(ref)s;' % locals())

    def handle_entityref(self, ref):
        # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
        # Reconstruct the original entity reference.
        self.pieces.append('&%(ref)s;' % locals())

    def handle_data(self, text):
        # called for each block of plain text, i.e. outside of any tag and
        # not containing any character or entity references
        # Store the original text verbatim.
        if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_text, text=%s\n' % text)
        self.pieces.append(text)

    def handle_comment(self, text):
        # called for each HTML comment, e.g. <!-- insert Javascript code here -->
        # Reconstruct the original comment.
        self.pieces.append('<!--%(text)s-->' % locals())

    def handle_pi(self, text):
        # called for each processing instruction, e.g. <?instruction>
        # Reconstruct original processing instruction.
        self.pieces.append('<?%(text)s>' % locals())

    def handle_decl(self, text):
        # called for the DOCTYPE, if present, e.g.
        # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
        #     "http://www.w3.org/TR/html4/loose.dtd">
        # Reconstruct original DOCTYPE
        self.pieces.append('<!%(text)s>' % locals())

    _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
    def _scan_name(self, i, declstartpos):
        rawdata = self.rawdata
        n = len(rawdata)
        if i == n:
            return None, -1
        m = self._new_declname_match(rawdata, i)
        if m:
            s = m.group()
            name = s.strip()
            if (i + len(s)) == n:
                return None, -1 # end of buffer
            return name.lower(), m.end()
        else:
            self.handle_data(rawdata)
#            self.updatepos(declstartpos, i)
            return None, -1

    def output(self):
        '''Return processed HTML as a single string'''
        return ''.join([str(p) for p in self.pieces])
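
# Illustrative sketch (not part of the original module): _BaseHTMLProcessor
# round-trips HTML through sgmllib, normalizing empty elements along the way.
#
#   p = _BaseHTMLProcessor('utf-8')
#   p.feed('<img src="a.png"/><p>hello</p>')
#   print p.output()   # '<img src="a.png" /><p>hello</p>'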

class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor):
    def __init__(self, baseuri, baselang, encoding):
        sgmllib.SGMLParser.__init__(self)
        _FeedParserMixin.__init__(self, baseuri, baselang, encoding)

    def decodeEntities(self, element, data):
        data = data.replace('&#60;', '&lt;')
        data = data.replace('&#x3C;', '&lt;')
        data = data.replace('&#62;', '&gt;')
        data = data.replace('&#x3E;', '&gt;')
        data = data.replace('&#38;', '&amp;')
        data = data.replace('&#x26;', '&amp;')
        data = data.replace('&#34;', '&quot;')
        data = data.replace('&#x22;', '&quot;')
        data = data.replace('&#39;', '&apos;')
        data = data.replace('&#x27;', '&apos;')
        if self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
            data = data.replace('&lt;', '<')
            data = data.replace('&gt;', '>')
            data = data.replace('&amp;', '&')
            data = data.replace('&quot;', '"')
            data = data.replace('&apos;', "'")
        return data
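
# Illustrative sketch (not part of the original module): in the loose parser,
# escaped markup inside non-XML content is decoded one level, so a feed that
# escaped its embedded HTML comes out as real markup.
#
#   p = _LooseFeedParser('', None, 'utf-8')
#   p.contentparams = FeedParserDict({'type': 'text/html'})
#   p.decodeEntities('summary', '&lt;b&gt;bold&lt;/b&gt;')   # -> '<b>bold</b>'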

class _RelativeURIResolver(_BaseHTMLProcessor):
    relative_uris = [('a', 'href'),
                     ('applet', 'codebase'),
                     ('blockquote', 'cite'),
                     ('body', 'background'),
                     ('frame', 'longdesc'),
                     ('iframe', 'longdesc'),
                     ('head', 'profile'),
                     ('img', 'longdesc'),
                     ('input', 'usemap'),
                     ('object', 'classid'),
                     ('object', 'codebase'),
                     ('object', 'usemap')]

    def __init__(self, baseuri, encoding):
        _BaseHTMLProcessor.__init__(self, encoding)
        self.baseuri = baseuri

    def resolveURI(self, uri):
        return _urljoin(self.baseuri, uri)

    def unknown_starttag(self, tag, attrs):
        attrs = self.normalize_attrs(attrs)
        attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs]
        _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)

def _resolveRelativeURIs(htmlSource, baseURI, encoding):
    if _debug: sys.stderr.write('entering _resolveRelativeURIs\n')
    p = _RelativeURIResolver(baseURI, encoding)
    p.feed(htmlSource)
    return p.output()
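
# Illustrative sketch (not part of the original module): relative links in
# embedded markup are rewritten against the feed's base URI.
#
#   print _resolveRelativeURIs('<a href="/about">about</a>',
#                              'http://example.com/feed/', 'utf-8')
#   # '<a href="http://example.com/about">about</a>'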

class _HTMLSanitizer(_BaseHTMLProcessor):
    acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', 'big',
      'blockquote', 'br', 'button', 'caption', 'center', 'cite', 'code', 'col',
      'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'fieldset',
      'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input',
      'ins', 'kbd', 'label', 'legend', 'li', 'map', 'menu', 'ol', 'optgroup',
      'option', 'p', 'pre', 'q', 's', 'samp', 'select', 'small', 'span', 'strike',
      'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'tfoot', 'th',
      'thead', 'tr', 'tt', 'u', 'ul', 'var']

    acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
      'action', 'align', 'alt', 'axis', 'border', 'cellpadding', 'cellspacing',
      'char', 'charoff', 'charset', 'checked', 'cite', 'class', 'clear', 'cols',
      'colspan', 'color', 'compact', 'coords', 'datetime', 'dir', 'disabled',
      'enctype', 'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace',
      'id', 'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method',
      'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly',
      'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
      'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title', 'type',
      'usemap', 'valign', 'value', 'vspace', 'width']

    unacceptable_elements_with_end_tag = ['script', 'applet']

    def reset(self):
        _BaseHTMLProcessor.reset(self)
        self.unacceptablestack = 0

    def unknown_starttag(self, tag, attrs):
        if not tag in self.acceptable_elements:
            if tag in self.unacceptable_elements_with_end_tag:
                self.unacceptablestack += 1
            return
        attrs = self.normalize_attrs(attrs)
        attrs = [(key, value) for key, value in attrs if key in self.acceptable_attributes]
        _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)

    def unknown_endtag(self, tag):
        if not tag in self.acceptable_elements:
            if tag in self.unacceptable_elements_with_end_tag:
                self.unacceptablestack -= 1
            return
        _BaseHTMLProcessor.unknown_endtag(self, tag)

    def handle_pi(self, text):
        pass

    def handle_decl(self, text):
        pass

    def handle_data(self, text):
        if not self.unacceptablestack:
            _BaseHTMLProcessor.handle_data(self, text)

def _sanitizeHTML(htmlSource, encoding):
    p = _HTMLSanitizer(encoding)
    p.feed(htmlSource)
    data = p.output()
    if TIDY_MARKUP:
        # loop through list of preferred Tidy interfaces looking for one that's installed,
        # then set up a common _tidy function to wrap the interface-specific API.
        _tidy = None
        for tidy_interface in PREFERRED_TIDY_INTERFACES:
            try:
                if tidy_interface == "uTidy":
                    from tidy import parseString as _utidy
                    def _tidy(data, **kwargs):
                        return str(_utidy(data, **kwargs))
                    break
                elif tidy_interface == "mxTidy":
                    from mx.Tidy import Tidy as _mxtidy
                    def _tidy(data, **kwargs):
                        nerrors, nwarnings, data, errordata = _mxtidy.tidy(data, **kwargs)
                        return data
                    break
            except:
                pass
        if _tidy:
            utf8 = type(data) == type(u'')
            if utf8:
                data = data.encode('utf-8')
            data = _tidy(data, output_xhtml=1, numeric_entities=1, wrap=0, char_encoding="utf8")
            if utf8:
                data = unicode(data, 'utf-8')
            if data.count('<body'):
                data = data.split('<body', 1)[1]
                if data.count('>'):
                    data = data.split('>', 1)[1]
            if data.count('</body'):
                data = data.split('</body', 1)[0]
    data = data.strip().replace('\r\n', '\n')
    return data
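
# Illustrative sketch (not part of the original module): elements not on the
# whitelist are dropped, and script/applet bodies are suppressed entirely.
#
#   print _sanitizeHTML('<p>hi</p><script>alert("evil")</script>', 'utf-8')
#   # '<p>hi</p>'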

class _FeedURLHandler(urllib2.HTTPDigestAuthHandler, urllib2.HTTPRedirectHandler, urllib2.HTTPDefaultErrorHandler):
    def http_error_default(self, req, fp, code, msg, headers):
        if ((code / 100) == 3) and (code != 304):
            return self.http_error_302(req, fp, code, msg, headers)
        infourl = urllib.addinfourl(fp, headers, req.get_full_url())
        infourl.status = code
        return infourl

    def http_error_302(self, req, fp, code, msg, headers):
        if headers.dict.has_key('location'):
            infourl = urllib2.HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers)
        else:
            infourl = urllib.addinfourl(fp, headers, req.get_full_url())
        if not hasattr(infourl, 'status'):
            infourl.status = code
        return infourl

    def http_error_301(self, req, fp, code, msg, headers):
        if headers.dict.has_key('location'):
            infourl = urllib2.HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, headers)
        else:
            infourl = urllib.addinfourl(fp, headers, req.get_full_url())
        if not hasattr(infourl, 'status'):
            infourl.status = code
        return infourl

    http_error_300 = http_error_302
    http_error_303 = http_error_302
    http_error_307 = http_error_302

    def http_error_401(self, req, fp, code, msg, headers):
        # Check if
        # - server requires digest auth, AND
        # - we tried (unsuccessfully) with basic auth, AND
        # - we're using Python 2.3.3 or later (digest auth is irreparably broken in earlier versions)
        # If all conditions hold, parse authentication information
        # out of the Authorization header we sent the first time
        # (for the username and password) and the WWW-Authenticate
        # header the server sent back (for the realm) and retry
        # the request with the appropriate digest auth headers instead.
        # This evil genius hack has been brought to you by Aaron Swartz.
        host = urlparse.urlparse(req.get_full_url())[1]
        try:
            assert sys.version.split()[0] >= '2.3.3'
            assert base64 != None
            user, passw = base64.decodestring(req.headers['Authorization'].split(' ')[1]).split(':')
            realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0]
            self.add_password(realm, host, user, passw)
            retry = self.http_error_auth_reqed('www-authenticate', host, req, headers)
            self.reset_retry_count()
            return retry
        except:
            return self.http_error_default(req, fp, code, msg, headers)
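
# Illustrative sketch (not part of the original module): extra urllib2
# handlers (proxies, authentication, ...) are combined with _FeedURLHandler
# when a feed is fetched; parse() passes them through in its handlers
# argument. The proxy URL below is hypothetical.
#
#   proxy = urllib2.ProxyHandler({'http': 'http://proxy.example.com:3128'})
#   d = parse('http://feedparser.org/docs/examples/atom10.xml', handlers=[proxy])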

def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers):
    """URL, filename, or string --> stream

    This function lets you define parsers that take any input source
    (URL, pathname to local or network file, or actual data as a string)
    and deal with it in a uniform manner. Returned object is guaranteed
    to have all the basic stdio read methods (read, readline, readlines).
    Just .close() the object when you're done with it.

    If the etag argument is supplied, it will be used as the value of an
    If-None-Match request header.

    If the modified argument is supplied, it must be a tuple of 9 integers
    as returned by gmtime() in the standard Python time module. This MUST
    be in GMT (Greenwich Mean Time). The formatted date/time will be used
    as the value of an If-Modified-Since request header.

    If the agent argument is supplied, it will be used as the value of a
    User-Agent request header.

    If the referrer argument is supplied, it will be used as the value of a
    Referer[sic] request header.

    If handlers is supplied, it is a list of handlers used to build a
    urllib2 opener.
    """
1770
if hasattr(url_file_stream_or_string, 'read'):
1771
return url_file_stream_or_string
1773
if url_file_stream_or_string == '-':
1776
if urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp'):
1779
# test for inline user:password for basic auth
1782
urltype, rest = urllib.splittype(url_file_stream_or_string)
1783
realhost, rest = urllib.splithost(rest)
1785
user_passwd, realhost = urllib.splituser(realhost)
1787
url_file_stream_or_string = '%s://%s%s' % (urltype, realhost, rest)
1788
auth = base64.encodestring(user_passwd).strip()
1789
# try to open with urllib2 (to use optional headers)
1790
request = urllib2.Request(url_file_stream_or_string)
1791
request.add_header('User-Agent', agent)
1793
request.add_header('If-None-Match', etag)
1795
# format into an RFC 1123-compliant timestamp. We can't use
1796
# time.strftime() since the %a and %b directives can be affected
1797
# by the current locale, but RFC 2616 states that dates must be
1799
short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
1800
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
1801
request.add_header('If-Modified-Since', '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5]))
1803
request.add_header('Referer', referrer)
1805
request.add_header('Accept-encoding', 'gzip, deflate')
1807
request.add_header('Accept-encoding', 'gzip')
1809
request.add_header('Accept-encoding', 'deflate')
1811
request.add_header('Accept-encoding', '')
1813
request.add_header('Authorization', 'Basic %s' % auth)
1815
request.add_header('Accept', ACCEPT_HEADER)
1816
request.add_header('A-IM', 'feed') # RFC 3229 support
1817
opener = apply(urllib2.build_opener, tuple([_FeedURLHandler()] + handlers))
1818
opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent
1820
return opener.open(request)
1822
opener.close() # JohnD
1824
# try to open with native open function (if url_file_stream_or_string is a filename)
1826
return open(url_file_stream_or_string)
1830
# treat url_file_stream_or_string as string
1831
return _StringIO(str(url_file_stream_or_string))
1834
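# Example (an illustrative sketch): _open_resource() accepts a URL, a local
# filename, an already-open file-like object, or the document itself as a
# string, and always hands back something with the stdio read methods:
#
#     f = _open_resource('http://feedparser.org/docs/examples/atom10.xml',
#                        None, None, USER_AGENT, None, [])
#     data = f.read()
#     f.close()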
_date_handlers = []
def registerDateHandler(func):
    '''Register a date handler function (takes string, returns 9-tuple date in GMT)'''
    _date_handlers.insert(0, func)
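# Example (an illustrative sketch): callers can teach the parser new date
# formats at runtime.  A handler takes the raw string and returns a 9-tuple
# in GMT, or None if it doesn't recognize the format.  The 'YYYY.MM.DD'
# format and handler name below are hypothetical:
#
#     import calendar
#     def _parse_date_dotted(dateString):
#         m = re.match(r'(\d{4})\.(\d{2})\.(\d{2})$', dateString)
#         if not m: return None
#         return time.gmtime(calendar.timegm((int(m.group(1)),
#             int(m.group(2)), int(m.group(3)), 0, 0, 0, 0, 0, 0)))
#     registerDateHandler(_parse_date_dotted)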
# ISO-8601 date parsing routines written by Fazal Majid.
# The ISO 8601 standard is very convoluted and irregular - a full ISO 8601
# parser is beyond the scope of feedparser and would be a worthwhile addition
# to the Python library.
# A single regular expression cannot parse ISO 8601 date formats into groups
# as the standard is highly irregular (for instance, 030104 is ambiguous
# between 2003-01-04 and 0301-04-01), so we use templates instead.
# Please note the order in templates is significant because we need a
# greedy match.
_iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-MM', 'YYYY-?OOO',
                'YY-?MM-?DD', 'YY-?OOO', 'YYYY',
                '-YY-?MM', '-OOO', '-YY',
                '--MM-?DD', '--MM',
                '---DD',
                'CC', '']
_iso8601_re = [
    tmpl.replace(
    'YYYY', r'(?P<year>\d{4})').replace(
    'YY', r'(?P<year>\d\d)').replace(
    'MM', r'(?P<month>[01]\d)').replace(
    'DD', r'(?P<day>[0123]\d)').replace(
    'OOO', r'(?P<ordinal>[0123]\d\d)').replace(
    'CC', r'(?P<century>\d\d$)')
    + r'(T?(?P<hour>\d{2}):(?P<minute>\d{2})'
    + r'(:(?P<second>\d{2}))?'
    + r'(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?'
    for tmpl in _iso8601_tmpl]
_iso8601_matches = [re.compile(regex).match for regex in _iso8601_re]
def _parse_date_iso8601(dateString):
    '''Parse a variety of ISO-8601-compatible formats like 20040105'''
    m = None
    for _iso8601_match in _iso8601_matches:
        m = _iso8601_match(dateString)
        if m: break
    if not m: return
    if m.span() == (0, 0): return
    params = m.groupdict()
    ordinal = params.get('ordinal', 0)
    if ordinal:
        ordinal = int(ordinal)
    else:
        ordinal = 0
    year = params.get('year', '--')
    if not year or year == '--':
        year = time.gmtime()[0]
    elif len(year) == 2:
        # ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993
        year = 100 * int(time.gmtime()[0] / 100) + int(year)
    else:
        year = int(year)
    month = params.get('month', '-')
    if not month or month == '-':
        # ordinals are NOT normalized by mktime, we simulate them
        # by setting month=1, day=ordinal
        if ordinal:
            month = 1
        else:
            month = time.gmtime()[1]
    month = int(month)
    day = params.get('day', 0)
    if not day:
        # see above
        if ordinal:
            day = ordinal
        elif params.get('century', 0) or \
            params.get('year', 0) or params.get('month', 0):
            day = 1
        else:
            day = time.gmtime()[2]
    else:
        day = int(day)
    # special case of the century - is the first year of the 21st century
    # 2000 or 2001 ? The debate goes on...
    if 'century' in params.keys():
        year = (int(params['century']) - 1) * 100 + 1
    # in ISO 8601 most fields are optional
    for field in ['hour', 'minute', 'second', 'tzhour', 'tzmin']:
        if not params.get(field, None):
            params[field] = 0
    hour = int(params.get('hour', 0))
    minute = int(params.get('minute', 0))
    second = int(params.get('second', 0))
    # weekday is normalized by mktime(), we can ignore it
    weekday = 0
    # daylight savings is complex, but not needed for feedparser's purposes
    # as time zones, if specified, include mention of whether it is active
    # (e.g. PST vs. PDT, CET).  Using -1 is implementation-dependent and
    # most implementations have DST bugs
    daylight_savings_flag = 0
    tm = [year, month, day, hour, minute, second, weekday,
          ordinal, daylight_savings_flag]
    # ISO 8601 time zone adjustments
    tz = params.get('tz')
    if tz and tz != 'Z':
        if tz[0] == '-':
            tm[3] += int(params.get('tzhour', 0))
            tm[4] += int(params.get('tzmin', 0))
        elif tz[0] == '+':
            tm[3] -= int(params.get('tzhour', 0))
            tm[4] -= int(params.get('tzmin', 0))
        else:
            return None
    # Python's time.mktime() is a wrapper around the ANSI C mktime(3c)
    # which is guaranteed to normalize d/m/y/h/m/s.
    # Many implementations have bugs, but we'll pretend they don't.
    return time.localtime(time.mktime(tm))
registerDateHandler(_parse_date_iso8601)
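# Examples of strings the templates above accept, all normalized to a GMT
# 9-tuple (shown for illustration, not exhaustive):
#
#     _parse_date_iso8601('2003-12-31T10:14:55Z')  # full W3C-style date/time
#     _parse_date_iso8601('20040105')              # compact YYYYMMDD
#     _parse_date_iso8601('2003-335')              # ordinal date (day 335 of 2003)
#     _parse_date_iso8601('03-12-31')              # two-digit year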
# 8-bit date handling routines written by ytrewq1.
_korean_year  = u'\ub144' # b3e2 in euc-kr
_korean_month = u'\uc6d4' # bff9 in euc-kr
_korean_day   = u'\uc77c' # c0cf in euc-kr
_korean_am    = u'\uc624\uc804' # bfc0 c0fc in euc-kr
_korean_pm    = u'\uc624\ud6c4' # bfc0 c8c4 in euc-kr

_korean_onblog_date_re = \
    re.compile('(\d{4})%s\s+(\d{2})%s\s+(\d{2})%s\s+(\d{2}):(\d{2}):(\d{2})' % \
               (_korean_year, _korean_month, _korean_day))
_korean_nate_date_re = \
    re.compile(u'(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})' % \
               (_korean_am, _korean_pm))
def _parse_date_onblog(dateString):
    '''Parse a string according to the OnBlog 8-bit date format'''
    m = _korean_onblog_date_re.match(dateString)
    if not m: return
    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
                {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
                 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\
                 'zonediff': '+09:00'}
    if _debug: sys.stderr.write('OnBlog date parsed as: %s\n' % w3dtfdate)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_onblog)

def _parse_date_nate(dateString):
    '''Parse a string according to the Nate 8-bit date format'''
    m = _korean_nate_date_re.match(dateString)
    if not m: return
    hour = int(m.group(5))
    ampm = m.group(4)
    if (ampm == _korean_pm):
        hour += 12
    hour = str(hour)
    if len(hour) == 1:
        hour = '0' + hour
    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
                {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
                 'hour': hour, 'minute': m.group(6), 'second': m.group(7),\
                 'zonediff': '+09:00'}
    if _debug: sys.stderr.write('Nate date parsed as: %s\n' % w3dtfdate)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_nate)
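# Both 8-bit Korean handlers above work the same way (shown for illustration):
# they rewrite the localized string into a W3DTF string with an explicit
# +09:00 (KST) offset and delegate to _parse_date_w3dtf.  For example, the
# OnBlog form
#
#     u'2004\ub144 05\uc6d4 28\uc77c 01:31:15'
#
# is rewritten to '2004-05-28T01:31:15+09:00' before parsing.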
_mssql_date_re = \
    re.compile('(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})(\.\d+)?')
def _parse_date_mssql(dateString):
    '''Parse a string according to the MS SQL date format'''
    m = _mssql_date_re.match(dateString)
    if not m: return
    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
                {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
                 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\
                 'zonediff': '+09:00'}
    if _debug: sys.stderr.write('MS SQL date parsed as: %s\n' % w3dtfdate)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_mssql)
# Unicode strings for Greek date strings
_greek_months = \
  { \
   u'\u0399\u03b1\u03bd': u'Jan', # c9e1ed in iso-8859-7
   u'\u03a6\u03b5\u03b2': u'Feb', # d6e5e2 in iso-8859-7
   u'\u039c\u03ac\u03ce': u'Mar', # ccdcfe in iso-8859-7
   u'\u039c\u03b1\u03ce': u'Mar', # cce1fe in iso-8859-7
   u'\u0391\u03c0\u03c1': u'Apr', # c1f0f1 in iso-8859-7
   u'\u039c\u03ac\u03b9': u'May', # ccdce9 in iso-8859-7
   u'\u039c\u03b1\u03ca': u'May', # cce1fa in iso-8859-7
   u'\u039c\u03b1\u03b9': u'May', # cce1e9 in iso-8859-7
   u'\u0399\u03bf\u03cd\u03bd': u'Jun', # c9effded in iso-8859-7
   u'\u0399\u03bf\u03bd': u'Jun', # c9efed in iso-8859-7
   u'\u0399\u03bf\u03cd\u03bb': u'Jul', # c9effdeb in iso-8859-7
   u'\u0399\u03bf\u03bb': u'Jul', # c9f9eb in iso-8859-7
   u'\u0391\u03cd\u03b3': u'Aug', # c1fde3 in iso-8859-7
   u'\u0391\u03c5\u03b3': u'Aug', # c1f5e3 in iso-8859-7
   u'\u03a3\u03b5\u03c0': u'Sep', # d3e5f0 in iso-8859-7
   u'\u039f\u03ba\u03c4': u'Oct', # cfeaf4 in iso-8859-7
   u'\u039d\u03bf\u03ad': u'Nov', # cdefdd in iso-8859-7
   u'\u039d\u03bf\u03b5': u'Nov', # cdefe5 in iso-8859-7
   u'\u0394\u03b5\u03ba': u'Dec', # c4e5ea in iso-8859-7
  }

_greek_wdays = \
  { \
   u'\u039a\u03c5\u03c1': u'Sun', # caf5f1 in iso-8859-7
   u'\u0394\u03b5\u03c5': u'Mon', # c4e5f5 in iso-8859-7
   u'\u03a4\u03c1\u03b9': u'Tue', # d4f1e9 in iso-8859-7
   u'\u03a4\u03b5\u03c4': u'Wed', # d4e5f4 in iso-8859-7
   u'\u03a0\u03b5\u03bc': u'Thu', # d0e5ec in iso-8859-7
   u'\u03a0\u03b1\u03c1': u'Fri', # d0e1f1 in iso-8859-7
   u'\u03a3\u03b1\u03b2': u'Sat', # d3e1e2 in iso-8859-7
  }

_greek_date_format_re = \
    re.compile(u'([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)')

def _parse_date_greek(dateString):
    '''Parse a string according to a Greek 8-bit date format.'''
    m = _greek_date_format_re.match(dateString)
    if not m: return
    try:
        wday = _greek_wdays[m.group(1)]
        month = _greek_months[m.group(3)]
    except:
        return
    rfc822date = '%(wday)s, %(day)s %(month)s %(year)s %(hour)s:%(minute)s:%(second)s %(zonediff)s' % \
                 {'wday': wday, 'day': m.group(2), 'month': month, 'year': m.group(4),\
                  'hour': m.group(5), 'minute': m.group(6), 'second': m.group(7),\
                  'zonediff': m.group(8)}
    if _debug: sys.stderr.write('Greek date parsed as: %s\n' % rfc822date)
    return _parse_date_rfc822(rfc822date)
registerDateHandler(_parse_date_greek)
# Unicode strings for Hungarian date strings
_hungarian_months = \
  { \
    u'janu\u00e1r':   u'01',  # e1 in iso-8859-2
    u'febru\u00e1ri': u'02',  # e1 in iso-8859-2
    u'm\u00e1rcius':  u'03',  # e1 in iso-8859-2
    u'\u00e1prilis':  u'04',  # e1 in iso-8859-2
    u'm\u00e1ujus':   u'05',  # e1 in iso-8859-2
    u'j\u00fanius':   u'06',  # fa in iso-8859-2
    u'j\u00falius':   u'07',  # fa in iso-8859-2
    u'augusztus':     u'08',
    u'szeptember':    u'09',
    u'okt\u00f3ber':  u'10',  # f3 in iso-8859-2
    u'november':      u'11',
    u'december':      u'12',
  }

_hungarian_date_format_re = \
  re.compile(u'(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})((\+|-)(\d{,2}:\d{2}))')

def _parse_date_hungarian(dateString):
    '''Parse a string according to a Hungarian 8-bit date format.'''
    m = _hungarian_date_format_re.match(dateString)
    if not m: return
    try:
        month = _hungarian_months[m.group(2)]
        day = m.group(3)
        if len(day) == 1:
            day = '0' + day
        hour = m.group(4)
        if len(hour) == 1:
            hour = '0' + hour
    except:
        return
    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s%(zonediff)s' % \
                {'year': m.group(1), 'month': month, 'day': day,\
                 'hour': hour, 'minute': m.group(5),\
                 'zonediff': m.group(6)}
    if _debug: sys.stderr.write('Hungarian date parsed as: %s\n' % w3dtfdate)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_hungarian)
# W3DTF-style date parsing adapted from PyXML xml.utils.iso8601, written by
# Drake and licensed under the Python license.  Removed all range checking
# for month, day, hour, minute, and second, since mktime will normalize
# these later
def _parse_date_w3dtf(dateString):
    def __extract_date(m):
        year = int(m.group('year'))
        if year < 100:
            year = 100 * int(time.gmtime()[0] / 100) + int(year)
        if year < 1000:
            return 0, 0, 0
        julian = m.group('julian')
        if julian:
            julian = int(julian)
            month = julian / 30 + 1
            day = julian % 30 + 1
            jday = None
            while jday != julian:
                t = time.mktime((year, month, day, 0, 0, 0, 0, 0, 0))
                jday = time.gmtime(t)[-2]
                diff = abs(jday - julian)
                if jday > julian:
                    if diff < day:
                        day = day - diff
                    else:
                        month = month - 1
                        day = 31
                elif jday < julian:
                    if day + diff < 28:
                        day = day + diff
                    else:
                        month = month + 1
            return year, month, day
        month = m.group('month')
        day = 1
        if month is None:
            month = 1
        else:
            month = int(month)
            day = m.group('day')
            if day:
                day = int(day)
            else:
                day = 1
        return year, month, day

    def __extract_time(m):
        if not m:
            return 0, 0, 0
        hours = m.group('hours')
        if not hours:
            return 0, 0, 0
        hours = int(hours)
        minutes = int(m.group('minutes'))
        seconds = m.group('seconds')
        if seconds:
            seconds = int(seconds)
        else:
            seconds = 0
        return hours, minutes, seconds

    def __extract_tzd(m):
        '''Return the Time Zone Designator as an offset in seconds from UTC.'''
        if not m:
            return 0
        tzd = m.group('tzd')
        if not tzd:
            return 0
        if tzd == 'Z':
            return 0
        hours = int(m.group('tzdhours'))
        minutes = m.group('tzdminutes')
        if minutes:
            minutes = int(minutes)
        else:
            minutes = 0
        offset = (hours*60 + minutes) * 60
        if tzd[0] == '+':
            return -offset
        return offset

    __date_re = ('(?P<year>\d\d\d\d)'
                 '(?:(?P<dsep>-|)'
                 '(?:(?P<julian>\d\d\d)'
                 '|(?P<month>\d\d)(?:(?P=dsep)(?P<day>\d\d))?))?')
    __tzd_re = '(?P<tzd>[-+](?P<tzdhours>\d\d)(?::?(?P<tzdminutes>\d\d))|Z)'
    __tzd_rx = re.compile(__tzd_re)
    __time_re = ('(?P<hours>\d\d)(?P<tsep>:|)(?P<minutes>\d\d)'
                 '(?:(?P=tsep)(?P<seconds>\d\d(?:[.,]\d+)?))?'
                 + __tzd_re)
    __datetime_re = '%s(?:T%s)?' % (__date_re, __time_re)
    __datetime_rx = re.compile(__datetime_re)
    m = __datetime_rx.match(dateString)
    if (m is None) or (m.group() != dateString): return
    gmt = __extract_date(m) + __extract_time(m) + (0, 0, 0)
    if gmt[0] == 0: return
    return time.gmtime(time.mktime(gmt) + __extract_tzd(m) - time.timezone)
registerDateHandler(_parse_date_w3dtf)
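# Examples of W3DTF/ISO 8601 profiles the function above accepts (shown for
# illustration, not exhaustive):
#
#     _parse_date_w3dtf('2003-12-31T10:14:55Z')       # UTC
#     _parse_date_w3dtf('2003-12-31T10:14:55-08:00')  # explicit timezone designator
#     _parse_date_w3dtf('2003-12-31')                 # date only
#     _parse_date_w3dtf('2003')                       # year only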
def _parse_date_rfc822(dateString):
    '''Parse an RFC822, RFC1123, RFC2822, or asctime-style date'''
    data = dateString.split()
    if data[0][-1] in (',', '.') or data[0].lower() in rfc822._daynames:
        del data[0]
    if len(data) == 4:
        s = data[3]
        i = s.find('+')
        if i > 0:
            data[3:] = [s[:i], s[i+1:]]
        else:
            data.append('')
        dateString = " ".join(data)
    if len(data) < 5:
        dateString += ' 00:00:00 GMT'
    tm = rfc822.parsedate_tz(dateString)
    if tm:
        return time.gmtime(rfc822.mktime_tz(tm))
# rfc822.py defines several time zones, but we define some extra ones.
# 'ET' is equivalent to 'EST', etc.
_additional_timezones = {'AT': -400, 'ET': -500, 'CT': -600, 'MT': -700, 'PT': -800}
rfc822._timezones.update(_additional_timezones)
registerDateHandler(_parse_date_rfc822)
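# Examples (illustrative): the extra zone names and the 00:00:00 fallback
# above make dates like these parseable:
#
#     _parse_date_rfc822('Thu, 01 Jan 2004 19:48:21 ET')  # 'ET' treated as EST
#     _parse_date_rfc822('01 Jan 2004')  # no time; normalized to 00:00:00 GMT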
def _parse_date(dateString):
    '''Parses a variety of date formats into a 9-tuple in GMT'''
    for handler in _date_handlers:
        try:
            date9tuple = handler(dateString)
            if not date9tuple: continue
            if len(date9tuple) != 9:
                if _debug: sys.stderr.write('date handler function must return 9-tuple\n')
                raise ValueError
            map(int, date9tuple)
            return date9tuple
        except Exception, e:
            if _debug: sys.stderr.write('%s raised %s\n' % (handler.__name__, repr(e)))
            pass
    return None
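# Example (illustrative): _parse_date is the single entry point.  It walks
# the registered handlers -- most recently registered first, because
# registerDateHandler inserts at position 0 -- until one returns a valid
# 9-tuple:
#
#     _parse_date('Thu, 01 Jan 2004 19:48:21 GMT')  # RFC 822 handler matches
#     _parse_date('2004-01-01T19:48:21+00:00')      # W3DTF handler matches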
def _getCharacterEncoding(http_headers, xml_data):
    '''Get the character encoding of the XML document

    http_headers is a dictionary
    xml_data is a raw string (not Unicode)

    This is so much trickier than it sounds, it's not even funny.
    According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type
    is application/xml, application/*+xml,
    application/xml-external-parsed-entity, or application/xml-dtd,
    the encoding given in the charset parameter of the HTTP Content-Type
    takes precedence over the encoding given in the XML prefix within the
    document, and defaults to 'utf-8' if neither are specified.  But, if
    the HTTP Content-Type is text/xml, text/*+xml, or
    text/xml-external-parsed-entity, the encoding given in the XML prefix
    within the document is ALWAYS IGNORED and only the encoding given in
    the charset parameter of the HTTP Content-Type header should be
    respected, and it defaults to 'us-ascii' if not specified.

    Furthermore, discussion on the atom-syntax mailing list with the
    author of RFC 3023 leads me to the conclusion that any document
    served with a Content-Type of text/* and no charset parameter
    must be treated as us-ascii.  (We now do this.)  And also that it
    must always be flagged as non-well-formed.  (We now do this too.)

    If Content-Type is unspecified (input was local file or non-HTTP source)
    or unrecognized (server just got it totally wrong), then go by the
    encoding given in the XML prefix of the document and default to
    'iso-8859-1' as per the HTTP specification (RFC 2616).

    Then, assuming we didn't find a character encoding in the HTTP headers
    (and the HTTP Content-type allowed us to look in the body), we need
    to sniff the first few bytes of the XML data and try to determine
    whether the encoding is ASCII-compatible.  Section F of the XML
    specification shows the way here:
    http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info

    If the sniffed encoding is not ASCII-compatible, we need to make it
    ASCII compatible so that we can sniff further into the XML declaration
    to find the encoding attribute, which will tell us the true encoding.

    Of course, none of this guarantees that we will be able to parse the
    feed in the declared character encoding (assuming it was declared
    correctly, which many are not).  CJKCodecs and iconv_codec help a lot;
    you should definitely install them if you can.
    http://cjkpython.i18n.org/
    '''

    def _parseHTTPContentType(content_type):
        '''takes HTTP Content-Type header and returns (content type, charset)

        If no charset is specified, returns (content type, '')
        If no content type is specified, returns ('', '')
        Both return parameters are guaranteed to be lowercase strings
        '''
        content_type = content_type or ''
        content_type, params = cgi.parse_header(content_type)
        return content_type, params.get('charset', '').replace("'", '')

    sniffed_xml_encoding = ''
    xml_encoding = ''
    true_encoding = ''
    http_content_type, http_encoding = _parseHTTPContentType(http_headers.get('content-type'))
    # Must sniff for non-ASCII-compatible character encodings before
    # searching for XML declaration.  This heuristic is defined in
    # section F of the XML specification:
    # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
    try:
        if xml_data[:4] == '\x4c\x6f\xa7\x94':
            # EBCDIC
            xml_data = _ebcdic_to_ascii(xml_data)
        elif xml_data[:4] == '\x00\x3c\x00\x3f':
            # UTF-16BE
            sniffed_xml_encoding = 'utf-16be'
            xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
        elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') and (xml_data[2:4] != '\x00\x00'):
            # UTF-16BE with BOM
            sniffed_xml_encoding = 'utf-16be'
            xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
        elif xml_data[:4] == '\x3c\x00\x3f\x00':
            # UTF-16LE
            sniffed_xml_encoding = 'utf-16le'
            xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
        elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and (xml_data[2:4] != '\x00\x00'):
            # UTF-16LE with BOM
            sniffed_xml_encoding = 'utf-16le'
            xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
        elif xml_data[:4] == '\x00\x00\x00\x3c':
            # UTF-32BE
            sniffed_xml_encoding = 'utf-32be'
            xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
        elif xml_data[:4] == '\x3c\x00\x00\x00':
            # UTF-32LE
            sniffed_xml_encoding = 'utf-32le'
            xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
        elif xml_data[:4] == '\x00\x00\xfe\xff':
            # UTF-32BE with BOM
            sniffed_xml_encoding = 'utf-32be'
            xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
        elif xml_data[:4] == '\xff\xfe\x00\x00':
            # UTF-32LE with BOM
            sniffed_xml_encoding = 'utf-32le'
            xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
        elif xml_data[:3] == '\xef\xbb\xbf':
            # UTF-8 with BOM
            sniffed_xml_encoding = 'utf-8'
            xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
        else:
            # ASCII-compatible
            pass
        xml_encoding_match = re.compile('^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
    except:
        xml_encoding_match = None
    if xml_encoding_match:
        xml_encoding = xml_encoding_match.groups()[0].lower()
        if sniffed_xml_encoding and (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16')):
            xml_encoding = sniffed_xml_encoding
    acceptable_content_type = 0
    application_content_types = ('application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity')
    text_content_types = ('text/xml', 'text/xml-external-parsed-entity')
    if (http_content_type in application_content_types) or \
       (http_content_type.startswith('application/') and http_content_type.endswith('+xml')):
        acceptable_content_type = 1
        true_encoding = http_encoding or xml_encoding or 'utf-8'
    elif (http_content_type in text_content_types) or \
         (http_content_type.startswith('text/') and http_content_type.endswith('+xml')):
        acceptable_content_type = 1
        true_encoding = http_encoding or 'us-ascii'
    elif http_content_type.startswith('text/'):
        true_encoding = http_encoding or 'us-ascii'
    elif http_headers and (not http_headers.has_key('content-type')):
        true_encoding = xml_encoding or 'iso-8859-1'
    else:
        true_encoding = xml_encoding or 'utf-8'
    return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type
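# Worked example of the RFC 3023 precedence rules above (illustrative values):
#
#     _getCharacterEncoding({'content-type': 'application/atom+xml; charset=iso-8859-1'},
#                           '<?xml version="1.0" encoding="utf-8"?>...')
#     # -> true encoding is 'iso-8859-1': for application/*+xml the HTTP
#     #    charset parameter outranks the XML declaration
#
#     _getCharacterEncoding({'content-type': 'text/xml'},
#                           '<?xml version="1.0" encoding="utf-8"?>...')
#     # -> true encoding is 'us-ascii': for text/xml without a charset
#     #    parameter the XML declaration is ignored entirely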
def _toUTF8(data, encoding):
    '''Changes an XML data stream on the fly to specify a new encoding

    data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already
    encoding is a string recognized by encodings.aliases
    '''
    if _debug: sys.stderr.write('entering _toUTF8, trying encoding %s\n' % encoding)
    # strip Byte Order Mark (if present)
    if (len(data) >= 4) and (data[:2] == '\xfe\xff') and (data[2:4] != '\x00\x00'):
        if _debug:
            sys.stderr.write('stripping BOM\n')
            if encoding != 'utf-16be':
                sys.stderr.write('trying utf-16be instead\n')
        encoding = 'utf-16be'
        data = data[2:]
    elif (len(data) >= 4) and (data[:2] == '\xff\xfe') and (data[2:4] != '\x00\x00'):
        if _debug:
            sys.stderr.write('stripping BOM\n')
            if encoding != 'utf-16le':
                sys.stderr.write('trying utf-16le instead\n')
        encoding = 'utf-16le'
        data = data[2:]
    elif data[:3] == '\xef\xbb\xbf':
        if _debug:
            sys.stderr.write('stripping BOM\n')
            if encoding != 'utf-8':
                sys.stderr.write('trying utf-8 instead\n')
        encoding = 'utf-8'
        data = data[3:]
    elif data[:4] == '\x00\x00\xfe\xff':
        if _debug:
            sys.stderr.write('stripping BOM\n')
            if encoding != 'utf-32be':
                sys.stderr.write('trying utf-32be instead\n')
        encoding = 'utf-32be'
        data = data[4:]
    elif data[:4] == '\xff\xfe\x00\x00':
        if _debug:
            sys.stderr.write('stripping BOM\n')
            if encoding != 'utf-32le':
                sys.stderr.write('trying utf-32le instead\n')
        encoding = 'utf-32le'
        data = data[4:]
    newdata = unicode(data, encoding)
    if _debug: sys.stderr.write('successfully converted %s data to unicode\n' % encoding)
    declmatch = re.compile('^<\?xml[^>]*?>')
    newdecl = '''<?xml version='1.0' encoding='utf-8'?>'''
    if declmatch.search(newdata):
        newdata = declmatch.sub(newdecl, newdata)
    else:
        newdata = newdecl + u'\n' + newdata
    return newdata.encode('utf-8')
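# Example (an illustrative sketch): a UTF-16LE document with a BOM comes out
# as UTF-8 with a rewritten declaration -- the BOM overrides the caller's
# encoding hint:
#
#     data = '\xff\xfe' + '<?xml version="1.0" encoding="utf-16"?><feed/>'.encode('utf-16le')
#     _toUTF8(data, 'utf-16')
#     # -> "<?xml version='1.0' encoding='utf-8'?><feed/>" as a UTF-8 byte string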
def _stripDoctype(data):
    '''Strips DOCTYPE from XML document, returns (rss_version, stripped_data)

    rss_version may be 'rss091n' or None
    stripped_data is the same XML document, minus the DOCTYPE
    '''
    entity_pattern = re.compile(r'<!ENTITY([^>]*?)>', re.MULTILINE)
    data = entity_pattern.sub('', data)
    doctype_pattern = re.compile(r'<!DOCTYPE([^>]*?)>', re.MULTILINE)
    doctype_results = doctype_pattern.findall(data)
    doctype = doctype_results and doctype_results[0] or ''
    if doctype.lower().count('netscape'):
        version = 'rss091n'
    else:
        version = None
    data = doctype_pattern.sub('', data)
    return version, data
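# Example (illustrative): a Netscape RSS 0.91 DOCTYPE both identifies the
# version and gets stripped:
#
#     _stripDoctype('<!DOCTYPE rss PUBLIC "-//Netscape Communications//DTD RSS 0.91//EN" '
#                   '"http://my.netscape.com/publish/formats/rss-0.91.dtd"><rss version="0.91"/>')
#     # -> ('rss091n', '<rss version="0.91"/>')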
def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=[]):
    '''Parse a feed from a URL, file, stream, or string'''
    result = FeedParserDict()
    result['feed'] = FeedParserDict()
    result['entries'] = []
    if _XML_AVAILABLE:
        result['bozo'] = 0
    if type(handlers) == types.InstanceType:
        handlers = [handlers]
    try:
        f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers)
        data = f.read()
    except Exception, e:
        result['bozo'] = 1
        result['bozo_exception'] = e
        data = ''
        f = None

    # if feed is gzip-compressed, decompress it
    if f and data and hasattr(f, 'headers'):
        if gzip and f.headers.get('content-encoding', '') == 'gzip':
            try:
                data = gzip.GzipFile(fileobj=_StringIO(data)).read()
            except Exception, e:
                # Some feeds claim to be gzipped but they're not, so
                # we get garbage.  Ideally, we should re-request the
                # feed without the 'Accept-encoding: gzip' header,
                # but we don't.
                result['bozo'] = 1
                result['bozo_exception'] = e
                data = ''
        elif zlib and f.headers.get('content-encoding', '') == 'deflate':
            try:
                data = zlib.decompress(data, -zlib.MAX_WBITS)
            except Exception, e:
                result['bozo'] = 1
                result['bozo_exception'] = e
                data = ''

    # save HTTP headers
    if hasattr(f, 'info'):
        info = f.info()
        result['etag'] = info.getheader('ETag')
        last_modified = info.getheader('Last-Modified')
        if last_modified:
            result['modified'] = _parse_date(last_modified)
    if hasattr(f, 'url'):
        result['href'] = f.url
        result['status'] = 200
    if hasattr(f, 'status'):
        result['status'] = f.status
    if hasattr(f, 'headers'):
        result['headers'] = f.headers.dict
    if hasattr(f, 'close'):
        f.close()

    # there are four encodings to keep track of:
    # - http_encoding is the encoding declared in the Content-Type HTTP header
    # - xml_encoding is the encoding declared in the <?xml declaration
    # - sniffed_encoding is the encoding sniffed from the first 4 bytes of the XML data
    # - result['encoding'] is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
    http_headers = result.get('headers', {})
    result['encoding'], http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type = \
        _getCharacterEncoding(http_headers, data)
    if http_headers and (not acceptable_content_type):
        if http_headers.has_key('content-type'):
            bozo_message = '%s is not an XML media type' % http_headers['content-type']
        else:
            bozo_message = 'no Content-type specified'
        result['bozo'] = 1
        result['bozo_exception'] = NonXMLContentType(bozo_message)

    result['version'], data = _stripDoctype(data)

    baseuri = http_headers.get('content-location', result.get('href'))
    baselang = http_headers.get('content-language', None)

    # if server sent 304, we're done
    if result.get('status', 0) == 304:
        result['version'] = ''
        result['debug_message'] = 'The feed has not changed since you last checked, ' + \
            'so the server sent no data.  This is a feature, not a bug!'
        return result

    # if there was a problem downloading, we're done
    if not data:
        return result

    # determine character encoding
    use_strict_parser = 0
    known_encoding = 0
    tried_encodings = []
    # try: HTTP encoding, declared XML encoding, encoding sniffed from BOM
    for proposed_encoding in (result['encoding'], xml_encoding, sniffed_xml_encoding):
        if not proposed_encoding: continue
        if proposed_encoding in tried_encodings: continue
        tried_encodings.append(proposed_encoding)
        try:
            data = _toUTF8(data, proposed_encoding)
            known_encoding = use_strict_parser = 1
            break
        except:
            pass
    # if no luck and we have auto-detection library, try that
    if (not known_encoding) and chardet:
        try:
            proposed_encoding = chardet.detect(data)['encoding']
            if proposed_encoding and (proposed_encoding not in tried_encodings):
                tried_encodings.append(proposed_encoding)
                data = _toUTF8(data, proposed_encoding)
                known_encoding = use_strict_parser = 1
        except:
            pass
    # if still no luck and we haven't tried utf-8 yet, try that
    if (not known_encoding) and ('utf-8' not in tried_encodings):
        try:
            proposed_encoding = 'utf-8'
            tried_encodings.append(proposed_encoding)
            data = _toUTF8(data, proposed_encoding)
            known_encoding = use_strict_parser = 1
        except:
            pass
    # if still no luck and we haven't tried windows-1252 yet, try that
    if (not known_encoding) and ('windows-1252' not in tried_encodings):
        try:
            proposed_encoding = 'windows-1252'
            tried_encodings.append(proposed_encoding)
            data = _toUTF8(data, proposed_encoding)
            known_encoding = use_strict_parser = 1
        except:
            pass
    # if still no luck, give up
    if not known_encoding:
        result['bozo'] = 1
        result['bozo_exception'] = CharacterEncodingUnknown( \
            'document encoding unknown, I tried ' + \
            '%s, %s, utf-8, and windows-1252 but nothing worked' % \
            (result['encoding'], xml_encoding))
        result['encoding'] = ''
    elif proposed_encoding != result['encoding']:
        result['bozo'] = 1
        result['bozo_exception'] = CharacterEncodingOverride( \
            'document declared as %s, but parsed as %s' % \
            (result['encoding'], proposed_encoding))
        result['encoding'] = proposed_encoding

    if not _XML_AVAILABLE:
        use_strict_parser = 0
    if use_strict_parser:
        # initialize the SAX parser
        feedparser = _StrictFeedParser(baseuri, baselang, 'utf-8')
        saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS)
        saxparser.setFeature(xml.sax.handler.feature_namespaces, 1)
        saxparser.setContentHandler(feedparser)
        saxparser.setErrorHandler(feedparser)
        source = xml.sax.xmlreader.InputSource()
        source.setByteStream(_StringIO(data))
        if hasattr(saxparser, '_ns_stack'):
            # work around bug in built-in SAX parser (doesn't recognize xml: namespace)
            # PyXML doesn't have this problem, and it doesn't have _ns_stack either
            saxparser._ns_stack.append({'http://www.w3.org/XML/1998/namespace':'xml'})
        try:
            saxparser.parse(source)
        except Exception, e:
            if _debug:
                import traceback
                traceback.print_stack()
                traceback.print_exc()
                sys.stderr.write('xml parsing failed\n')
            result['bozo'] = 1
            result['bozo_exception'] = feedparser.exc or e
            use_strict_parser = 0
    if not use_strict_parser:
        feedparser = _LooseFeedParser(baseuri, baselang, known_encoding and 'utf-8' or '')
        feedparser.feed(data)
    result['feed'] = feedparser.feeddata
    result['entries'] = feedparser.entries
    result['version'] = result['version'] or feedparser.version
    result['namespaces'] = feedparser.namespacesInUse
    return result
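# Example (an illustrative sketch) of typical conditional-GET usage:
#
#     d = parse('http://feedparser.org/docs/examples/atom10.xml')
#     print d.feed.title, d.get('status'), d.version
#     # pass etag/modified back on the next poll; a 304 means nothing changed
#     d2 = parse('http://feedparser.org/docs/examples/atom10.xml',
#                etag=d.get('etag'), modified=d.get('modified'))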
if __name__ == '__main__':
    if not sys.argv[1:]:
        print __doc__
        sys.exit(0)
    else:
        urls = sys.argv[1:]
    zopeCompatibilityHack()
    from pprint import pprint
    for url in urls:
        print url
        print
        result = parse(url)
        pprint(result)
        print

#REVISION HISTORY
#1.0 - 9/27/2002 - MAP - fixed namespace processing on prefixed RSS 2.0 elements,
#  added Simon Fell's test suite
#1.1 - 9/29/2002 - MAP - fixed infinite loop on incomplete CDATA sections
#2.0 - 10/19/2002
#  JD - use inchannel to watch out for image and textinput elements which can
#  also contain title, link, and description elements
#  JD - check for isPermaLink='false' attribute on guid elements
#  JD - replaced openAnything with open_resource supporting ETag and
#  If-Modified-Since request headers
#  JD - parse now accepts etag, modified, agent, and referrer optional
#  arguments
#  JD - modified parse to return a dictionary instead of a tuple so that any
#  etag or modified information can be returned and cached by the caller
#2.0.1 - 10/21/2002 - MAP - changed parse() so that if we don't get anything
#  because of etag/modified, return the old etag/modified to the caller to
#  indicate why nothing is being returned
#2.0.2 - 10/21/2002 - JB - added the inchannel to the if statement, otherwise it's
#  useless.  Fixes the problem JD was addressing by adding it.
#2.1 - 11/14/2002 - MAP - added gzip support
#2.2 - 1/27/2003 - MAP - added attribute support, admin:generatorAgent.
#  start_admingeneratoragent is an example of how to handle elements with
#  only attributes, no content.
#2.3 - 6/11/2003 - MAP - added USER_AGENT for default (if caller doesn't specify);
#  also, make sure we send the User-Agent even if urllib2 isn't available.
#  Match any variation of backend.userland.com/rss namespace.
#2.3.1 - 6/12/2003 - MAP - if item has both link and guid, return both as-is.
#2.4 - 7/9/2003 - MAP - added preliminary Pie/Atom/Echo support based on Sam Ruby's
#  snapshot of July 1 <http://www.intertwingly.net/blog/1506.html>; changed
#  project name
#2.5 - 7/25/2003 - MAP - changed to Python license (all contributors agree);
#  removed unnecessary urllib code -- urllib2 should always be available anyway;
#  return actual url, status, and full HTTP headers (as result['url'],
#  result['status'], and result['headers']) if parsing a remote feed over HTTP --
#  this should pass all the HTTP tests at <http://diveintomark.org/tests/client/http/>;
#  added the latest namespace-of-the-week for RSS 2.0
#2.5.1 - 7/26/2003 - RMK - clear opener.addheaders so we only send our custom
#  User-Agent (otherwise urllib2 sends two, which confuses some servers)
#2.5.2 - 7/28/2003 - MAP - entity-decode inline xml properly; added support for
#  inline <xhtml:body> and <xhtml:div> as used in some RSS 2.0 feeds
#2.5.3 - 8/6/2003 - TvdV - patch to track whether we're inside an image or
#  textInput, and also to return the character encoding (if specified)
#2.6 - 1/1/2004 - MAP - dc:author support (MarekK); fixed bug tracking
#  nested divs within content (JohnD); fixed missing sys import (JohanS);
#  fixed regular expression to capture XML character encoding (Andrei);
#  added support for Atom 0.3-style links; fixed bug with textInput tracking;
#  added support for cloud (MartijnP); added support for multiple
#  category/dc:subject (MartijnP); normalize content model: 'description' gets
#  description (which can come from description, summary, or full content if no
#  description), 'content' gets dict of base/language/type/value (which can come
#  from content:encoded, xhtml:body, content, or fullitem);
#  fixed bug matching arbitrary Userland namespaces; added xml:base and xml:lang
#  tracking; fixed bug tracking unknown tags; fixed bug tracking content when
#  <content> element is not in default namespace (like Pocketsoap feed);
#  resolve relative URLs in link, guid, docs, url, comments, wfw:comment,
#  wfw:commentRSS; resolve relative URLs within embedded HTML markup in
#  description, xhtml:body, content, content:encoded, title, subtitle,
#  summary, info, tagline, and copyright; added support for pingback and
#  trackback namespaces
#2.7 - 1/5/2004 - MAP - really added support for trackback and pingback
#  namespaces, as opposed to 2.6 when I said I did but didn't really;
#  sanitize HTML markup within some elements; added mxTidy support (if
#  installed) to tidy HTML markup within some elements; fixed indentation
#  bug in _parse_date (FazalM); use socket.setdefaulttimeout if available
#  (FazalM); universal date parsing and normalization (FazalM): 'created',
#  'modified', 'issued' are parsed into 9-tuple date format and stored in
#  'created_parsed', 'modified_parsed', and 'issued_parsed'; 'date' is
#  duplicated in 'modified' and vice-versa; 'date_parsed' is duplicated in
#  'modified_parsed' and vice-versa
#2.7.1 - 1/9/2004 - MAP - fixed bug handling &quot; and &apos;.  fixed memory
#  leak not closing url opener (JohnD); added dc:publisher support (MarekK);
#  added admin:errorReportsTo support (MarekK); Python 2.1 dict support (MarekK)
#2.7.4 - 1/14/2004 - MAP - added workaround for improperly formed <br/> tags in
#  encoded HTML (skadz); fixed unicode handling in normalize_attrs (ChrisL);
#  fixed relative URI processing for guid (skadz); added ICBM support; added
#  base64 support
#2.7.5 - 1/15/2004 - MAP - added workaround for malformed DOCTYPE (seen on many
#  blogspot.com sites); added _debug variable
#2.7.6 - 1/16/2004 - MAP - fixed bug with StringIO importing
#3.0b3 - 1/23/2004 - MAP - parse entire feed with real XML parser (if available);
#  added several new supported namespaces; fixed bug tracking naked markup in
#  description; added support for enclosure; added support for source; re-added
#  support for cloud which got dropped somehow; added support for expirationDate
#3.0b4 - 1/26/2004 - MAP - fixed xml:lang inheritance; fixed multiple bugs tracking
#  xml:base URI, one for documents that don't define one explicitly and one for
#  documents that define an outer and an inner xml:base that goes out of scope
#  before the end of the document
#3.0b5 - 1/26/2004 - MAP - fixed bug parsing multiple links at feed level
#3.0b6 - 1/27/2004 - MAP - added feed type and version detection, result['version']
#  will be one of SUPPORTED_VERSIONS.keys() or empty string if unrecognized;
#  added support for creativeCommons:license and cc:license; added support for
#  full Atom content model in title, tagline, info, copyright, summary; fixed bug
#  with gzip encoding (not always telling server we support it when we do)
#3.0b7 - 1/28/2004 - MAP - support Atom-style author element in author_detail
#  (dictionary of 'name', 'url', 'email'); map author to author_detail if author
#  contains name + email address
#3.0b8 - 1/28/2004 - MAP - added support for contributor
#3.0b9 - 1/29/2004 - MAP - fixed check for presence of dict function; added
#  support for summary
#3.0b10 - 1/31/2004 - MAP - incorporated ISO-8601 date parsing routines from
#  xml.util.iso8601
#3.0b11 - 2/2/2004 - MAP - added 'rights' to list of elements that can contain
#  dangerous markup; fiddled with decodeEntities (not right); liberalized
#  date parsing even further
#3.0b12 - 2/6/2004 - MAP - fiddled with decodeEntities (still not right);
#  added support to Atom 0.2 subtitle; added support for Atom content model
#  in copyright; better sanitizing of dangerous HTML elements with end tags
#  (script, frameset)
#3.0b13 - 2/8/2004 - MAP - better handling of empty HTML tags (br, hr, img,
#  etc.) in embedded markup, in either HTML or XHTML form (<br>, <br/>, <br />)
#3.0b14 - 2/8/2004 - MAP - fixed CDATA handling in non-wellformed feeds under
#  Python 2.1
#3.0b15 - 2/11/2004 - MAP - fixed bug resolving relative links in wfw:commentRSS;
#  fixed bug capturing author and contributor URL; fixed bug resolving relative
#  links in author and contributor URL; fixed bug resolving relative links in
#  generator URL; added support for recognizing RSS 1.0; passed Simon Fell's
#  namespace tests, and included them permanently in the test suite with his
#  permission; fixed namespace handling under Python 2.1
#3.0b16 - 2/12/2004 - MAP - fixed support for RSS 0.90 (broken in b15)
#3.0b17 - 2/13/2004 - MAP - determine character encoding as per RFC 3023
#3.0b18 - 2/17/2004 - MAP - always map description to summary_detail (Andrei);
#  use libxml2 (if available)
#3.0b19 - 3/15/2004 - MAP - fixed bug exploding author information when author
#  name was in parentheses; removed ultra-problematic mxTidy support; patch to
#  workaround crash in PyXML/expat when encountering invalid entities
#  (MarkMoraes); support for textinput/textInput
#3.0b20 - 4/7/2004 - MAP - added CDF support
#3.0b21 - 4/14/2004 - MAP - added Hot RSS support
#3.0b22 - 4/19/2004 - MAP - changed 'channel' to 'feed', 'item' to 'entries' in
#  results dict; changed results dict to allow getting values with results.key
#  as well as results[key]; work around embedded illformed HTML with half
#  a DOCTYPE; work around malformed Content-Type header; if character encoding
#  is wrong, try several common ones before falling back to regexes (if this
#  works, bozo_exception is set to CharacterEncodingOverride); fixed character
#  encoding issues in BaseHTMLProcessor by tracking encoding and converting
#  from Unicode to raw strings before feeding data to sgmllib.SGMLParser;
#  convert each value in results to Unicode (if possible), even if using
#  regex-based parsing
#3.0b23 - 4/21/2004 - MAP - fixed UnicodeDecodeError for feeds that contain
#  high-bit characters in attributes in embedded HTML in description (thanks
#  Thijs van de Vossen); moved guid, date, and date_parsed to mapped keys in
#  FeedParserDict; tweaked FeedParserDict.has_key to return True if asking
#  about a mapped key
#3.0fc1 - 4/23/2004 - MAP - made results.entries[0].links[0] and
#  results.entries[0].enclosures[0] into FeedParserDict; fixed typo that could
#  cause the same encoding to be tried twice (even if it failed the first time);
#  fixed DOCTYPE stripping when DOCTYPE contained entity declarations;
#  better textinput and image tracking in illformed RSS 1.0 feeds
#3.0fc2 - 5/10/2004 - MAP - added and passed Sam's amp tests; added and passed
#  my blink tag tests
#3.0fc3 - 6/18/2004 - MAP - fixed bug in _changeEncodingDeclaration that
#  failed to parse utf-16 encoded feeds; made source into a FeedParserDict;
#  duplicate admin:generatorAgent/@rdf:resource in generator_detail.url;
#  added support for image; refactored parse() fallback logic to try other
#  encodings if SAX parsing fails (previously it would only try other encodings
#  if re-encoding failed); remove unichr madness in normalize_attrs now that
#  we're properly tracking encoding in and out of BaseHTMLProcessor; set
#  feed.language from root-level xml:lang; set entry.id from rdf:about;
#  send Accept header
#3.0 - 6/21/2004 - MAP - don't try iso-8859-1 (can't distinguish between
#  iso-8859-1 and windows-1252 anyway, and most incorrectly marked feeds are
#  windows-1252); fixed regression that could cause the same encoding to be
#  tried twice (even if it failed the first time)
#3.0.1 - 6/22/2004 - MAP - default to us-ascii for all text/* content types;
#  recover from malformed content-type header parameter with no equals sign
#  ('text/xml; charset:iso-8859-1')
#3.1 - 6/28/2004 - MAP - added and passed tests for converting HTML entities
#  to Unicode equivalents in illformed feeds (aaronsw); added and
#  passed tests for converting character entities to Unicode equivalents
#  in illformed feeds (aaronsw); test for valid parsers when setting
#  XML_AVAILABLE; make version and encoding available when server returns
#  a 304; add handlers parameter to pass arbitrary urllib2 handlers (like
#  digest auth or proxy support); add code to parse username/password
#  out of url and send as basic authentication; expose downloading-related
#  exceptions in bozo_exception (aaronsw); added __contains__ method to
#  FeedParserDict (aaronsw); added publisher_detail (aaronsw)
#3.2 - 7/3/2004 - MAP - use cjkcodecs and iconv_codec if available; always
#  convert feed to UTF-8 before passing to XML parser; completely revamped
#  logic for determining character encoding and attempting XML parsing
#  (much faster); increased default timeout to 20 seconds; test for presence
#  of Location header on redirects; added tests for many alternate character
#  encodings; support various EBCDIC encodings; support UTF-16BE and
#  UTF-16LE with or without a BOM; support UTF-8 with a BOM; support
#  UTF-32BE and UTF-32LE with or without a BOM; fixed crashing bug if no
#  XML parsers are available; added support for 'Content-encoding: deflate';
#  send blank 'Accept-encoding: ' header if neither gzip nor zlib modules
#  are available
#3.3 - 7/15/2004 - MAP - optimize EBCDIC to ASCII conversion; fix obscure
#  problem tracking xml:base and xml:lang if element declares it, child
#  doesn't, first grandchild redeclares it, and second grandchild doesn't;
#  refactored date parsing; defined public registerDateHandler so callers
#  can add support for additional date formats at runtime; added support
#  for OnBlog, Nate, MSSQL, Greek, and Hungarian dates (ytrewq1); added
#  zopeCompatibilityHack() which turns FeedParserDict into a regular
#  dictionary, required for Zope compatibility, and also makes command-
#  line debugging easier because pprint module formats real dictionaries
#  better than dictionary-like objects; added NonXMLContentType exception,
#  which is stored in bozo_exception when a feed is served with a non-XML
#  media type such as 'text/plain'; respect Content-Language as default
#  language if no xml:lang is present; cloud dict is now FeedParserDict;
#  generator dict is now FeedParserDict; better tracking of xml:lang,
#  including support for xml:lang='' to unset the current language;
#  recognize RSS 1.0 feeds even when RSS 1.0 namespace is not the default
#  namespace; don't overwrite final status on redirects (scenarios:
#  redirecting to a URL that returns 304, redirecting to a URL that
#  redirects to another URL with a different type of redirect); add
#  support for HTTP 303 redirects
#4.0 - MAP - support for relative URIs in xml:base attribute; fixed
#  encoding issue with mxTidy (phopkins); preliminary support for RFC 3229;
#  support for Atom 1.0; support for iTunes extensions; new 'tags' for
#  categories/keywords/etc. as array of dict
#  {'term': term, 'scheme': scheme, 'label': label} to match Atom 1.0
#  terminology; parse RFC 822-style dates with no time; lots of other
#  bug fixes
#4.1 - MAP - removed socket timeout; added support for chardet library