1
"""Universal feed parser
3
Handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds
5
Visit http://feedparser.org/ for the latest version
6
Visit http://feedparser.org/docs/ for the latest documentation
8
Required: Python 2.1 or later
9
Recommended: Python 2.3 or later
10
Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
13
__version__ = "4.1"# + "$Revision$"[11:15] + "-cvs"
14
__license__ = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.
16
Redistribution and use in source and binary forms, with or without modification,
17
are permitted provided that the following conditions are met:
19
* Redistributions of source code must retain the above copyright notice,
20
this list of conditions and the following disclaimer.
21
* Redistributions in binary form must reproduce the above copyright notice,
22
this list of conditions and the following disclaimer in the documentation
23
and/or other materials provided with the distribution.
25
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
26
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
29
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
30
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
31
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
32
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
33
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
34
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
35
POSSIBILITY OF SUCH DAMAGE.
37
__author__ = "Mark Pilgrim <http://diveintomark.org/>"
38
__contributors__ = ["Jason Diamond <http://injektilo.org/>",
39
"John Beimler <http://john.beimler.org/>",
40
"Fazal Majid <http://www.majid.info/mylos/weblog/>",
41
"Aaron Swartz <http://aaronsw.com/>",
42
"Kevin Marks <http://epeus.blogspot.com/>"]
45
# HTTP "User-Agent" header to send to servers when downloading feeds.
46
# If you are embedding feedparser in a larger application, you should
47
# change this to your application name and URL.
48
USER_AGENT = "UniversalFeedParser/%s +http://feedparser.org/" % __version__
49
from miro import config
50
from miro import prefs
51
USER_AGENT += " %s/%s (%s)" % \
52
(config.get(prefs.SHORT_APP_NAME),
53
config.get(prefs.APP_VERSION),
54
config.get(prefs.PROJECT_URL))
56
# HTTP "Accept" header to send to servers when downloading feeds. If you don't
57
# want to send an Accept header, set this to None.
58
ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"
60
# List of preferred XML parsers, by SAX driver name. These will be tried first,
61
# but if they're not installed, Python will keep searching through its own list
62
# of pre-installed parsers until it finds one that supports everything we need.
63
PREFERRED_XML_PARSERS = ["drv_libxml2"]
65
# If you want feedparser to automatically run HTML markup through HTML Tidy, set
66
# this to 1. Requires mxTidy <http://www.egenix.com/files/python/mxTidy.html>
67
# or utidylib <http://utidylib.berlios.de/>.
70
# List of Python interfaces for HTML Tidy, in order of preference. Only useful
72
PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"]
74
# ---------- required modules (should come with any Python distribution) ----------
75
import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi, urllib, urllib2
77
from cStringIO import StringIO as _StringIO
79
from StringIO import StringIO as _StringIO
81
# ---------- optional modules (feedparser will work without these, but with reduced functionality) ----------
83
# gzip is included with most Python distributions, but may not be available if you compiled your own
93
# If a real XML parser is available, feedparser will attempt to use it. feedparser has
94
# been tested with the built-in SAX parser, PyXML, and libxml2. On platforms where the
95
# Python distribution does not come with an XML parser (such as Mac OS X 10.2 and some
96
# versions of FreeBSD), feedparser will quietly fall back on regex-based parsing.
99
xml.sax.make_parser(PREFERRED_XML_PARSERS) # test for valid parsers
100
from xml.sax.saxutils import escape as _xmlescape
104
def _xmlescape(data):
105
data = data.replace('&', '&')
106
data = data.replace('>', '>')
107
data = data.replace('<', '<')
110
# base64 support for Atom feeds that contain embedded binary data
112
import base64, binascii
114
base64 = binascii = None
116
# cjkcodecs and iconv_codec provide support for more character encodings.
117
# Both are available from http://cjkpython.i18n.org/
119
import cjkcodecs.aliases
127
# chardet library auto-detects character encodings
128
# Download from http://chardet.feedparser.org/
132
import chardet.constants
133
chardet.constants._debug = 1
137
# ---------- don't touch these ----------
138
class ThingsNobodyCaresAboutButMe(Exception): pass
139
class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe): pass
140
class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe): pass
141
class NonXMLContentType(ThingsNobodyCaresAboutButMe): pass
142
class UndeclaredNamespace(Exception): pass
144
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
145
sgmllib.special = re.compile('<!')
146
sgmllib.charref = re.compile('&#(x?[0-9A-Fa-f]+)[^0-9A-Fa-f]')
148
SUPPORTED_VERSIONS = {'': 'unknown',
149
'rss090': 'RSS 0.90',
150
'rss091n': 'RSS 0.91 (Netscape)',
151
'rss091u': 'RSS 0.91 (Userland)',
152
'rss092': 'RSS 0.92',
153
'rss093': 'RSS 0.93',
154
'rss094': 'RSS 0.94',
157
'rss': 'RSS (unknown version)',
158
'atom01': 'Atom 0.1',
159
'atom02': 'Atom 0.2',
160
'atom03': 'Atom 0.3',
161
'atom10': 'Atom 1.0',
162
'atom': 'Atom (unknown version)',
170
# Python 2.1 does not have dict
171
from UserDict import UserDict
178
def _entry_equal(a, b):
179
if type(a) == list and type(b) == list:
182
for i in xrange (len(a)):
183
if not _entry_equal(a[i], b[i]):
188
except (SystemExit, KeyboardInterrupt):
193
except (SystemExit, KeyboardInterrupt):
198
class FeedParserDict(UserDict):
199
# This is a complete hack to prevent problems if data is saved with a
200
# newer version of Miro and an older version of Miro tries to open it.
201
# See storedatabase.py for more info.
202
__module__ = 'feedparser'
204
keymap = {'channel': 'feed',
207
'length': 'filesize',
208
'image': 'thumbnail',
210
'date_parsed': 'updated_parsed',
211
'description': ('subtitle', 'summary'),
213
'modified': 'updated',
214
'modified_parsed': 'updated_parsed',
215
'issued': 'published',
216
'issued_parsed': 'published_parsed',
217
'copyright': 'rights',
218
'copyright_detail': 'rights_detail',
219
'tagline': 'subtitle',
220
'tagline_detail': 'subtitle_detail'}
224
if isinstance(keymap[key], tuple):
225
for k in keymap[key]:
226
reverse_keymap[k] = key
228
reverse_keymap[keymap[key]] = key
230
def __init__(self, initialData=None):
231
if isinstance(initialData, dict):
232
UserDict.__init__(self)
233
for key in initialData:
234
self[key] = initialData[key]
235
elif initialData is not None:
236
UserDict.__init__(self, initialData)
238
UserDict.__init__(self)
240
def reverse_key (self, key):
241
if self.reverse_keymap.has_key(key):
242
return self.reverse_keymap[key]
248
def __init__ (self, container):
249
self.container = container
250
self.subiter = UserDict.__iter__(container)
254
return self.container.reverse_key(self.subiter.next())
255
return ExtendedIter (self)
257
def equal(self, other):
259
iter = other.get_iter()
260
except StandardError:
261
iter = other.__iter__()
265
if not _entry_equal(self[key], other[key]):
268
for key in self.get_iter():
269
if not checked.has_key(key):
272
except StandardError:
275
def __getitem__(self, key):
276
if key == 'category':
277
return UserDict.__getitem__(self, 'tags')[0]['term']
278
if key == 'categories':
279
return [(tag['scheme'], tag['term']) for tag in UserDict.__getitem__(self, 'tags')]
280
realkey = self.keymap.get(key, key)
281
if isinstance(realkey, tuple):
283
if UserDict.has_key(self, k):
284
return UserDict.__getitem__(self, k)
285
if UserDict.has_key(self, key):
286
return UserDict.__getitem__(self, key)
287
return UserDict.__getitem__(self, realkey)
289
def __setitem__(self, key, value):
290
for k in self.keymap.keys():
293
if isinstance(key, tuple):
295
return UserDict.__setitem__(self, key, value)
297
def get(self, key, default=None):
298
if self.has_key(key):
303
def setdefault(self, key, value):
304
if not self.has_key(key):
308
def has_key(self, key):
310
return hasattr(self, key) or UserDict.has_key(self, key)
311
except AttributeError:
314
def __getattr__(self, key):
316
assert not key.startswith('_')
317
return self.__getitem__(key)
318
except (SystemExit, KeyboardInterrupt):
321
raise AttributeError, "object has no attribute '%s'" % key
323
def __setattr__(self, key, value):
324
if key.startswith('_') or key == 'data':
325
self.__dict__[key] = value
327
return self.__setitem__(key, value)
329
def __contains__(self, key):
330
return self.has_key(key)
332
def zopeCompatibilityHack():
333
global FeedParserDict
335
def FeedParserDict(aDict=None):
341
_ebcdic_to_ascii_map = None
342
def _ebcdic_to_ascii(s):
343
global _ebcdic_to_ascii_map
344
if not _ebcdic_to_ascii_map:
346
0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
347
16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
348
128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
349
144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
350
32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
351
38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
352
45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
353
186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
354
195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,201,
355
202,106,107,108,109,110,111,112,113,114,203,204,205,206,207,208,
356
209,126,115,116,117,118,119,120,121,122,210,211,212,213,214,215,
357
216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,
358
123,65,66,67,68,69,70,71,72,73,232,233,234,235,236,237,
359
125,74,75,76,77,78,79,80,81,82,238,239,240,241,242,243,
360
92,159,83,84,85,86,87,88,89,90,244,245,246,247,248,249,
361
48,49,50,51,52,53,54,55,56,57,250,251,252,253,254,255
364
_ebcdic_to_ascii_map = string.maketrans( \
365
''.join(map(chr, range(256))), ''.join(map(chr, emap)))
366
return s.translate(_ebcdic_to_ascii_map)
368
_urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)')
369
def _urljoin(base, uri):
370
uri = _urifixer.sub(r'\1\3', uri)
371
return urlparse.urljoin(base, uri)
373
class _FeedParserMixin:
374
namespaces = {'': '',
375
'http://backend.userland.com/rss': '',
376
'http://blogs.law.harvard.edu/tech/rss': '',
377
'http://purl.org/rss/1.0/': '',
378
'http://my.netscape.com/rdf/simple/0.9/': '',
379
'http://example.com/newformat#': '',
380
'http://example.com/necho': '',
381
'http://purl.org/echo/': '',
382
'uri/of/echo/namespace#': '',
383
'http://purl.org/pie/': '',
384
'http://purl.org/atom/ns#': '',
385
'http://www.w3.org/2005/Atom': '',
386
'http://purl.org/rss/1.0/modules/rss091#': '',
388
'http://webns.net/mvcb/': 'admin',
389
'http://purl.org/rss/1.0/modules/aggregation/': 'ag',
390
'http://purl.org/rss/1.0/modules/annotate/': 'annotate',
391
'http://media.tangent.org/rss/1.0/': 'audio',
392
'http://backend.userland.com/blogChannelModule': 'blogChannel',
393
'http://web.resource.org/cc/': 'cc',
394
'http://backend.userland.com/creativeCommonsRssModule': 'creativeCommons',
395
'http://purl.org/rss/1.0/modules/company': 'co',
396
'http://purl.org/rss/1.0/modules/content/': 'content',
397
'http://my.theinfo.org/changed/1.0/rss/': 'cp',
398
'http://purl.org/dc/elements/1.1/': 'dc',
399
'http://purl.org/dc/terms/': 'dcterms',
400
'http://purl.org/rss/1.0/modules/email/': 'email',
401
'http://purl.org/rss/1.0/modules/event/': 'ev',
402
'http://rssnamespace.org/feedburner/ext/1.0': 'feedburner',
403
'http://freshmeat.net/rss/fm/': 'fm',
404
'http://xmlns.com/foaf/0.1/': 'foaf',
405
'http://www.w3.org/2003/01/geo/wgs84_pos#': 'geo',
406
'http://postneo.com/icbm/': 'icbm',
407
'http://purl.org/rss/1.0/modules/image/': 'image',
408
'http://www.itunes.com/DTDs/PodCast-1.0.dtd': 'itunes',
409
'http://example.com/DTDs/PodCast-1.0.dtd': 'itunes',
410
'http://purl.org/rss/1.0/modules/link/': 'l',
411
'http://search.yahoo.com/mrss': 'media',
412
'http://search.yahoo.com/mrss/': 'media',
413
'http://docs.yahoo.com/mediaModule': 'media',
414
'http://tools.search.yahoo.com/mrss/': 'media',
415
'http://madskills.com/public/xml/rss/module/pingback/': 'pingback',
416
'http://prismstandard.org/namespaces/1.2/basic/': 'prism',
417
'http://www.w3.org/1999/02/22-rdf-syntax-ns#': 'rdf',
418
'http://www.w3.org/2000/01/rdf-schema#': 'rdfs',
419
'http://purl.org/rss/1.0/modules/reference/': 'ref',
420
'http://purl.org/rss/1.0/modules/richequiv/': 'reqv',
421
'http://purl.org/rss/1.0/modules/search/': 'search',
422
'http://purl.org/rss/1.0/modules/slash/': 'slash',
423
'http://schemas.xmlsoap.org/soap/envelope/': 'soap',
424
'http://purl.org/rss/1.0/modules/servicestatus/': 'ss',
425
'http://hacks.benhammersley.com/rss/streaming/': 'str',
426
'http://purl.org/rss/1.0/modules/subscription/': 'sub',
427
'http://purl.org/rss/1.0/modules/syndication/': 'sy',
428
'http://purl.org/rss/1.0/modules/taxonomy/': 'taxo',
429
'http://purl.org/rss/1.0/modules/threading/': 'thr',
430
'http://purl.org/rss/1.0/modules/textinput/': 'ti',
431
'http://madskills.com/public/xml/rss/module/trackback/':'trackback',
432
'http://wellformedweb.org/commentAPI/': 'wfw',
433
'http://purl.org/rss/1.0/modules/wiki/': 'wiki',
434
'http://www.w3.org/1999/xhtml': 'xhtml',
435
'http://www.w3.org/XML/1998/namespace': 'xml',
436
'http://schemas.pocketsoap.com/rss/myDescModule/': 'szf',
437
"http://participatoryculture.org/RSSModules/dtv/1.0": 'dtv'
439
_matchnamespaces = {}
441
can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'license', 'icon', 'logo']
442
can_contain_relative_uris = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
443
can_contain_dangerous_markup = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
444
html_types = ['text/html', 'application/xhtml+xml']
446
def __init__(self, baseuri=None, baselang=None, encoding='utf-8'):
447
if _debug: sys.stderr.write('initializing FeedParser\n')
448
if not self._matchnamespaces:
449
for k, v in self.namespaces.items():
450
self._matchnamespaces[k.lower()] = v
451
self.feeddata = FeedParserDict() # feed-level data
452
self.encoding = encoding # character encoding
453
self.entries = [] # list of entry-level data
454
self.version = '' # feed type/version, see SUPPORTED_VERSIONS
455
self.namespacesInUse = {} # dictionary of namespaces defined by the feed
457
# the following are used internally to track state;
458
# this is really out of control and should be refactored
465
self.incontributor = 0
469
self.sourcedata = FeedParserDict()
470
self.contentparams = FeedParserDict()
471
self._summaryKey = None
472
self.namespacemap = {}
473
self.elementstack = []
476
self.baseuri = baseuri or ''
477
self.lang = baselang or None
479
self.feeddata['language'] = baselang
481
def unknown_starttag(self, tag, attrs):
482
if _debug: sys.stderr.write('start %s with %s\n' % (tag, attrs))
484
attrs = [(k.lower(), v) for k, v in attrs]
485
attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
487
# track xml:base and xml:lang
488
attrsD = FeedParserDict(attrs)
489
baseuri = attrsD.get('xml:base', attrsD.get('base')) or self.baseuri
490
self.baseuri = _urljoin(self.baseuri, baseuri)
491
lang = attrsD.get('xml:lang', attrsD.get('lang'))
493
# xml:lang could be explicitly set to '', we need to capture that
496
# if no xml:lang is specified, use parent lang
499
if tag in ('feed', 'rss', 'rdf:RDF'):
500
self.feeddata['language'] = lang
502
self.basestack.append(self.baseuri)
503
self.langstack.append(lang)
506
for prefix, uri in attrs:
507
if prefix.startswith('xmlns:'):
508
self.trackNamespace(prefix[6:], uri)
509
elif prefix == 'xmlns':
510
self.trackNamespace(None, uri)
512
# track inline content
513
if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
514
# element declared itself as escaped markup, but it isn't really
515
self.contentparams['type'] = 'application/xhtml+xml'
516
if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
517
# Note: probably shouldn't simply recreate localname here, but
518
# our namespace handling isn't actually 100% correct in cases where
519
# the feed redefines the default namespace (which is actually
520
# the usual case for inline content, thanks Sam), so here we
521
# cheat and just reconstruct the element based on localname
522
# because that compensates for the bugs in our namespace handling.
523
# This will horribly munge inline content with non-empty qnames,
524
# but nobody actually does that, so I'm not fixing it.
525
tag = tag.split(':')[-1]
526
return self.handle_data('<%s%s>' % (tag, ''.join([' %s="%s"' % t for t in attrs])), escape=0)
529
if tag.find(':') <> -1:
530
prefix, suffix = tag.split(':', 1)
532
prefix, suffix = '', tag
533
prefix = self.namespacemap.get(prefix, prefix)
535
prefix = prefix + '_'
537
# special hack for better tracking of empty textinput/image elements in illformed feeds
538
if (not prefix) and tag not in ('title', 'link', 'description', 'name'):
540
if (not prefix) and tag not in ('title', 'link', 'description', 'url', 'href', 'width', 'height'):
543
# call special handler (if defined) or default handler
544
methodname = '_start_' + prefix + suffix
546
method = getattr(self, methodname)
547
return method(attrsD)
548
except AttributeError:
549
return self.push(prefix + suffix, 1)
551
def unknown_endtag(self, tag):
552
if _debug: sys.stderr.write('end %s\n' % tag)
554
if tag.find(':') <> -1:
555
prefix, suffix = tag.split(':', 1)
557
prefix, suffix = '', tag
558
prefix = self.namespacemap.get(prefix, prefix)
560
prefix = prefix + '_'
562
# call special handler (if defined) or default handler
563
methodname = '_end_' + prefix + suffix
565
method = getattr(self, methodname)
567
except AttributeError:
568
self.pop(prefix + suffix)
570
# track inline content
571
if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
572
# element declared itself as escaped markup, but it isn't really
573
self.contentparams['type'] = 'application/xhtml+xml'
574
if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
575
tag = tag.split(':')[-1]
576
self.handle_data('</%s>' % tag, escape=0)
578
# track xml:base and xml:lang going out of scope
581
if self.basestack and self.basestack[-1]:
582
self.baseuri = self.basestack[-1]
585
if self.langstack: # and (self.langstack[-1] is not None):
586
self.lang = self.langstack[-1]
588
def handle_charref(self, ref):
589
# called for each character reference, e.g. for ' ', ref will be '160'
590
if not self.elementstack: return
592
if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'):
599
text = unichr(c).encode('utf-8')
600
self.elementstack[-1][2].append(text)
602
def handle_entityref(self, ref):
603
# called for each entity reference, e.g. for '©', ref will be 'copy'
604
if not self.elementstack: return
605
if _debug: sys.stderr.write('entering handle_entityref with %s\n' % ref)
606
if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
609
# entity resolution graciously donated by Aaron Swartz
611
import htmlentitydefs
612
if hasattr(htmlentitydefs, 'name2codepoint'): # requires Python 2.3
613
return htmlentitydefs.name2codepoint[k]
614
k = htmlentitydefs.entitydefs[k]
615
if k.startswith('&#') and k.endswith(';'):
616
return int(k[2:-1]) # not in latin-1
619
except KeyError: text = '&%s;' % ref
620
else: text = unichr(name2cp(ref)).encode('utf-8')
621
self.elementstack[-1][2].append(text)
623
def handle_data(self, text, escape=1):
    # Receives each run of plain character data, i.e. text outside any
    # tag that contains no character or entity references.  When we are
    # buffering inline XHTML content the text must be re-escaped before
    # it is appended, so the reconstructed markup stays well-formed.
    stack = self.elementstack
    if not stack:
        return
    in_xhtml = self.contentparams.get('type') == 'application/xhtml+xml'
    if escape and in_xhtml:
        text = _xmlescape(text)
    stack[-1][2].append(text)
631
def handle_comment(self, text):
632
# called for each comment, e.g. <!-- insert message here -->
635
def handle_pi(self, text):
636
# called for each processing instruction, e.g. <?instruction>
639
def handle_decl(self, text):
642
def parse_declaration(self, i):
643
# override internal declaration handler to handle CDATA blocks
644
if _debug: sys.stderr.write('entering parse_declaration\n')
645
if self.rawdata[i:i+9] == '<![CDATA[':
646
k = self.rawdata.find(']]>', i)
647
if k == -1: k = len(self.rawdata)
648
self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0)
651
k = self.rawdata.find('>', i)
654
def mapContentType(self, contentType):
655
contentType = contentType.lower()
656
if contentType == 'text':
657
contentType = 'text/plain'
658
elif contentType == 'html':
659
contentType = 'text/html'
660
elif contentType == 'xhtml':
661
contentType = 'application/xhtml+xml'
664
def trackNamespace(self, prefix, uri):
665
loweruri = uri.lower()
666
if (prefix, loweruri) == (None, 'http://my.netscape.com/rdf/simple/0.9/') and not self.version:
667
self.version = 'rss090'
668
if loweruri == 'http://purl.org/rss/1.0/' and not self.version:
669
self.version = 'rss10'
670
if loweruri == 'http://www.w3.org/2005/atom' and not self.version:
671
self.version = 'atom10'
672
if loweruri.find('backend.userland.com/rss') <> -1:
673
# match any backend.userland.com namespace
674
uri = 'http://backend.userland.com/rss'
676
if self._matchnamespaces.has_key(loweruri):
677
self.namespacemap[prefix] = self._matchnamespaces[loweruri]
678
self.namespacesInUse[self._matchnamespaces[loweruri]] = uri
680
self.namespacesInUse[prefix or ''] = uri
682
def resolveURI(self, uri):
    # Resolve uri against the current xml:base (empty string if none).
    base = self.baseuri or ''
    return _urljoin(base, uri)
685
def decodeEntities(self, element, data):
688
def push(self, element, expectingText):
    # Open a new accumulation frame for element: [name, whether text is
    # expected, list of text pieces collected until the matching pop()].
    frame = [element, expectingText, []]
    self.elementstack.append(frame)
691
def pop(self, element, stripWhitespace=1):
692
if not self.elementstack: return
693
if self.elementstack[-1][0] != element: return
695
element, expectingText, pieces = self.elementstack.pop()
696
output = ''.join(pieces)
698
output = output.strip()
699
if not expectingText: return output
701
# decode base64 content
702
if base64 and self.contentparams.get('base64', 0):
704
output = base64.decodestring(output)
705
except binascii.Error:
707
except binascii.Incomplete:
710
# resolve relative URIs
711
if (element in self.can_be_relative_uri) and output:
712
output = self.resolveURI(output)
714
# decode entities within embedded markup
715
if not self.contentparams.get('base64', 0):
716
output = self.decodeEntities(element, output)
718
# remove temporary cruft from contentparams
720
del self.contentparams['mode']
724
del self.contentparams['base64']
728
# resolve relative URIs within embedded markup
729
if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types:
730
if element in self.can_contain_relative_uris:
731
output = _resolveRelativeURIs(output, self.baseuri, self.encoding)
733
# sanitize embedded markup
734
if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types:
735
if element in self.can_contain_dangerous_markup:
736
output = sanitizeHTML(output, self.encoding)
738
if self.encoding and type(output) != type(u''):
740
output = unicode(output, self.encoding)
741
except (SystemExit, KeyboardInterrupt):
746
# categories/tags/keywords/whatever are handled in _end_category
747
if element == 'category':
750
# store output in appropriate place(s)
751
if self.inentry and not self.insource:
752
if element == 'content':
753
self.entries[-1].setdefault(element, [])
754
contentparams = copy.deepcopy(self.contentparams)
755
contentparams['value'] = output
756
self.entries[-1][element].append(contentparams)
757
elif element == 'link':
758
self.entries[-1][element] = output
760
self.entries[-1]['links'][-1]['href'] = output
762
if element == 'description':
764
self.entries[-1][element] = output
766
contentparams = copy.deepcopy(self.contentparams)
767
contentparams['value'] = output
768
self.entries[-1][element + '_detail'] = contentparams
769
elif (self.infeed or self.insource) and (not self.intextinput) and (not self.inimage):
770
context = self._getContext()
771
if element == 'description':
773
context[element] = output
774
if element == 'link':
775
context['links'][-1]['href'] = output
777
contentparams = copy.deepcopy(self.contentparams)
778
contentparams['value'] = output
779
context[element + '_detail'] = contentparams
782
def pushContent(self, tag, attrsD, defaultContentType, expectingText):
784
self.contentparams = FeedParserDict({
785
'type': self.mapContentType(attrsD.get('type', defaultContentType)),
786
'language': self.lang,
787
'base': self.baseuri})
788
self.contentparams['base64'] = self._isBase64(attrsD, self.contentparams)
789
self.push(tag, expectingText)
791
def popContent(self, tag):
792
value = self.pop(tag)
794
self.contentparams.clear()
797
def _mapToStandardPrefix(self, name):
798
colonpos = name.find(':')
800
prefix = name[:colonpos]
801
suffix = name[colonpos+1:]
802
prefix = self.namespacemap.get(prefix, prefix)
803
name = prefix + ':' + suffix
806
def _getAttribute(self, attrsD, name):
    # Look the attribute up under its namespace-normalized name.
    standard = self._mapToStandardPrefix(name)
    return attrsD.get(standard)
809
def _isBase64(self, attrsD, contentparams):
810
if attrsD.get('mode', '') == 'base64':
812
# We should never assume text is base64 --NN
815
if self.contentparams['type'].startswith('text/'):
817
if self.contentparams['type'].endswith('+xml'):
819
if self.contentparams['type'].endswith('/xml'):
823
def _itsAnHrefDamnIt(self, attrsD):
824
href = attrsD.get('url', attrsD.get('uri', attrsD.get('href', None)))
834
attrsD['href'] = href
837
def _save(self, key, value):
    # Store value under key in the current context, but never clobber
    # a value that was already recorded (first occurrence wins).
    self._getContext().setdefault(key, value)
841
def _start_rss(self, attrsD):
842
versionmap = {'0.91': 'rss091u',
847
attr_version = attrsD.get('version', '')
848
version = versionmap.get(attr_version)
850
self.version = version
851
elif attr_version.startswith('2.'):
852
self.version = 'rss20'
856
def _start_dlhottitles(self, attrsD):
    # A <dlhottitles> root element identifies the (rare) "hot RSS" format.
    self.version = 'hotrss'
859
def _start_channel(self, attrsD):
861
self._cdf_common(attrsD)
862
_start_feedinfo = _start_channel
864
def _cdf_common(self, attrsD):
865
if attrsD.has_key('lastmod'):
866
self._start_modified({})
867
self.elementstack[-1][-1] = attrsD['lastmod']
869
if attrsD.has_key('href'):
871
self.elementstack[-1][-1] = attrsD['href']
874
def _start_feed(self, attrsD):
876
versionmap = {'0.1': 'atom01',
880
attr_version = attrsD.get('version')
881
version = versionmap.get(attr_version)
883
self.version = version
885
self.version = 'atom'
887
def _end_channel(self):
889
_end_feed = _end_channel
891
def _start_image(self, attrsD):
893
self.push('image', 0)
894
context = self._getContext()
895
context.setdefault('image', FeedParserDict())
897
def _end_image(self):
901
def _start_textinput(self, attrsD):
903
self.push('textinput', 0)
904
context = self._getContext()
905
context.setdefault('textinput', FeedParserDict())
906
_start_textInput = _start_textinput
908
def _end_textinput(self):
909
self.pop('textinput')
911
_end_textInput = _end_textinput
913
def _start_author(self, attrsD):
915
self.push('author', 1)
916
_start_managingeditor = _start_author
917
_start_dc_author = _start_author
918
_start_dc_creator = _start_author
919
_start_itunes_author = _start_author
921
def _end_author(self):
924
self._sync_author_detail()
925
_end_managingeditor = _end_author
926
_end_dc_author = _end_author
927
_end_dc_creator = _end_author
928
_end_itunes_author = _end_author
930
def _start_itunes_owner(self, attrsD):
932
self.push('publisher', 0)
934
def _end_itunes_owner(self):
935
self.pop('publisher')
937
self._sync_author_detail('publisher')
939
def _start_contributor(self, attrsD):
    # Begin a new contributor record on the current context; its fields
    # are filled in later by _save_contributor.
    context = self._getContext()
    context.setdefault('contributors', [])
    context['contributors'].append(FeedParserDict())
    self.incontributor = 1
    self.push('contributor', 0)
946
def _end_contributor(self):
    # Close the contributor record opened in _start_contributor.
    self.pop('contributor')
    self.incontributor = 0
950
def _start_dc_contributor(self, attrsD):
951
self.incontributor = 1
952
context = self._getContext()
953
context.setdefault('contributors', [])
954
context['contributors'].append(FeedParserDict())
957
def _end_dc_contributor(self):
959
self.incontributor = 0
961
def _start_name(self, attrsD):
963
_start_itunes_name = _start_name
966
value = self.pop('name')
968
self._save_author('name', value, 'publisher')
970
self._save_author('name', value)
971
elif self.incontributor:
972
self._save_contributor('name', value)
973
elif self.intextinput:
974
context = self._getContext()
975
context['textinput']['name'] = value
976
_end_itunes_name = _end_name
978
def _start_width(self, attrsD):
    # <width> inside an image element: numeric text, no mixed content.
    self.push('width', 0)
981
def _end_width(self):
982
value = self.pop('width')
985
except (SystemExit, KeyboardInterrupt):
990
context = self._getContext()
991
context['image']['width'] = value
993
def _start_height(self, attrsD):
    # <height> inside an image element: numeric text, no mixed content.
    self.push('height', 0)
996
def _end_height(self):
997
value = self.pop('height')
1000
except (SystemExit, KeyboardInterrupt):
1005
context = self._getContext()
1006
context['image']['height'] = value
1008
def _start_url(self, attrsD):
    # <url>, <homePage> and <uri> all feed into a single 'href' value.
    self.push('href', 1)
_start_homepage = _start_url
_start_uri = _start_url
1014
value = self.pop('href')
1016
self._save_author('href', value)
1017
elif self.incontributor:
1018
self._save_contributor('href', value)
1020
context = self._getContext()
1021
context['image']['href'] = value
1022
elif self.intextinput:
1023
context = self._getContext()
1024
context['textinput']['link'] = value
1025
_end_homepage = _end_url
1028
def _start_email(self, attrsD):
    # <email> / <itunes:email>: plain text, no mixed content expected.
    self.push('email', 0)
_start_itunes_email = _start_email
1032
def _end_email(self):
1033
value = self.pop('email')
1034
if self.inpublisher:
1035
self._save_author('email', value, 'publisher')
1037
self._save_author('email', value)
1038
elif self.incontributor:
1039
self._save_contributor('email', value)
1040
_end_itunes_email = _end_email
1042
def _getContext(self):
1044
context = self.sourcedata
1046
context = self.entries[-1]
1048
context = self.feeddata
1051
def _save_author(self, key, value, prefix='author'):
    # Record one field (name/email/href) of an author-like person under
    # '<prefix>_detail', then re-derive the combined '<prefix>' string.
    detail_key = prefix + '_detail'
    context = self._getContext()
    context.setdefault(detail_key, FeedParserDict())
    context[detail_key][key] = value
    self._sync_author_detail()
1057
def _save_contributor(self, key, value):
    # Record one field of the most recently opened contributor,
    # creating the contributors list if nothing opened one yet.
    context = self._getContext()
    context.setdefault('contributors', [FeedParserDict()])
    current = context['contributors'][-1]
    current[key] = value
1062
def _sync_author_detail(self, key='author'):
1063
context = self._getContext()
1064
detail = context.get('%s_detail' % key)
1066
name = detail.get('name')
1067
email = detail.get('email')
1069
context[key] = '%s (%s)' % (name, email)
1073
context[key] = email
1075
author = context.get(key)
1076
if not author: return
1077
emailmatch = re.search(r'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))''', author)
1078
if not emailmatch: return
1079
email = emailmatch.group(0)
1080
# probably a better way to do the following, but it passes all the tests
1081
author = author.replace(email, '')
1082
author = author.replace('()', '')
1083
author = author.strip()
1084
if author and (author[0] == '('):
1086
if author and (author[-1] == ')'):
1087
author = author[:-1]
1088
author = author.strip()
1089
context.setdefault('%s_detail' % key, FeedParserDict())
1090
context['%s_detail' % key]['name'] = author
1091
context['%s_detail' % key]['email'] = email
1093
def _start_subtitle(self, attrsD):
    '''Capture <subtitle> (also <tagline>, itunes:subtitle) as plain text.'''
    self.pushContent('subtitle', attrsD, 'text/plain', 1)
_start_tagline = _start_subtitle
_start_itunes_subtitle = _start_subtitle

def _end_subtitle(self):
    self.popContent('subtitle')
_end_tagline = _end_subtitle
_end_itunes_subtitle = _end_subtitle
1103
def _start_rights(self, attrsD):
    '''Capture <rights> (also dc:rights, <copyright>) as plain text.'''
    self.pushContent('rights', attrsD, 'text/plain', 1)
_start_dc_rights = _start_rights
_start_copyright = _start_rights

def _end_rights(self):
    self.popContent('rights')
_end_dc_rights = _end_rights
_end_copyright = _end_rights
1113
def _start_item(self, attrsD):
    '''Open a new entry (RSS <item>, Atom <entry>, CDF element).'''
    self.entries.append(FeedParserDict())
    self.push('item', 0)
    self.inentry = 1
    self.guidislink = 0
    id = self._getAttribute(attrsD, 'rdf:about')
    if id:
        context = self._getContext()
        context['id'] = id
    self._cdf_common(attrsD)
_start_entry = _start_item
_start_product = _start_item

def _end_item(self):
    '''Close the current entry.'''
    self.pop('item')
    self.inentry = 0
_end_entry = _end_item
1131
def _start_dc_language(self, attrsD):
    self.push('language', 1)
_start_language = _start_dc_language

def _end_dc_language(self):
    # the feed-level language becomes the default for subsequent elements
    self.lang = self.pop('language')
_end_language = _end_dc_language
1139
def _start_dc_publisher(self, attrsD):
    self.push('publisher', 1)
_start_webmaster = _start_dc_publisher

def _end_dc_publisher(self):
    self.pop('publisher')
    self._sync_author_detail('publisher')
_end_webmaster = _end_dc_publisher
1148
def _start_published(self, attrsD):
    self.push('published', 1)
_start_dcterms_issued = _start_published
_start_issued = _start_published

def _end_published(self):
    value = self.pop('published')
    self._save('published_parsed', _parse_date(value))
_end_dcterms_issued = _end_published
_end_issued = _end_published
1159
def _start_updated(self, attrsD):
    self.push('updated', 1)
_start_modified = _start_updated
_start_dcterms_modified = _start_updated
_start_pubdate = _start_updated
_start_dc_date = _start_updated

def _end_updated(self):
    value = self.pop('updated')
    parsed_value = _parse_date(value)
    self._save('updated_parsed', parsed_value)
_end_modified = _end_updated
_end_dcterms_modified = _end_updated
_end_pubdate = _end_updated
_end_dc_date = _end_updated
1175
def _start_created(self, attrsD):
    self.push('created', 1)
_start_dcterms_created = _start_created

def _end_created(self):
    value = self.pop('created')
    self._save('created_parsed', _parse_date(value))
_end_dcterms_created = _end_created
1184
def _start_expirationdate(self, attrsD):
    self.push('expired', 1)

def _end_expirationdate(self):
    self._save('expired_parsed', _parse_date(self.pop('expired')))
1190
def _start_cc_license(self, attrsD):
    '''Creative Commons license given as an rdf:resource attribute.'''
    self.push('license', 1)
    value = self._getAttribute(attrsD, 'rdf:resource')
    if value:
        self.elementstack[-1][2].append(value)
    self.pop('license')

def _start_creativecommons_license(self, attrsD):
    self.push('license', 1)

def _end_creativecommons_license(self):
    self.pop('license')
1203
def _addTag(self, term, scheme, label):
    '''Append a {term, scheme, label} tag to the current context,
    skipping all-empty tags and exact duplicates.'''
    context = self._getContext()
    tags = context.setdefault('tags', [])
    if (not term) and (not scheme) and (not label): return
    value = FeedParserDict({'term': term, 'scheme': scheme, 'label': label})
    if value not in tags:
        tags.append(FeedParserDict({'term': term, 'scheme': scheme, 'label': label}))
1211
def _start_category(self, attrsD):
    '''Handle <category> and equivalents; RSS uses 'domain' where Atom uses 'scheme'.'''
    if _debug: sys.stderr.write('entering _start_category with %s\n' % repr(attrsD))
    term = attrsD.get('term')
    scheme = attrsD.get('scheme', attrsD.get('domain'))
    label = attrsD.get('label')
    self._addTag(term, scheme, label)
    self.push('category', 1)
_start_dc_subject = _start_category
_start_keywords = _start_category
_start_media_category = _start_category

def _end_itunes_keywords(self):
    # itunes:keywords is a whitespace-separated list of terms
    for term in self.pop('itunes_keywords').split():
        self._addTag(term, 'http://www.itunes.com/', None)

def _start_itunes_category(self, attrsD):
    self._addTag(attrsD.get('text'), 'http://www.itunes.com/', None)
    self.push('category', 1)
1230
def _end_category(self):
    '''Close a category: fill in the term of the tag created by the start
    handler if it was empty, otherwise add a new tag.'''
    value = self.pop('category')
    if not value: return
    context = self._getContext()
    tags = context['tags']
    if value and len(tags) and not tags[-1]['term']:
        tags[-1]['term'] = value
    else:
        self._addTag(value, None, None)
_end_dc_subject = _end_category
_end_keywords = _end_category
_end_itunes_category = _end_category
_end_media_category = _end_category
1244
def _start_cloud(self, attrsD):
    '''RSS <cloud> carries all its data as attributes.'''
    self._getContext()['cloud'] = FeedParserDict(attrsD)
1247
def _start_link(self, attrsD):
    '''Handle <link>: record it in links[], treat rel=enclosure specially,
    and promote the alternate HTML link to context['link'].'''
    attrsD.setdefault('rel', 'alternate')
    attrsD.setdefault('type', 'text/html')
    attrsD = self._itsAnHrefDamnIt(attrsD)
    if attrsD.has_key('href'):
        attrsD['href'] = self.resolveURI(attrsD['href'])
    expectingText = self.infeed or self.inentry or self.insource
    context = self._getContext()
    context.setdefault('links', [])
    context['links'].append(FeedParserDict(attrsD))
    if attrsD['rel'] == 'enclosure':
        self._start_enclosure(attrsD)
    if attrsD.has_key('href'):
        expectingText = 0
        if (attrsD.get('rel') == 'alternate') and (self.mapContentType(attrsD.get('type')) in self.html_types):
            context['link'] = attrsD['href']
    else:
        self.push('link', expectingText)
_start_producturl = _start_link
1267
def _end_link(self):
    value = self.pop('link')
    context = self._getContext()
    if self.intextinput:
        context['textinput']['link'] = value
    if self.inimage:
        context['image']['link'] = value
_end_producturl = _end_link
1276
def _start_guid(self, attrsD):
    self.guidislink = (attrsD.get('ispermalink', 'true') == 'true')
    self.push('id', 1)

def _end_guid(self):
    value = self.pop('id')
    self._save('guidislink', self.guidislink and not self._getContext().has_key('link'))
    if self.guidislink:
        # guid acts as link, but only if 'ispermalink' is not present or is 'true',
        # and only if the item doesn't already have a link element
        self._save('link', value)
1288
def _start_title(self, attrsD):
    self.pushContent('title', attrsD, 'text/plain', self.infeed or self.inentry or self.insource)
_start_dc_title = _start_title
_start_media_title = _start_title

def _end_title(self):
    value = self.popContent('title')
    context = self._getContext()
    if self.intextinput:
        context['textinput']['title'] = value
    elif self.inimage:
        context['image']['title'] = value
_end_dc_title = _end_title
_end_media_title = _end_title
1303
def _start_description(self, attrsD):
    '''RSS <description> doubles as Atom content when a summary already exists.'''
    context = self._getContext()
    if context.has_key('summary'):
        self._summaryKey = 'content'
        self._start_content(attrsD)
    else:
        self.pushContent('description', attrsD, 'text/html', self.infeed or self.inentry or self.insource)

def _start_abstract(self, attrsD):
    self.pushContent('description', attrsD, 'text/plain', self.infeed or self.inentry or self.insource)

def _end_description(self):
    if self._summaryKey == 'content':
        self._end_content()
    else:
        value = self.popContent('description')
        context = self._getContext()
        if self.intextinput:
            context['textinput']['description'] = value
        elif self.inimage:
            context['image']['description'] = value
    self._summaryKey = None
_end_abstract = _end_description
1327
def _start_info(self, attrsD):
    self.pushContent('info', attrsD, 'text/plain', 1)
_start_feedburner_browserfriendly = _start_info

def _end_info(self):
    self.popContent('info')
_end_feedburner_browserfriendly = _end_info
1335
def _start_generator(self, attrsD):
    if attrsD:
        attrsD = self._itsAnHrefDamnIt(attrsD)
        if attrsD.has_key('href'):
            attrsD['href'] = self.resolveURI(attrsD['href'])
    self._getContext()['generator_detail'] = FeedParserDict(attrsD)
    self.push('generator', 1)

def _end_generator(self):
    value = self.pop('generator')
    context = self._getContext()
    if context.has_key('generator_detail'):
        context['generator_detail']['name'] = value
1349
def _start_admin_generatoragent(self, attrsD):
    '''admin:generatorAgent carries its value in rdf:resource.'''
    self.push('generator', 1)
    value = self._getAttribute(attrsD, 'rdf:resource')
    if value:
        self.elementstack[-1][2].append(value)
    self.pop('generator')
    self._getContext()['generator_detail'] = FeedParserDict({'href': value})

def _start_admin_errorreportsto(self, attrsD):
    self.push('errorreportsto', 1)
    value = self._getAttribute(attrsD, 'rdf:resource')
    if value:
        self.elementstack[-1][2].append(value)
    self.pop('errorreportsto')
1364
def _start_summary(self, attrsD):
    '''A second summary-like element is treated as content.'''
    context = self._getContext()
    if context.has_key('summary'):
        self._summaryKey = 'content'
        self._start_content(attrsD)
    else:
        self._summaryKey = 'summary'
        self.pushContent(self._summaryKey, attrsD, 'text/plain', 1)
_start_itunes_summary = _start_summary

def _end_summary(self):
    if self._summaryKey == 'content':
        self._end_content()
    else:
        self.popContent(self._summaryKey or 'summary')
    self._summaryKey = None
_end_itunes_summary = _end_summary
1382
def _start_enclosure(self, attrsD):
    self.inenclosure += 1
    attrsD = self._itsAnHrefDamnIt(attrsD)
    self._getContext().setdefault('enclosures', []).append(FeedParserDict(attrsD))
_start_media_content = _start_enclosure

def _end_enclosure(self):
    self.inenclosure -= 1
_end_media_content = _end_enclosure
1392
def _start_media_thumbnail(self, attrsD):
    '''media:thumbnail attaches to the current enclosure if inside one,
    otherwise to the entry itself.'''
    self.push('media:thumbnail', 1)
    if self.inentry:
        if self.inenclosure:
            self.entries[-1]['enclosures'][-1]['thumbnail'] = FeedParserDict(attrsD)
        else:
            self.entries[-1]['thumbnail'] = FeedParserDict(attrsD)

def _end_media_thumbnail(self):
    self.pop('media:thumbnail')
1403
def _start_media_text(self, attrsD):
    self.push('media:text', 1)

def _end_media_text(self):
    value = self.pop('media:text')
    if self.inentry:
        if self.inenclosure:
            self.entries[-1]['enclosures'][-1]['text'] = value
        else:
            self.entries[-1]['text'] = value
1414
def _start_media_people(self, attrsD):
    self.push('media:people', 1)
    try:
        self.peoplerole = attrsD['role']
    except (SystemExit, KeyboardInterrupt):
        raise
    except:
        self.peoplerole = 'unknown'

def _end_media_people(self):
    # media:people is a '|'-separated list of names
    value = self.pop('media:people').split('|')
    if self.inentry:
        if self.inenclosure:
            self.entries[-1]['enclosures'][-1].setdefault('roles', {})
            self.entries[-1]['enclosures'][-1]['roles'][self.peoplerole] = value
        else:
            self.entries[-1].setdefault('roles', {})
            self.entries[-1]['roles'][self.peoplerole] = value
1433
def _start_dtv_startnback(self, attrsD):
    self.push('dtv:startnback', 1)

def _end_dtv_startnback(self):
    self.feeddata['startnback'] = self.pop('dtv:startnback')

def _start_dtv_librarylink(self, attrsD):
    self.push('dtv:librarylink', 1)

def _end_dtv_librarylink(self):
    self.feeddata['librarylink'] = self.pop('dtv:librarylink')
1445
def _start_dtv_releasedate(self, attrsD):
    self.push('dtv:releasedate', 1)

def _end_dtv_releasedate(self):
    value = self.pop('dtv:releasedate')
    if self.inentry:
        if self.inenclosure:
            self.entries[-1]['enclosures'][-1]['releasedate'] = value
            self.entries[-1]['enclosures'][-1]['releasedate_parsed'] = _parse_date(value)
        else:
            self.entries[-1]['releasedate'] = value
            self.entries[-1]['releasedate_parsed'] = _parse_date(value)
1458
def _start_dtv_paymentlink(self, attrsD):
    # dtv:paymentlink may carry inline XHTML; capture it as such
    self.contentparams['mode'] = 'xml'
    self.contentparams['type'] = 'application/xhtml+xml'
    self.push('dtv:paymentlink', 1)
    if self.inentry:
        if attrsD.has_key('url'):
            if self.inenclosure:
                self.entries[-1]['enclosures'][-1]['payment_url'] = attrsD['url']
            else:
                self.entries[-1]['payment_url'] = attrsD['url']

def _end_dtv_paymentlink(self):
    value = sanitizeHTML(self.pop('dtv:paymentlink'), self.encoding)
    self.contentparams.clear()
    if self.inentry:
        if self.inenclosure:
            self.entries[-1]['enclosures'][-1]['payment_html'] = value
        else:
            self.entries[-1]['payment_html'] = value
1480
def _start_source(self, attrsD):
    self.insource = 1

def _end_source(self):
    self.insource = 0
    self._getContext()['source'] = copy.deepcopy(self.sourcedata)
    self.sourcedata.clear()
1488
def _start_content(self, attrsD):
    self.pushContent('content', attrsD, 'text/plain', 1)
    src = attrsD.get('src')
    if src:
        self.contentparams['src'] = src
    self.push('content', 1)

def _start_prodlink(self, attrsD):
    self.pushContent('content', attrsD, 'text/html', 1)

def _start_body(self, attrsD):
    self.pushContent('content', attrsD, 'application/xhtml+xml', 1)
_start_xhtml_body = _start_body

def _start_content_encoded(self, attrsD):
    self.pushContent('content', attrsD, 'text/html', 1)
_start_fullitem = _start_content_encoded
1506
def _end_content(self):
    # plain text and HTML content also doubles as the description
    copyToDescription = self.mapContentType(self.contentparams.get('type')) in (['text/plain'] + self.html_types)
    value = self.popContent('content')
    if copyToDescription:
        self._save('description', value)
_end_body = _end_content
_end_xhtml_body = _end_content
_end_content_encoded = _end_content
_end_fullitem = _end_content
_end_prodlink = _end_content
1517
def _start_itunes_image(self, attrsD):
    self.push('itunes_image', 0)
    self._getContext()['image'] = FeedParserDict({'href': attrsD.get('href')})

def _start_itunes_link(self, attrsD):
    self.push('itunes_link', 0)
    self._getContext()['link'] = FeedParserDict({'href': attrsD.get('href')})

def _end_itunes_block(self):
    value = self.pop('itunes_block', 0)
    self._getContext()['itunes_block'] = (value == 'yes') and 1 or 0

def _end_itunes_explicit(self):
    value = self.pop('itunes_explicit', 0)
    self._getContext()['itunes_explicit'] = (value == 'yes') and 1 or 0
1534
class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler):
    '''SAX-based parser for well-formed feeds.'''

    def __init__(self, baseuri, baselang, encoding):
        if _debug: sys.stderr.write('trying StrictFeedParser\n')
        xml.sax.handler.ContentHandler.__init__(self)
        _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
        self.bozo = 0
        self.exc = None

    def startPrefixMapping(self, prefix, uri):
        self.trackNamespace(prefix, uri)

    def startElementNS(self, name, qname, attrs):
        namespace, localname = name
        lowernamespace = str(namespace or '').lower()
        if lowernamespace.find('backend.userland.com/rss') != -1:
            # match any backend.userland.com namespace
            namespace = 'http://backend.userland.com/rss'
            lowernamespace = namespace
        if qname and qname.find(':') > 0:
            givenprefix = qname.split(':')[0]
        else:
            givenprefix = None
        prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
        if givenprefix and (prefix == None or (prefix == '' and lowernamespace == '')) and not self.namespacesInUse.has_key(givenprefix):
            raise UndeclaredNamespace("'%s' is not associated with a namespace" % givenprefix)
        if prefix:
            localname = prefix + ':' + localname
        localname = str(localname).lower()
        if _debug: sys.stderr.write('startElementNS: qname = %s, namespace = %s, givenprefix = %s, prefix = %s, attrs = %s, localname = %s\n' % (qname, namespace, givenprefix, prefix, attrs.items(), localname))

        # qname implementation is horribly broken in Python 2.1 (it
        # doesn't report any), and slightly broken in Python 2.2 (it
        # doesn't report the xml: namespace). So we match up namespaces
        # with a known list first, and then possibly override them with
        # the qnames the SAX parser gives us (if indeed it gives us any
        # at all).  Thanks to MatejC for helping me test this and
        # tirelessly telling me that it didn't work yet.
        attrsD = {}
        for (namespace, attrlocalname), attrvalue in attrs._attrs.items():
            lowernamespace = (namespace or '').lower()
            prefix = self._matchnamespaces.get(lowernamespace, '')
            if prefix:
                attrlocalname = prefix + ':' + attrlocalname
            attrsD[str(attrlocalname).lower()] = attrvalue
        for qname in attrs.getQNames():
            attrsD[str(qname).lower()] = attrs.getValueByQName(qname)
        self.unknown_starttag(localname, attrsD.items())

    def characters(self, text):
        self.handle_data(text)

    def endElementNS(self, name, qname):
        namespace, localname = name
        lowernamespace = str(namespace or '').lower()
        if qname and qname.find(':') > 0:
            givenprefix = qname.split(':')[0]
        else:
            givenprefix = ''
        prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
        if prefix:
            localname = prefix + ':' + localname
        localname = str(localname).lower()
        self.unknown_endtag(localname)

    def error(self, exc):
        # recoverable parse error: flag the feed as bozo but keep going
        self.bozo = 1
        self.exc = exc

    def fatalError(self, exc):
        self.error(exc)
        raise exc
1606
class _BaseHTMLProcessor(sgmllib.SGMLParser):
    '''SGML parser that reconstructs the HTML it parses; subclasses
    override handlers to filter or rewrite markup on the way through.'''

    # elements that must be emitted self-closed (no end tag)
    elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
      'img', 'input', 'isindex', 'link', 'meta', 'param']

    def __init__(self, encoding):
        self.encoding = encoding
        if _debug: sys.stderr.write('entering BaseHTMLProcessor, encoding=%s\n' % self.encoding)
        sgmllib.SGMLParser.__init__(self)

    def reset(self):
        self.pieces = []
        sgmllib.SGMLParser.reset(self)

    def _shorttag_replace(self, match):
        tag = match.group(1)
        if tag in self.elements_no_end_tag:
            return '<' + tag + ' />'
        else:
            return '<' + tag + '></' + tag + '>'

    def feed(self, data):
        data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'&lt;!\1', data)
        #data = re.sub(r'<(\S+?)\s*?/>', self._shorttag_replace, data) # bug [ 1399464 ] Bad regexp for _shorttag_replace
        data = re.sub(r'<([^<\s]+?)\s*/>', self._shorttag_replace, data)
        data = data.replace('&#39;', "'")
        data = data.replace('&#34;', '"')
        if self.encoding and type(data) == type(u''):
            data = data.encode(self.encoding)
        sgmllib.SGMLParser.feed(self, data)

    def normalize_attrs(self, attrs):
        # utility method to be called by descendants
        attrs = [(k.lower(), v) for k, v in attrs]
        attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
        return attrs

    def parse_starttag(self, i):
        retval = sgmllib.SGMLParser.parse_starttag(self, i)
        try:
            # treat XML-style '<tag/>' as an immediate open+close
            if self.get_starttag_text()[-2:] == "/>":
                self.finish_endtag(self.lasttag)
        except (SystemExit, KeyboardInterrupt):
            raise
        except:
            pass
        return retval

    def unknown_starttag(self, tag, attrs):
        # called for each start tag
        # attrs is a list of (attr, value) tuples
        # e.g. for <pre class='screen'>, tag='pre', attrs=[('class', 'screen')]
        if _debug: sys.stderr.write('_BaseHTMLProcessor, unknown_starttag, tag=%s\n' % tag)
        uattrs = []
        # thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds
        for key, value in attrs:
            if type(value) != type(u''):
                value = unicode(value, self.encoding)
            uattrs.append((unicode(key, self.encoding), value))
        strattrs = u''.join([u' %s="%s"' % (key, value) for key, value in uattrs]).encode(self.encoding)
        if tag in self.elements_no_end_tag:
            self.pieces.append('<%(tag)s%(strattrs)s />' % locals())
        else:
            self.pieces.append('<%(tag)s%(strattrs)s>' % locals())

    def unknown_endtag(self, tag):
        # called for each end tag, e.g. for </pre>, tag will be 'pre'
        # Reconstruct the original end tag.
        if tag not in self.elements_no_end_tag:
            self.pieces.append("</%(tag)s>" % locals())

    def handle_charref(self, ref):
        # called for each character reference, e.g. for '&#160;', ref will be '160'
        # Reconstruct the original character reference.
        self.pieces.append('&#%(ref)s;' % locals())

    def handle_entityref(self, ref):
        # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
        # Reconstruct the original entity reference.
        self.pieces.append('&%(ref)s;' % locals())

    def handle_data(self, text):
        # called for each block of plain text, i.e. outside of any tag and
        # not containing any character or entity references
        # Store the original text verbatim.
        if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_text, text=%s\n' % text)
        self.pieces.append(text)

    def handle_comment(self, text):
        # called for each HTML comment, e.g. <!-- insert Javascript code here -->
        # Reconstruct the original comment.
        self.pieces.append('<!--%(text)s-->' % locals())

    def handle_pi(self, text):
        # called for each processing instruction, e.g. <?instruction>
        # Reconstruct original processing instruction.
        self.pieces.append('<?%(text)s>' % locals())

    def handle_decl(self, text):
        # called for the DOCTYPE, if present, e.g.
        # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
        #     "http://www.w3.org/TR/html4/loose.dtd">
        # Reconstruct original DOCTYPE
        self.pieces.append('<!%(text)s>' % locals())

    _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
    def _scan_name(self, i, declstartpos):
        rawdata = self.rawdata
        n = len(rawdata)
        if i == n:
            return None, -1
        m = self._new_declname_match(rawdata, i)
        if m:
            s = m.group()
            name = s.strip()
            if (i + len(s)) == n:
                return None, -1 # end of buffer
            return name.lower(), m.end()
        else:
            self.handle_data(rawdata)
#            self.updatepos(declstartpos, i)
            return None, -1

    def output(self):
        '''Return processed HTML as a single string'''
        return ''.join([str(p) for p in self.pieces])
1732
class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor):
    '''SGML-based fallback parser for ill-formed feeds.'''

    def __init__(self, baseuri, baselang, encoding):
        sgmllib.SGMLParser.__init__(self)
        _FeedParserMixin.__init__(self, baseuri, baselang, encoding)

    def decodeEntities(self, element, data):
        # normalize numeric character references to their named equivalents
        data = data.replace('&#60;', '&lt;')
        data = data.replace('&#x3c;', '&lt;')
        data = data.replace('&#62;', '&gt;')
        data = data.replace('&#x3e;', '&gt;')
        data = data.replace('&#38;', '&amp;')
        data = data.replace('&#x26;', '&amp;')
        data = data.replace('&#34;', '&quot;')
        data = data.replace('&#x22;', '&quot;')
        data = data.replace('&#39;', '&apos;')
        data = data.replace('&#x27;', '&apos;')
        # non-XML content types get fully decoded
        if self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
            data = data.replace('&lt;', '<')
            data = data.replace('&gt;', '>')
            data = data.replace('&amp;', '&')
            data = data.replace('&quot;', '"')
            data = data.replace('&apos;', "'")
        return data
1756
class _RelativeURIResolver(_BaseHTMLProcessor):
    '''Rewrites every URI-bearing attribute relative to a base URI.'''

    # (tag, attribute) pairs whose values are URIs per HTML 4
    relative_uris = [('a', 'href'),
                     ('applet', 'codebase'),
                     ('area', 'href'),
                     ('blockquote', 'cite'),
                     ('body', 'background'),
                     ('del', 'cite'),
                     ('form', 'action'),
                     ('frame', 'longdesc'),
                     ('frame', 'src'),
                     ('iframe', 'longdesc'),
                     ('iframe', 'src'),
                     ('head', 'profile'),
                     ('img', 'longdesc'),
                     ('img', 'src'),
                     ('img', 'usemap'),
                     ('input', 'src'),
                     ('input', 'usemap'),
                     ('ins', 'cite'),
                     ('link', 'href'),
                     ('object', 'classid'),
                     ('object', 'codebase'),
                     ('object', 'data'),
                     ('object', 'usemap'),
                     ('q', 'cite'),
                     ('script', 'src')]

    def __init__(self, baseuri, encoding):
        _BaseHTMLProcessor.__init__(self, encoding)
        self.baseuri = baseuri

    def resolveURI(self, uri):
        return _urljoin(self.baseuri, uri)

    def unknown_starttag(self, tag, attrs):
        attrs = self.normalize_attrs(attrs)
        attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs]
        _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)
1795
def _resolveRelativeURIs(htmlSource, baseURI, encoding):
    '''Resolve all relative URIs in htmlSource against baseURI.'''
    if _debug: sys.stderr.write('entering _resolveRelativeURIs\n')
    p = _RelativeURIResolver(baseURI, encoding)
    p.feed(htmlSource)
    return p.output()
1801
class _HTMLSanitizer(_BaseHTMLProcessor):
    '''Strips markup down to a whitelist of safe elements and attributes;
    the text inside script/applet is dropped entirely.'''

    acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', 'big',
      'blockquote', 'br', 'button', 'caption', 'center', 'cite', 'code', 'col',
      'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'fieldset',
      'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input',
      'ins', 'kbd', 'label', 'legend', 'li', 'map', 'menu', 'ol', 'optgroup',
      'option', 'p', 'pre', 'q', 's', 'samp', 'select', 'small', 'span', 'strike',
      'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'tfoot', 'th',
      'thead', 'tr', 'tt', 'u', 'ul', 'var']

    acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
      'action', 'align', 'alt', 'axis', 'border', 'cellpadding', 'cellspacing',
      'char', 'charoff', 'charset', 'checked', 'cite', 'class', 'clear', 'cols',
      'colspan', 'color', 'compact', 'coords', 'datetime', 'dir', 'disabled',
      'enctype', 'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace',
      'id', 'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method',
      'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly',
      'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
      'span', 'src', 'start', 'summary', 'tabindex', 'title', 'type',
      'usemap', 'valign', 'value', 'vspace', 'width']

    unacceptable_elements_with_end_tag = ['script', 'applet']

    def reset(self):
        _BaseHTMLProcessor.reset(self)
        # depth inside script/applet; their text is suppressed
        self.unacceptablestack = 0

    def unknown_starttag(self, tag, attrs):
        if not tag in self.acceptable_elements:
            if tag in self.unacceptable_elements_with_end_tag:
                self.unacceptablestack += 1
            return
        attrs = self.normalize_attrs(attrs)
        attrs = [(key, value) for key, value in attrs if key in self.acceptable_attributes]
        _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)

    def unknown_endtag(self, tag):
        if not tag in self.acceptable_elements:
            if tag in self.unacceptable_elements_with_end_tag:
                self.unacceptablestack -= 1
            return
        _BaseHTMLProcessor.unknown_endtag(self, tag)

    def handle_pi(self, text):
        # processing instructions are dropped
        pass

    def handle_decl(self, text):
        # declarations are dropped
        pass

    def handle_data(self, text):
        if not self.unacceptablestack:
            _BaseHTMLProcessor.handle_data(self, text)
1854
def sanitizeHTML(htmlSource, encoding):
    '''Sanitize htmlSource with the whitelist sanitizer, then optionally
    clean it up with an installed Tidy implementation.'''
    p = _HTMLSanitizer(encoding)
    p.feed(htmlSource)
    data = p.output()
    if TIDY_MARKUP:
        # loop through list of preferred Tidy interfaces looking for one that's installed,
        # then set up a common _tidy function to wrap the interface-specific API.
        _tidy = None
        for tidy_interface in PREFERRED_TIDY_INTERFACES:
            try:
                if tidy_interface == "uTidy":
                    from tidy import parseString as _utidy
                    def _tidy(data, **kwargs):
                        return str(_utidy(data, **kwargs))
                    break
                elif tidy_interface == "mxTidy":
                    from mx.Tidy import Tidy as _mxtidy
                    def _tidy(data, **kwargs):
                        nerrors, nwarnings, data, errordata = _mxtidy.tidy(data, **kwargs)
                        return data
                    break
            except (SystemExit, KeyboardInterrupt):
                raise
            except:
                pass
        if _tidy:
            utf8 = type(data) == type(u'')
            if utf8:
                data = data.encode('utf-8')
            data = _tidy(data, output_xhtml=1, numeric_entities=1, wrap=0, char_encoding="utf8")
            if utf8:
                data = unicode(data, 'utf-8')
            # Tidy returns a full document; keep only the body contents
            if data.count('<body'):
                data = data.split('<body', 1)[1]
                if data.count('>'):
                    data = data.split('>', 1)[1]
            if data.count('</body'):
                data = data.split('</body', 1)[0]
    data = data.strip().replace('\r\n', '\n')
    return data
1895
class _FeedURLHandler(urllib2.HTTPDigestAuthHandler, urllib2.HTTPRedirectHandler, urllib2.HTTPDefaultErrorHandler):
    '''urllib2 handler that records HTTP status codes on the returned
    stream and upgrades basic auth to digest auth on 401.'''

    def http_error_default(self, req, fp, code, msg, headers):
        if ((code / 100) == 3) and (code != 304):
            return self.http_error_302(req, fp, code, msg, headers)
        infourl = urllib.addinfourl(fp, headers, req.get_full_url())
        infourl.status = code
        return infourl

    def http_error_302(self, req, fp, code, msg, headers):
        if headers.dict.has_key('location'):
            infourl = urllib2.HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers)
        else:
            infourl = urllib.addinfourl(fp, headers, req.get_full_url())
        if not hasattr(infourl, 'status'):
            infourl.status = code
        return infourl

    def http_error_301(self, req, fp, code, msg, headers):
        if headers.dict.has_key('location'):
            infourl = urllib2.HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, headers)
        else:
            infourl = urllib.addinfourl(fp, headers, req.get_full_url())
        if not hasattr(infourl, 'status'):
            infourl.status = code
        return infourl

    http_error_300 = http_error_302
    http_error_303 = http_error_302
    http_error_307 = http_error_302

    def http_error_401(self, req, fp, code, msg, headers):
        # Check if
        # - server requires digest auth, AND
        # - we tried (unsuccessfully) with basic auth, AND
        # - we're using Python 2.3.3 or later (digest auth is irreparably broken in earlier versions)
        # If all conditions hold, parse authentication information
        # out of the Authorization header we sent the first time
        # (for the username and password) and the WWW-Authenticate
        # header the server sent back (for the realm) and retry
        # the request with the appropriate digest auth headers instead.
        # This evil genius hack has been brought to you by Aaron Swartz.
        host = urlparse.urlparse(req.get_full_url())[1]
        try:
            assert sys.version.split()[0] >= '2.3.3'
            assert base64 != None
            user, passw = base64.decodestring(req.headers['Authorization'].split(' ')[1]).split(':')
            realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0]
            self.add_password(realm, host, user, passw)
            retry = self.http_error_auth_reqed('www-authenticate', host, req, headers)
            self.reset_retry_count()
            return retry
        except (SystemExit, KeyboardInterrupt):
            raise
        except:
            return self.http_error_default(req, fp, code, msg, headers)
1951
def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers):
    """URL, filename, or string --> stream

    This function lets you define parsers that take any input source
    (URL, pathname to local or network file, or actual data as a string)
    and deal with it in a uniform manner.  Returned object is guaranteed
    to have all the basic stdio read methods (read, readline, readlines).
    Just .close() the object when you're done with it.

    If the etag argument is supplied, it will be used as the value of an
    If-None-Match request header.

    If the modified argument is supplied, it must be a tuple of 9 integers
    as returned by gmtime() in the standard Python time module. This MUST
    be in GMT (Greenwich Mean Time). The formatted date/time will be used
    as the value of an If-Modified-Since request header.

    If the agent argument is supplied, it will be used as the value of a
    User-Agent request header.

    If the referrer argument is supplied, it will be used as the value of a
    Referer[sic] request header.

    If handlers is supplied, it is a list of handlers used to build a
    urllib2 opener.
    """

    if hasattr(url_file_stream_or_string, 'read'):
        return url_file_stream_or_string

    if url_file_stream_or_string == '-':
        return sys.stdin

    if urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp'):
        if not agent:
            agent = USER_AGENT
        # test for inline user:password for basic auth
        auth = None
        if base64:
            urltype, rest = urllib.splittype(url_file_stream_or_string)
            realhost, rest = urllib.splithost(rest)
            if realhost:
                user_passwd, realhost = urllib.splituser(realhost)
                if user_passwd:
                    url_file_stream_or_string = '%s://%s%s' % (urltype, realhost, rest)
                    auth = base64.encodestring(user_passwd).strip()
        # try to open with urllib2 (to use optional headers)
        request = urllib2.Request(url_file_stream_or_string)
        request.add_header('User-Agent', agent)
        if etag:
            request.add_header('If-None-Match', etag)
        if modified:
            # format into an RFC 1123-compliant timestamp. We can't use
            # time.strftime() since the %a and %b directives can be affected
            # by the current locale, but RFC 2616 states that dates must be
            # in English.
            short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
            months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
            request.add_header('If-Modified-Since', '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5]))
        if referrer:
            request.add_header('Referer', referrer)
        if gzip and zlib:
            request.add_header('Accept-encoding', 'gzip, deflate')
        elif gzip:
            request.add_header('Accept-encoding', 'gzip')
        elif zlib:
            request.add_header('Accept-encoding', 'deflate')
        else:
            request.add_header('Accept-encoding', '')
        if auth:
            request.add_header('Authorization', 'Basic %s' % auth)
        if ACCEPT_HEADER:
            request.add_header('Accept', ACCEPT_HEADER)
        request.add_header('A-IM', 'feed') # RFC 3229 support
        opener = apply(urllib2.build_opener, tuple([_FeedURLHandler()] + handlers))
        opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent
        try:
            return opener.open(request)
        finally:
            opener.close() # JohnD

    # try to open with native open function (if url_file_stream_or_string is a filename)
    try:
        return open(url_file_stream_or_string)
    except (SystemExit, KeyboardInterrupt):
        raise
    except:
        pass

    # treat url_file_stream_or_string as string
    return _StringIO(str(url_file_stream_or_string))
2044
# list of registered date-parsing handler functions; _parse_date tries
# them in order (most recently registered first)
_date_handlers = []
def registerDateHandler(func):
    '''Register a date handler function (takes string, returns 9-tuple date in GMT)'''
    # insert at the front so user-registered handlers take precedence
    # over the built-in ones registered at import time
    _date_handlers.insert(0, func)
# ISO-8601 date parsing routines written by Fazal Majid.
# The ISO 8601 standard is very convoluted and irregular - a full ISO 8601
# parser is beyond the scope of feedparser and would be a worthwhile addition
# to the Python library.
# A single regular expression cannot parse ISO 8601 date formats into groups
# as the standard is highly irregular (for instance is 030104 2003-01-04 or
# 0301-04-01), so we use templates instead.
# Please note the order in templates is significant because we need a
# greedy match.
_iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-MM', 'YYYY-?OOO',
                'YY-?MM-?DD', 'YY-?OOO', 'YYYY',
                '-YY-?MM', '-OOO', '-YY',
                '--MM-?DD', '--MM',
                '---DD',
                'CC', '']
# expand each template into a real regex: date part, then an optional
# time part (hour:minute[:second]) with an optional time zone designator
_iso8601_re = [
    tmpl.replace(
    'YYYY', r'(?P<year>\d{4})').replace(
    'YY', r'(?P<year>\d\d)').replace(
    'MM', r'(?P<month>[01]\d)').replace(
    'DD', r'(?P<day>[0123]\d)').replace(
    'OOO', r'(?P<ordinal>[0123]\d\d)').replace(
    'CC', r'(?P<century>\d\d$)')
    + r'(T?(?P<hour>\d{2}):(?P<minute>\d{2})'
    + r'(:(?P<second>\d{2}))?'
    + r'(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?'
    for tmpl in _iso8601_tmpl]
_iso8601_matches = [re.compile(regex).match for regex in _iso8601_re]
def _parse_date_iso8601(dateString):
    '''Parse a variety of ISO-8601-compatible formats like 20040105'''
    m = None
    for _iso8601_match in _iso8601_matches:
        m = _iso8601_match(dateString)
        if m: break
    if not m: return
    if m.span() == (0, 0): return
    params = m.groupdict()
    ordinal = params.get('ordinal', 0)
    if ordinal:
        ordinal = int(ordinal)
    else:
        ordinal = 0
    year = params.get('year', '--')
    if not year or year == '--':
        year = time.gmtime()[0]
    elif len(year) == 2:
        # ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993
        year = 100 * int(time.gmtime()[0] / 100) + int(year)
    else:
        year = int(year)
    month = params.get('month', '-')
    if not month or month == '-':
        # ordinals are NOT normalized by mktime, we simulate them
        # by setting month=1, day=ordinal
        if ordinal:
            month = 1
        else:
            month = time.gmtime()[1]
    month = int(month)
    day = params.get('day', 0)
    if not day:
        # see above
        if ordinal:
            day = ordinal
        elif params.get('century', 0) or \
            params.get('year', 0) or params.get('month', 0):
            day = 1
        else:
            day = time.gmtime()[2]
    else:
        day = int(day)
    # special case of the century - is the first year of the 21st century
    # 2000 or 2001 ? The debate goes on...
    if 'century' in params.keys():
        year = (int(params['century']) - 1) * 100 + 1
    # in ISO 8601 most fields are optional
    for field in ['hour', 'minute', 'second', 'tzhour', 'tzmin']:
        if not params.get(field, None):
            params[field] = 0
    hour = int(params.get('hour', 0))
    minute = int(params.get('minute', 0))
    second = int(params.get('second', 0))
    # weekday is normalized by mktime(), we can ignore it
    weekday = 0
    # daylight savings is complex, but not needed for feedparser's purposes
    # as time zones, if specified, include mention of whether it is active
    # (e.g. PST vs. PDT, CET). Using -1 is implementation-dependent and
    # and most implementations have DST bugs
    daylight_savings_flag = 0
    tm = [year, month, day, hour, minute, second, weekday,
          ordinal, daylight_savings_flag]
    # ISO 8601 time zone adjustments
    tz = params.get('tz')
    if tz and tz != 'Z':
        if tz[0] == '-':
            tm[3] += int(params.get('tzhour', 0))
            tm[4] += int(params.get('tzmin', 0))
        elif tz[0] == '+':
            tm[3] -= int(params.get('tzhour', 0))
            tm[4] -= int(params.get('tzmin', 0))
        else:
            return None
    # Python's time.mktime() is a wrapper around the ANSI C mktime(3c)
    # which is guaranteed to normalize d/m/y/h/m/s.
    # Many implementations have bugs, but we'll pretend they don't.
    return time.localtime(time.mktime(tm))
registerDateHandler(_parse_date_iso8601)
# 8-bit date handling routines written by ytrewq1.
_korean_year  = u'\ub144' # b3e2 in euc-kr
_korean_month = u'\uc6d4' # bff9 in euc-kr
_korean_day   = u'\uc77c' # c0cf in euc-kr
_korean_am    = u'\uc624\uc804' # bfc0 c0fc in euc-kr
_korean_pm    = u'\uc624\ud6c4' # bfc0 c8c4 in euc-kr

# OnBlog style: "YYYY[year] MM[month] DD[day] HH:MM:SS" with Korean unit suffixes
_korean_onblog_date_re = \
    re.compile('(\d{4})%s\s+(\d{2})%s\s+(\d{2})%s\s+(\d{2}):(\d{2}):(\d{2})' % \
               (_korean_year, _korean_month, _korean_day))
# Nate style: "YYYY-MM-DD [AM|PM] H:M:S" with Korean AM/PM markers
_korean_nate_date_re = \
    re.compile(u'(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})' % \
               (_korean_am, _korean_pm))
def _parse_date_onblog(dateString):
    '''Parse a string according to the OnBlog 8-bit date format'''
    m = _korean_onblog_date_re.match(dateString)
    if not m: return
    # OnBlog dates are Korean local time; rebuild as W3DTF with a fixed
    # +09:00 offset and delegate to the W3DTF parser
    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
                {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
                 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\
                 'zonediff': '+09:00'}
    if _debug: sys.stderr.write('OnBlog date parsed as: %s\n' % w3dtfdate)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_onblog)
def _parse_date_nate(dateString):
    '''Parse a string according to the Nate 8-bit date format'''
    m = _korean_nate_date_re.match(dateString)
    if not m: return
    hour = int(m.group(5))
    ampm = m.group(4)
    # convert 12-hour clock with Korean AM/PM marker to 24-hour clock
    if (ampm == _korean_pm):
        hour += 12
    hour = str(hour)
    if len(hour) == 1:
        hour = '0' + hour
    # Nate dates are Korean local time; rebuild as W3DTF with +09:00
    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
                {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
                 'hour': hour, 'minute': m.group(6), 'second': m.group(7),\
                 'zonediff': '+09:00'}
    if _debug: sys.stderr.write('Nate date parsed as: %s\n' % w3dtfdate)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_nate)
_mssql_date_re = \
    re.compile('(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})(\.\d+)?')
def _parse_date_mssql(dateString):
    '''Parse a string according to the MS SQL date format'''
    m = _mssql_date_re.match(dateString)
    if not m: return
    # NOTE(review): assumes Korean local time (+09:00), matching the other
    # 8-bit Korean handlers above
    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
                {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
                 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\
                 'zonediff': '+09:00'}
    if _debug: sys.stderr.write('MS SQL date parsed as: %s\n' % w3dtfdate)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_mssql)
# Unicode strings for Greek date strings
_greek_months = \
  { \
   u'\u0399\u03b1\u03bd': u'Jan',       # c9e1ed in iso-8859-7
   u'\u03a6\u03b5\u03b2': u'Feb',       # d6e5e2 in iso-8859-7
   u'\u039c\u03ac\u03ce': u'Mar',       # ccdcfe in iso-8859-7
   u'\u039c\u03b1\u03ce': u'Mar',       # cce1fe in iso-8859-7
   u'\u0391\u03c0\u03c1': u'Apr',       # c1f0f1 in iso-8859-7
   u'\u039c\u03ac\u03b9': u'May',       # ccdce9 in iso-8859-7
   u'\u039c\u03b1\u03ca': u'May',       # cce1fa in iso-8859-7
   u'\u039c\u03b1\u03b9': u'May',       # cce1e9 in iso-8859-7
   u'\u0399\u03bf\u03cd\u03bd': u'Jun', # c9effded in iso-8859-7
   u'\u0399\u03bf\u03bd': u'Jun',       # c9efed in iso-8859-7
   u'\u0399\u03bf\u03cd\u03bb': u'Jul', # c9effdeb in iso-8859-7
   u'\u0399\u03bf\u03bb': u'Jul',       # c9f9eb in iso-8859-7
   u'\u0391\u03cd\u03b3': u'Aug',       # c1fde3 in iso-8859-7
   u'\u0391\u03c5\u03b3': u'Aug',       # c1f5e3 in iso-8859-7
   u'\u03a3\u03b5\u03c0': u'Sep',       # d3e5f0 in iso-8859-7
   u'\u039f\u03ba\u03c4': u'Oct',       # cfeaf4 in iso-8859-7
   u'\u039d\u03bf\u03ad': u'Nov',       # cdefdd in iso-8859-7
   u'\u039d\u03bf\u03b5': u'Nov',       # cdefe5 in iso-8859-7
   u'\u0394\u03b5\u03ba': u'Dec',       # c4e5ea in iso-8859-7
  }

_greek_wdays = \
  { \
   u'\u039a\u03c5\u03c1': u'Sun', # caf5f1 in iso-8859-7
   u'\u0394\u03b5\u03c5': u'Mon', # c4e5f5 in iso-8859-7
   u'\u03a4\u03c1\u03b9': u'Tue', # d4f1e9 in iso-8859-7
   u'\u03a4\u03b5\u03c4': u'Wed', # d4e5f4 in iso-8859-7
   u'\u03a0\u03b5\u03bc': u'Thu', # d0e5ec in iso-8859-7
   u'\u03a0\u03b1\u03c1': u'Fri', # d0e1f1 in iso-8859-7
   u'\u03a3\u03b1\u03b2': u'Sat', # d3e1e2 in iso-8859-7
  }

# RFC822-shaped date with Greek weekday/month names
_greek_date_format_re = \
    re.compile(u'([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)')
def _parse_date_greek(dateString):
    '''Parse a string according to a Greek 8-bit date format.'''
    m = _greek_date_format_re.match(dateString)
    if not m: return
    try:
        wday = _greek_wdays[m.group(1)]
        month = _greek_months[m.group(3)]
    except (SystemExit, KeyboardInterrupt):
        raise
    except:
        # unknown weekday or month name; not a Greek date after all
        return
    # translate to English names and delegate to the RFC822 parser
    rfc822date = '%(wday)s, %(day)s %(month)s %(year)s %(hour)s:%(minute)s:%(second)s %(zonediff)s' % \
                 {'wday': wday, 'day': m.group(2), 'month': month, 'year': m.group(4),\
                  'hour': m.group(5), 'minute': m.group(6), 'second': m.group(7),\
                  'zonediff': m.group(8)}
    if _debug: sys.stderr.write('Greek date parsed as: %s\n' % rfc822date)
    return _parse_date_rfc822(rfc822date)
registerDateHandler(_parse_date_greek)
# Unicode strings for Hungarian date strings
_hungarian_months = \
  { \
    u'janu\u00e1r':   u'01',  # e1 in iso-8859-2
    u'febru\u00e1ri': u'02',  # e1 in iso-8859-2
    u'm\u00e1rcius':  u'03',  # e1 in iso-8859-2
    u'\u00e1prilis':  u'04',  # e1 in iso-8859-2
    u'm\u00e1ujus':   u'05',  # e1 in iso-8859-2
    u'j\u00fanius':   u'06',  # fa in iso-8859-2
    u'j\u00falius':   u'07',  # fa in iso-8859-2
    u'augusztus':     u'08',
    u'szeptember':    u'09',
    u'okt\u00f3ber':  u'10',  # f3 in iso-8859-2
    u'november':      u'11',
    u'december':      u'12',
  }

# W3DTF-shaped date with a Hungarian month name in the middle
_hungarian_date_format_re = \
  re.compile(u'(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})((\+|-)(\d{,2}:\d{2}))')
def _parse_date_hungarian(dateString):
    '''Parse a string according to a Hungarian 8-bit date format.'''
    m = _hungarian_date_format_re.match(dateString)
    if not m: return
    try:
        month = _hungarian_months[m.group(2)]
        # zero-pad single-digit day and hour so the W3DTF parser accepts them
        day = m.group(3)
        if len(day) == 1:
            day = '0' + day
        hour = m.group(4)
        if len(hour) == 1:
            hour = '0' + hour
    except (SystemExit, KeyboardInterrupt):
        raise
    except:
        # unknown month name; not a Hungarian date after all
        return
    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s%(zonediff)s' % \
                {'year': m.group(1), 'month': month, 'day': day,\
                 'hour': hour, 'minute': m.group(5),\
                 'zonediff': m.group(6)}
    if _debug: sys.stderr.write('Hungarian date parsed as: %s\n' % w3dtfdate)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_hungarian)
# W3DTF-style date parsing adapted from PyXML xml.utils.iso8601, written by
# Drake and licensed under the Python license.  Removed all range checking
# for month, day, hour, minute, and second, since mktime will normalize
# these later
def _parse_date_w3dtf(dateString):
    def __extract_date(m):
        year = int(m.group('year'))
        if year < 100:
            # two-digit year: assume current century
            year = 100 * int(time.gmtime()[0] / 100) + int(year)
        if year < 1000:
            return 0, 0, 0
        julian = m.group('julian')
        if julian:
            # ordinal (julian) date: approximate month/day, then iterate
            # until gmtime reports the requested day-of-year
            julian = int(julian)
            month = julian / 30 + 1
            day = julian % 30 + 1
            jday = None
            while jday != julian:
                t = time.mktime((year, month, day, 0, 0, 0, 0, 0, 0))
                jday = time.gmtime(t)[-2]
                diff = abs(jday - julian)
                if jday > julian:
                    if diff < day:
                        day = day - diff
                    else:
                        month = month - 1
                        day = 31
                elif jday < julian:
                    if day + diff < 28:
                        day = day + diff
                    else:
                        month = month + 1
            return year, month, day
        month = m.group('month')
        day = 1
        if month is None:
            month = 1
        else:
            month = int(month)
            day = m.group('day')
            if day:
                day = int(day)
            else:
                day = 1
        return year, month, day

    def __extract_time(m):
        if not m:
            return 0, 0, 0
        hours = m.group('hours')
        if not hours:
            return 0, 0, 0
        hours = int(hours)
        minutes = int(m.group('minutes'))
        seconds = m.group('seconds')
        if seconds:
            seconds = int(seconds)
        else:
            seconds = 0
        return hours, minutes, seconds

    def __extract_tzd(m):
        '''Return the Time Zone Designator as an offset in seconds from UTC.'''
        if not m:
            return 0
        tzd = m.group('tzd')
        if not tzd:
            return 0
        if tzd == 'Z':
            return 0
        hours = int(m.group('tzdhours'))
        minutes = m.group('tzdminutes')
        if minutes:
            minutes = int(minutes)
        else:
            minutes = 0
        offset = (hours*60 + minutes) * 60
        if tzd[0] == '+':
            return -offset
        return offset

    __date_re = ('(?P<year>\d\d\d\d)'
                 '(?:(?P<dsep>-|)'
                 '(?:(?P<julian>\d\d\d)'
                 '|(?P<month>\d\d)(?:(?P=dsep)(?P<day>\d\d))?))?')
    __tzd_re = '(?P<tzd>[-+](?P<tzdhours>\d\d)(?::?(?P<tzdminutes>\d\d))|Z)'
    __tzd_rx = re.compile(__tzd_re)
    __time_re = ('(?P<hours>\d\d)(?P<tsep>:|)(?P<minutes>\d\d)'
                 '(?:(?P=tsep)(?P<seconds>\d\d(?:[.,]\d+)?))?'
                 + __tzd_re)
    __datetime_re = '%s(?:T%s)?' % (__date_re, __time_re)
    __datetime_rx = re.compile(__datetime_re)
    m = __datetime_rx.match(dateString)
    # the whole string must be consumed, not just a prefix
    if (m is None) or (m.group() != dateString): return
    gmt = __extract_date(m) + __extract_time(m) + (0, 0, 0)
    if gmt[0] == 0: return
    return time.gmtime(time.mktime(gmt) + __extract_tzd(m) - time.timezone)
registerDateHandler(_parse_date_w3dtf)
2416
def _parse_date_rfc822(dateString):
    '''Parse an RFC822, RFC1123, RFC2822, or asctime-style date'''
    data = dateString.split()
    # drop a leading weekday name ("Sun," / "Sun." / "sun")
    if data[0][-1] in (',', '.') or data[0].lower() in rfc822._daynames:
        del data[0]
    if len(data) == 4:
        # time and zone glued together with '+': split them apart
        s = data[3]
        i = s.find('+')
        if i > 0:
            data[3:] = [s[:i], s[i+1:]]
        else:
            data.append('')
        dateString = " ".join(data)
    if len(data) < 5:
        # date only; assume midnight GMT
        dateString += ' 00:00:00 GMT'
    tm = rfc822.parsedate_tz(dateString)
    if tm:
        return time.gmtime(rfc822.mktime_tz(tm))
# rfc822.py defines several time zones, but we define some extra ones.
# 'ET' is equivalent to 'EST', etc.
_additional_timezones = {'AT': -400, 'ET': -500, 'CT': -600, 'MT': -700, 'PT': -800}
rfc822._timezones.update(_additional_timezones)
registerDateHandler(_parse_date_rfc822)
def _parse_date(dateString):
2441
'''Parses a variety of date formats into a 9-tuple in GMT'''
2442
for handler in _date_handlers:
2444
date9tuple = handler(dateString)
2445
if not date9tuple: continue
2446
if len(date9tuple) != 9:
2447
if _debug: sys.stderr.write('date handler function must return 9-tuple\n')
2449
map(int, date9tuple)
2451
except Exception, e:
2452
if _debug: sys.stderr.write('%s raised %s\n' % (handler.__name__, repr(e)))
2456
def _getCharacterEncoding(http_headers, xml_data):
    '''Get the character encoding of the XML document

    http_headers is a dictionary
    xml_data is a raw string (not Unicode)

    This is so much trickier than it sounds, it's not even funny.
    According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type
    is application/xml, application/*+xml,
    application/xml-external-parsed-entity, or application/xml-dtd,
    the encoding given in the charset parameter of the HTTP Content-Type
    takes precedence over the encoding given in the XML prefix within the
    document, and defaults to 'utf-8' if neither are specified.  But, if
    the HTTP Content-Type is text/xml, text/*+xml, or
    text/xml-external-parsed-entity, the encoding given in the XML prefix
    within the document is ALWAYS IGNORED and only the encoding given in
    the charset parameter of the HTTP Content-Type header should be
    respected, and it defaults to 'us-ascii' if not specified.

    Furthermore, discussion on the atom-syntax mailing list with the
    author of RFC 3023 leads me to the conclusion that any document
    served with a Content-Type of text/* and no charset parameter
    must be treated as us-ascii.  (We now do this.)  And also that it
    must always be flagged as non-well-formed.  (We now do this too.)

    If Content-Type is unspecified (input was local file or non-HTTP source)
    or unrecognized (server just got it totally wrong), then go by the
    encoding given in the XML prefix of the document and default to
    'iso-8859-1' as per the HTTP specification (RFC 2616).

    Then, assuming we didn't find a character encoding in the HTTP headers
    (and the HTTP Content-type allowed us to look in the body), we need
    to sniff the first few bytes of the XML data and try to determine
    whether the encoding is ASCII-compatible.  Section F of the XML
    specification shows the way here:
    http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info

    If the sniffed encoding is not ASCII-compatible, we need to make it
    ASCII compatible so that we can sniff further into the XML declaration
    to find the encoding attribute, which will tell us the true encoding.

    Of course, none of this guarantees that we will be able to parse the
    feed in the declared character encoding (assuming it was declared
    correctly, which many are not).  CJKCodecs and iconv_codec help a lot;
    you should definitely install them if you can.
    http://cjkpython.i18n.org/
    '''

    def _parseHTTPContentType(content_type):
        '''takes HTTP Content-Type header and returns (content type, charset)

        If no charset is specified, returns (content type, '')
        If no content type is specified, returns ('', '')
        Both return parameters are guaranteed to be lowercase strings
        '''
        content_type = content_type or ''
        content_type, params = cgi.parse_header(content_type)
        return content_type, params.get('charset', '').replace("'", '')

    sniffed_xml_encoding = ''
    xml_encoding = ''
    true_encoding = ''
    http_content_type, http_encoding = _parseHTTPContentType(http_headers.get('content-type'))
    # Must sniff for non-ASCII-compatible character encodings before
    # searching for XML declaration.  This heuristic is defined in
    # section F of the XML specification:
    # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
    try:
        if xml_data[:4] == '\x4c\x6f\xa7\x94':
            # EBCDIC
            xml_data = _ebcdic_to_ascii(xml_data)
        elif xml_data[:4] == '\x00\x3c\x00\x3f':
            # UTF-16BE
            sniffed_xml_encoding = 'utf-16be'
            xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
        elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') and (xml_data[2:4] != '\x00\x00'):
            # UTF-16BE with BOM
            sniffed_xml_encoding = 'utf-16be'
            xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
        elif xml_data[:4] == '\x3c\x00\x3f\x00':
            # UTF-16LE
            sniffed_xml_encoding = 'utf-16le'
            xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
        elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and (xml_data[2:4] != '\x00\x00'):
            # UTF-16LE with BOM
            sniffed_xml_encoding = 'utf-16le'
            xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
        elif xml_data[:4] == '\x00\x00\x00\x3c':
            # UTF-32BE
            sniffed_xml_encoding = 'utf-32be'
            xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
        elif xml_data[:4] == '\x3c\x00\x00\x00':
            # UTF-32LE
            sniffed_xml_encoding = 'utf-32le'
            xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
        elif xml_data[:4] == '\x00\x00\xfe\xff':
            # UTF-32BE with BOM
            sniffed_xml_encoding = 'utf-32be'
            xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
        elif xml_data[:4] == '\xff\xfe\x00\x00':
            # UTF-32LE with BOM
            sniffed_xml_encoding = 'utf-32le'
            xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
        elif xml_data[:3] == '\xef\xbb\xbf':
            # UTF-8 with BOM
            sniffed_xml_encoding = 'utf-8'
            xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
        else:
            # ASCII-compatible
            pass
        xml_encoding_match = re.compile('^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
    except (SystemExit, KeyboardInterrupt):
        raise
    except:
        xml_encoding_match = None
    if xml_encoding_match:
        xml_encoding = xml_encoding_match.groups()[0].lower()
        # a declared multi-byte family name is overridden by the concrete
        # byte order we actually sniffed
        if sniffed_xml_encoding and (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16')):
            xml_encoding = sniffed_xml_encoding
    acceptable_content_type = 0
    application_content_types = ('application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity')
    text_content_types = ('text/xml', 'text/xml-external-parsed-entity')
    if (http_content_type in application_content_types) or \
       (http_content_type.startswith('application/') and http_content_type.endswith('+xml')):
        acceptable_content_type = 1
        true_encoding = http_encoding or xml_encoding or 'utf-8'
    # NOTE(review): operator precedence makes this condition
    # "(in text_content_types) or (startswith('text/') and endswith('+xml'))";
    # the parenthesization suggests the author may have intended otherwise
    elif (http_content_type in text_content_types) or \
         (http_content_type.startswith('text/')) and http_content_type.endswith('+xml'):
        acceptable_content_type = 1
        true_encoding = http_encoding or 'us-ascii'
    elif http_content_type.startswith('text/'):
        true_encoding = http_encoding or 'us-ascii'
    elif http_headers and (not http_headers.has_key('content-type')):
        true_encoding = xml_encoding or 'iso-8859-1'
    else:
        true_encoding = xml_encoding or 'utf-8'
    return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type
def _toUTF8(data, encoding):
    '''Changes an XML data stream on the fly to specify a new encoding

    data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already
    encoding is a string recognized by encodings.aliases
    '''
    if _debug: sys.stderr.write('entering _toUTF8, trying encoding %s\n' % encoding)
    # strip Byte Order Mark (if present)
    if (len(data) >= 4) and (data[:2] == '\xfe\xff') and (data[2:4] != '\x00\x00'):
        if _debug:
            sys.stderr.write('stripping BOM\n')
            if encoding != 'utf-16be':
                sys.stderr.write('trying utf-16be instead\n')
        encoding = 'utf-16be'
        data = data[2:]
    elif (len(data) >= 4) and (data[:2] == '\xff\xfe') and (data[2:4] != '\x00\x00'):
        if _debug:
            sys.stderr.write('stripping BOM\n')
            if encoding != 'utf-16le':
                sys.stderr.write('trying utf-16le instead\n')
        encoding = 'utf-16le'
        data = data[2:]
    elif data[:3] == '\xef\xbb\xbf':
        if _debug:
            sys.stderr.write('stripping BOM\n')
            if encoding != 'utf-8':
                sys.stderr.write('trying utf-8 instead\n')
        encoding = 'utf-8'
        data = data[3:]
    elif data[:4] == '\x00\x00\xfe\xff':
        if _debug:
            sys.stderr.write('stripping BOM\n')
            if encoding != 'utf-32be':
                sys.stderr.write('trying utf-32be instead\n')
        encoding = 'utf-32be'
        data = data[4:]
    elif data[:4] == '\xff\xfe\x00\x00':
        if _debug:
            sys.stderr.write('stripping BOM\n')
            if encoding != 'utf-32le':
                sys.stderr.write('trying utf-32le instead\n')
        encoding = 'utf-32le'
        data = data[4:]
    # decode with the (possibly BOM-corrected) encoding; raises if wrong
    newdata = unicode(data, encoding)
    if _debug: sys.stderr.write('successfully converted %s data to unicode\n' % encoding)
    # rewrite (or insert) the XML declaration to say utf-8
    declmatch = re.compile('^<\?xml[^>]*?>')
    newdecl = '''<?xml version='1.0' encoding='utf-8'?>'''
    if declmatch.search(newdata):
        newdata = declmatch.sub(newdecl, newdata)
    else:
        newdata = newdecl + u'\n' + newdata
    return newdata.encode('utf-8')
def _stripDoctype(data):
    '''Strips DOCTYPE from XML document, returns (rss_version, stripped_data)

    rss_version may be 'rss091n' or None
    stripped_data is the same XML document, minus the DOCTYPE
    '''
    # remove inline ENTITY declarations first (they can hide in the DOCTYPE)
    entity_pattern = re.compile(r'<!ENTITY([^>]*?)>', re.MULTILINE)
    data = entity_pattern.sub('', data)
    doctype_pattern = re.compile(r'<!DOCTYPE([^>]*?)>', re.MULTILINE)
    doctype_results = doctype_pattern.findall(data)
    doctype = doctype_results and doctype_results[0] or ''
    # Netscape's RSS 0.91 DTD is the only version we can identify by DOCTYPE
    if doctype.lower().count('netscape'):
        version = 'rss091n'
    else:
        version = None
    data = doctype_pattern.sub('', data)
    return version, data
def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=[]):
2666
'''Parse a feed from a URL, file, stream, or string'''
2667
result = FeedParserDict()
2668
result['feed'] = FeedParserDict()
2669
result['entries'] = []
2672
if type(handlers) == types.InstanceType:
2673
handlers = [handlers]
2675
f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers)
2677
except Exception, e:
2679
result['bozo_exception'] = e
2683
# if feed is gzip-compressed, decompress it
2684
if f and data and hasattr(f, 'headers'):
2685
if gzip and f.headers.get('content-encoding', '') == 'gzip':
2687
data = gzip.GzipFile(fileobj=_StringIO(data)).read()
2688
except Exception, e:
2689
# Some feeds claim to be gzipped but they're not, so
2690
# we get garbage. Ideally, we should re-request the
2691
# feed without the 'Accept-encoding: gzip' header,
2694
result['bozo_exception'] = e
2696
elif zlib and f.headers.get('content-encoding', '') == 'deflate':
2698
data = zlib.decompress(data, -zlib.MAX_WBITS)
2699
except Exception, e:
2701
result['bozo_exception'] = e
2705
if hasattr(f, 'info'):
2707
result['etag'] = info.getheader('ETag')
2708
last_modified = info.getheader('Last-Modified')
2710
result['modified'] = _parse_date(last_modified)
2711
if hasattr(f, 'url'):
2712
result['href'] = f.url
2713
result['status'] = 200
2714
if hasattr(f, 'status'):
2715
result['status'] = f.status
2716
if hasattr(f, 'headers'):
2717
result['headers'] = f.headers.dict
2718
if hasattr(f, 'close'):
2721
# there are four encodings to keep track of:
2722
# - http_encoding is the encoding declared in the Content-Type HTTP header
2723
# - xml_encoding is the encoding declared in the <?xml declaration
2724
# - sniffed_encoding is the encoding sniffed from the first 4 bytes of the XML data
2725
# - result['encoding'] is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
2726
http_headers = result.get('headers', {})
2727
result['encoding'], http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type = \
2728
_getCharacterEncoding(http_headers, data)
2729
if http_headers and (not acceptable_content_type):
2730
if http_headers.has_key('content-type'):
2731
bozo_message = '%s is not an XML media type' % http_headers['content-type']
2733
bozo_message = 'no Content-type specified'
2735
result['bozo_exception'] = NonXMLContentType(bozo_message)
2737
result['version'], data = _stripDoctype(data)
2739
baseuri = http_headers.get('content-location', result.get('href'))
2740
baselang = http_headers.get('content-language', None)
2742
# if server sent 304, we're done
2743
if result.get('status', 0) == 304:
2744
result['version'] = ''
2745
result['debug_message'] = 'The feed has not changed since you last checked, ' + \
2746
'so the server sent no data. This is a feature, not a bug!'
2749
# if there was a problem downloading, we're done
2753
# determine character encoding
2754
use_strict_parser = 0
2756
tried_encodings = []
2757
# try: HTTP encoding, declared XML encoding, encoding sniffed from BOM
2758
for proposed_encoding in (result['encoding'], xml_encoding, sniffed_xml_encoding):
2759
if not proposed_encoding: continue
2760
if proposed_encoding in tried_encodings: continue
2761
tried_encodings.append(proposed_encoding)
2763
data = _toUTF8(data, proposed_encoding)
2764
known_encoding = use_strict_parser = 1
2766
except (SystemExit, KeyboardInterrupt):
2770
# if no luck and we have auto-detection library, try that
2771
if (not known_encoding) and chardet:
2773
proposed_encoding = chardet.detect(data)['encoding']
2774
if proposed_encoding and (proposed_encoding not in tried_encodings):
2775
tried_encodings.append(proposed_encoding)
2776
data = _toUTF8(data, proposed_encoding)
2777
known_encoding = use_strict_parser = 1
2778
except (SystemExit, KeyboardInterrupt):
2782
# if still no luck and we haven't tried utf-8 yet, try that
2783
if (not known_encoding) and ('utf-8' not in tried_encodings):
2785
proposed_encoding = 'utf-8'
2786
tried_encodings.append(proposed_encoding)
2787
data = _toUTF8(data, proposed_encoding)
2788
known_encoding = use_strict_parser = 1
2789
except (SystemExit, KeyboardInterrupt):
2793
# if still no luck and we haven't tried windows-1252 yet, try that
2794
if (not known_encoding) and ('windows-1252' not in tried_encodings):
2796
proposed_encoding = 'windows-1252'
2797
tried_encodings.append(proposed_encoding)
2798
data = _toUTF8(data, proposed_encoding)
2799
known_encoding = use_strict_parser = 1
2800
except (SystemExit, KeyboardInterrupt):
2804
# if still no luck, give up
2805
if not known_encoding:
2807
result['bozo_exception'] = CharacterEncodingUnknown( \
2808
'document encoding unknown, I tried ' + \
2809
'%s, %s, utf-8, and windows-1252 but nothing worked' % \
2810
(result['encoding'], xml_encoding))
2811
result['encoding'] = ''
2812
elif proposed_encoding != result['encoding']:
2814
result['bozo_exception'] = CharacterEncodingOverride( \
2815
'documented declared as %s, but parsed as %s' % \
2816
(result['encoding'], proposed_encoding))
2817
result['encoding'] = proposed_encoding
2819
if not _XML_AVAILABLE:
2820
use_strict_parser = 0
2821
if use_strict_parser:
2822
# initialize the SAX parser
2823
feedparser = _StrictFeedParser(baseuri, baselang, 'utf-8')
2824
saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS)
2825
saxparser.setFeature(xml.sax.handler.feature_namespaces, 1)
2826
saxparser.setContentHandler(feedparser)
2827
saxparser.setErrorHandler(feedparser)
2828
source = xml.sax.xmlreader.InputSource()
2829
source.setByteStream(_StringIO(data))
2830
if hasattr(saxparser, '_ns_stack'):
2831
# work around bug in built-in SAX parser (doesn't recognize xml: namespace)
2832
# PyXML doesn't have this problem, and it doesn't have _ns_stack either
2833
saxparser._ns_stack.append({'http://www.w3.org/XML/1998/namespace':'xml'})
2835
saxparser.parse(source)
2836
except Exception, e:
2839
traceback.print_stack()
2840
traceback.print_exc()
2841
sys.stderr.write('xml parsing failed\n')
2843
result['bozo_exception'] = feedparser.exc or e
2844
use_strict_parser = 0
2845
if not use_strict_parser:
2846
feedparser = _LooseFeedParser(baseuri, baselang, known_encoding and 'utf-8' or '')
2847
feedparser.feed(data)
2848
result['feed'] = feedparser.feeddata
2849
result['entries'] = feedparser.entries
2850
result['version'] = result['version'] or feedparser.version
2851
result['namespaces'] = feedparser.namespacesInUse
2854
if __name__ == '__main__':
2855
if not sys.argv[1:]:
2860
zopeCompatibilityHack()
2861
from pprint import pprint
2870
#1.0 - 9/27/2002 - MAP - fixed namespace processing on prefixed RSS 2.0 elements,
2871
# added Simon Fell's test suite
2872
#1.1 - 9/29/2002 - MAP - fixed infinite loop on incomplete CDATA sections
2874
# JD - use inchannel to watch out for image and textinput elements which can
2875
# also contain title, link, and description elements
2876
# JD - check for isPermaLink='false' attribute on guid elements
2877
# JD - replaced openAnything with open_resource supporting ETag and
2878
# If-Modified-Since request headers
2879
# JD - parse now accepts etag, modified, agent, and referrer optional
2881
# JD - modified parse to return a dictionary instead of a tuple so that any
2882
# etag or modified information can be returned and cached by the caller
2883
#2.0.1 - 10/21/2002 - MAP - changed parse() so that if we don't get anything
2884
# because of etag/modified, return the old etag/modified to the caller to
2885
# indicate why nothing is being returned
2886
#2.0.2 - 10/21/2002 - JB - added the inchannel to the if statement, otherwise it's
2887
# useless. Fixes the problem JD was addressing by adding it.
2888
#2.1 - 11/14/2002 - MAP - added gzip support
2889
#2.2 - 1/27/2003 - MAP - added attribute support, admin:generatorAgent.
2890
# start_admingeneratoragent is an example of how to handle elements with
2891
# only attributes, no content.
2892
#2.3 - 6/11/2003 - MAP - added USER_AGENT for default (if caller doesn't specify);
2893
# also, make sure we send the User-Agent even if urllib2 isn't available.
2894
# Match any variation of backend.userland.com/rss namespace.
2895
#2.3.1 - 6/12/2003 - MAP - if item has both link and guid, return both as-is.
2896
#2.4 - 7/9/2003 - MAP - added preliminary Pie/Atom/Echo support based on Sam Ruby's
2897
# snapshot of July 1 <http://www.intertwingly.net/blog/1506.html>; changed
2899
#2.5 - 7/25/2003 - MAP - changed to Python license (all contributors agree);
2900
# removed unnecessary urllib code -- urllib2 should always be available anyway;
2901
# return actual url, status, and full HTTP headers (as result['url'],
2902
# result['status'], and result['headers']) if parsing a remote feed over HTTP --
2903
# this should pass all the HTTP tests at <http://diveintomark.org/tests/client/http/>;
2904
# added the latest namespace-of-the-week for RSS 2.0
2905
#2.5.1 - 7/26/2003 - RMK - clear opener.addheaders so we only send our custom
2906
# User-Agent (otherwise urllib2 sends two, which confuses some servers)
2907
#2.5.2 - 7/28/2003 - MAP - entity-decode inline xml properly; added support for
2908
# inline <xhtml:body> and <xhtml:div> as used in some RSS 2.0 feeds
2909
#2.5.3 - 8/6/2003 - TvdV - patch to track whether we're inside an image or
2910
# textInput, and also to return the character encoding (if specified)
2911
#2.6 - 1/1/2004 - MAP - dc:author support (MarekK); fixed bug tracking
2912
# nested divs within content (JohnD); fixed missing sys import (JohanS);
2913
# fixed regular expression to capture XML character encoding (Andrei);
2914
# added support for Atom 0.3-style links; fixed bug with textInput tracking;
2915
# added support for cloud (MartijnP); added support for multiple
2916
# category/dc:subject (MartijnP); normalize content model: 'description' gets
2917
# description (which can come from description, summary, or full content if no
2918
# description), 'content' gets dict of base/language/type/value (which can come
2919
# from content:encoded, xhtml:body, content, or fullitem);
2920
# fixed bug matching arbitrary Userland namespaces; added xml:base and xml:lang
2921
# tracking; fixed bug tracking unknown tags; fixed bug tracking content when
2922
# <content> element is not in default namespace (like Pocketsoap feed);
2923
# resolve relative URLs in link, guid, docs, url, comments, wfw:comment,
2924
# wfw:commentRSS; resolve relative URLs within embedded HTML markup in
2925
# description, xhtml:body, content, content:encoded, title, subtitle,
2926
# summary, info, tagline, and copyright; added support for pingback and
2927
# trackback namespaces
2928
#2.7 - 1/5/2004 - MAP - really added support for trackback and pingback
2929
# namespaces, as opposed to 2.6 when I said I did but didn't really;
2930
# sanitize HTML markup within some elements; added mxTidy support (if
2931
# installed) to tidy HTML markup within some elements; fixed indentation
2932
# bug in _parse_date (FazalM); use socket.setdefaulttimeout if available
2933
# (FazalM); universal date parsing and normalization (FazalM): 'created', modified',
2934
# 'issued' are parsed into 9-tuple date format and stored in 'created_parsed',
2935
# 'modified_parsed', and 'issued_parsed'; 'date' is duplicated in 'modified'
2936
# and vice-versa; 'date_parsed' is duplicated in 'modified_parsed' and vice-versa
2937
#2.7.1 - 1/9/2004 - MAP - fixed bug handling " and '. fixed memory
2938
# leak not closing url opener (JohnD); added dc:publisher support (MarekK);
2939
# added admin:errorReportsTo support (MarekK); Python 2.1 dict support (MarekK)
2940
#2.7.4 - 1/14/2004 - MAP - added workaround for improperly formed <br/> tags in
2941
# encoded HTML (skadz); fixed unicode handling in normalize_attrs (ChrisL);
2942
# fixed relative URI processing for guid (skadz); added ICBM support; added
2944
#2.7.5 - 1/15/2004 - MAP - added workaround for malformed DOCTYPE (seen on many
2945
# blogspot.com sites); added _debug variable
2946
#2.7.6 - 1/16/2004 - MAP - fixed bug with StringIO importing
2947
#3.0b3 - 1/23/2004 - MAP - parse entire feed with real XML parser (if available);
2948
# added several new supported namespaces; fixed bug tracking naked markup in
2949
# description; added support for enclosure; added support for source; re-added
2950
# support for cloud which got dropped somehow; added support for expirationDate
2951
#3.0b4 - 1/26/2004 - MAP - fixed xml:lang inheritance; fixed multiple bugs tracking
2952
# xml:base URI, one for documents that don't define one explicitly and one for
2953
# documents that define an outer and an inner xml:base that goes out of scope
2954
# before the end of the document
2955
#3.0b5 - 1/26/2004 - MAP - fixed bug parsing multiple links at feed level
2956
#3.0b6 - 1/27/2004 - MAP - added feed type and version detection, result['version']
2957
# will be one of SUPPORTED_VERSIONS.keys() or empty string if unrecognized;
2958
# added support for creativeCommons:license and cc:license; added support for
2959
# full Atom content model in title, tagline, info, copyright, summary; fixed bug
2960
# with gzip encoding (not always telling server we support it when we do)
2961
#3.0b7 - 1/28/2004 - MAP - support Atom-style author element in author_detail
2962
# (dictionary of 'name', 'url', 'email'); map author to author_detail if author
2963
# contains name + email address
2964
#3.0b8 - 1/28/2004 - MAP - added support for contributor
2965
#3.0b9 - 1/29/2004 - MAP - fixed check for presence of dict function; added
2966
# support for summary
2967
#3.0b10 - 1/31/2004 - MAP - incorporated ISO-8601 date parsing routines from
2969
#3.0b11 - 2/2/2004 - MAP - added 'rights' to list of elements that can contain
2970
# dangerous markup; fiddled with decodeEntities (not right); liberalized
2971
# date parsing even further
2972
#3.0b12 - 2/6/2004 - MAP - fiddled with decodeEntities (still not right);
2973
# added support to Atom 0.2 subtitle; added support for Atom content model
2974
# in copyright; better sanitizing of dangerous HTML elements with end tags
2975
# (script, frameset)
2976
#3.0b13 - 2/8/2004 - MAP - better handling of empty HTML tags (br, hr, img,
2977
# etc.) in embedded markup, in either HTML or XHTML form (<br>, <br/>, <br />)
2978
#3.0b14 - 2/8/2004 - MAP - fixed CDATA handling in non-wellformed feeds under
2980
#3.0b15 - 2/11/2004 - MAP - fixed bug resolving relative links in wfw:commentRSS;
2981
# fixed bug capturing author and contributor URL; fixed bug resolving relative
2982
# links in author and contributor URL; fixed bug resolvin relative links in
2983
# generator URL; added support for recognizing RSS 1.0; passed Simon Fell's
2984
# namespace tests, and included them permanently in the test suite with his
2985
# permission; fixed namespace handling under Python 2.1
2986
#3.0b16 - 2/12/2004 - MAP - fixed support for RSS 0.90 (broken in b15)
2987
#3.0b17 - 2/13/2004 - MAP - determine character encoding as per RFC 3023
2988
#3.0b18 - 2/17/2004 - MAP - always map description to summary_detail (Andrei);
2989
# use libxml2 (if available)
2990
#3.0b19 - 3/15/2004 - MAP - fixed bug exploding author information when author
2991
# name was in parentheses; removed ultra-problematic mxTidy support; patch to
2992
# workaround crash in PyXML/expat when encountering invalid entities
2993
# (MarkMoraes); support for textinput/textInput
2994
#3.0b20 - 4/7/2004 - MAP - added CDF support
2995
#3.0b21 - 4/14/2004 - MAP - added Hot RSS support
2996
#3.0b22 - 4/19/2004 - MAP - changed 'channel' to 'feed', 'item' to 'entries' in
2997
# results dict; changed results dict to allow getting values with results.key
2998
# as well as results[key]; work around embedded illformed HTML with half
2999
# a DOCTYPE; work around malformed Content-Type header; if character encoding
3000
# is wrong, try several common ones before falling back to regexes (if this
3001
# works, bozo_exception is set to CharacterEncodingOverride); fixed character
3002
# encoding issues in BaseHTMLProcessor by tracking encoding and converting
3003
# from Unicode to raw strings before feeding data to sgmllib.SGMLParser;
3004
# convert each value in results to Unicode (if possible), even if using
3005
# regex-based parsing
3006
#3.0b23 - 4/21/2004 - MAP - fixed UnicodeDecodeError for feeds that contain
3007
# high-bit characters in attributes in embedded HTML in description (thanks
3008
# Thijs van de Vossen); moved guid, date, and date_parsed to mapped keys in
3009
# FeedParserDict; tweaked FeedParserDict.has_key to return True if asking
3010
# about a mapped key
3011
#3.0fc1 - 4/23/2004 - MAP - made results.entries[0].links[0] and
3012
# results.entries[0].enclosures[0] into FeedParserDict; fixed typo that could
3013
# cause the same encoding to be tried twice (even if it failed the first time);
3014
# fixed DOCTYPE stripping when DOCTYPE contained entity declarations;
3015
# better textinput and image tracking in illformed RSS 1.0 feeds
3016
#3.0fc2 - 5/10/2004 - MAP - added and passed Sam's amp tests; added and passed
3017
# my blink tag tests
3018
#3.0fc3 - 6/18/2004 - MAP - fixed bug in _changeEncodingDeclaration that
3019
# failed to parse utf-16 encoded feeds; made source into a FeedParserDict;
3020
# duplicate admin:generatorAgent/@rdf:resource in generator_detail.url;
3021
# added support for image; refactored parse() fallback logic to try other
3022
# encodings if SAX parsing fails (previously it would only try other encodings
3023
# if re-encoding failed); remove unichr madness in normalize_attrs now that
3024
# we're properly tracking encoding in and out of BaseHTMLProcessor; set
3025
# feed.language from root-level xml:lang; set entry.id from rdf:about;
3026
# send Accept header
3027
#3.0 - 6/21/2004 - MAP - don't try iso-8859-1 (can't distinguish between
3028
# iso-8859-1 and windows-1252 anyway, and most incorrectly marked feeds are
3029
# windows-1252); fixed regression that could cause the same encoding to be
3030
# tried twice (even if it failed the first time)
3031
#3.0.1 - 6/22/2004 - MAP - default to us-ascii for all text/* content types;
3032
# recover from malformed content-type header parameter with no equals sign
3033
# ('text/xml; charset:iso-8859-1')
3034
#3.1 - 6/28/2004 - MAP - added and passed tests for converting HTML entities
3035
# to Unicode equivalents in illformed feeds (aaronsw); added and
3036
# passed tests for converting character entities to Unicode equivalents
3037
# in illformed feeds (aaronsw); test for valid parsers when setting
3038
# XML_AVAILABLE; make version and encoding available when server returns
3039
# a 304; add handlers parameter to pass arbitrary urllib2 handlers (like
3040
# digest auth or proxy support); add code to parse username/password
3041
# out of url and send as basic authentication; expose downloading-related
3042
# exceptions in bozo_exception (aaronsw); added __contains__ method to
3043
# FeedParserDict (aaronsw); added publisher_detail (aaronsw)
3044
#3.2 - 7/3/2004 - MAP - use cjkcodecs and iconv_codec if available; always
3045
# convert feed to UTF-8 before passing to XML parser; completely revamped
3046
# logic for determining character encoding and attempting XML parsing
3047
# (much faster); increased default timeout to 20 seconds; test for presence
3048
# of Location header on redirects; added tests for many alternate character
3049
# encodings; support various EBCDIC encodings; support UTF-16BE and
3050
# UTF16-LE with or without a BOM; support UTF-8 with a BOM; support
3051
# UTF-32BE and UTF-32LE with or without a BOM; fixed crashing bug if no
3052
# XML parsers are available; added support for 'Content-encoding: deflate';
3053
# send blank 'Accept-encoding: ' header if neither gzip nor zlib modules
3055
#3.3 - 7/15/2004 - MAP - optimize EBCDIC to ASCII conversion; fix obscure
3056
# problem tracking xml:base and xml:lang if element declares it, child
3057
# doesn't, first grandchild redeclares it, and second grandchild doesn't;
3058
# refactored date parsing; defined public registerDateHandler so callers
3059
# can add support for additional date formats at runtime; added support
3060
# for OnBlog, Nate, MSSQL, Greek, and Hungarian dates (ytrewq1); added
3061
# zopeCompatibilityHack() which turns FeedParserDict into a regular
3062
# dictionary, required for Zope compatibility, and also makes command-
3063
# line debugging easier because pprint module formats real dictionaries
3064
# better than dictionary-like objects; added NonXMLContentType exception,
3065
# which is stored in bozo_exception when a feed is served with a non-XML
3066
# media type such as 'text/plain'; respect Content-Language as default
3067
# language if not xml:lang is present; cloud dict is now FeedParserDict;
3068
# generator dict is now FeedParserDict; better tracking of xml:lang,
3069
# including support for xml:lang='' to unset the current language;
3070
# recognize RSS 1.0 feeds even when RSS 1.0 namespace is not the default
3071
# namespace; don't overwrite final status on redirects (scenarios:
3072
# redirecting to a URL that returns 304, redirecting to a URL that
3073
# redirects to another URL with a different type of redirect); add
3074
# support for HTTP 303 redirects
3075
#4.0 - MAP - support for relative URIs in xml:base attribute; fixed
3076
# encoding issue with mxTidy (phopkins); preliminary support for RFC 3229;
3077
# support for Atom 1.0; support for iTunes extensions; new 'tags' for
3078
# categories/keywords/etc. as array of dict
3079
# {'term': term, 'scheme': scheme, 'label': label} to match Atom 1.0
3080
# terminology; parse RFC 822-style dates with no time; lots of other
3082
#4.1 - MAP - removed socket timeout; added support for chardet library