1
"""Universal feed parser
3
Handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds
5
Visit http://feedparser.org/ for the latest version
6
Visit http://feedparser.org/docs/ for the latest documentation
8
Required: Python 2.1 or later
9
Recommended: Python 2.3 or later
10
Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
13
__version__ = "4.1"# + "$Revision$"[11:15] + "-cvs"
14
__license__ = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.
16
Redistribution and use in source and binary forms, with or without modification,
17
are permitted provided that the following conditions are met:
19
* Redistributions of source code must retain the above copyright notice,
20
this list of conditions and the following disclaimer.
21
* Redistributions in binary form must reproduce the above copyright notice,
22
this list of conditions and the following disclaimer in the documentation
23
and/or other materials provided with the distribution.
25
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
26
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
29
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
30
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
31
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
32
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
33
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
34
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
35
POSSIBILITY OF SUCH DAMAGE.
37
__author__ = "Mark Pilgrim <http://diveintomark.org/>"
38
__contributors__ = ["Jason Diamond <http://injektilo.org/>",
39
"John Beimler <http://john.beimler.org/>",
40
"Fazal Majid <http://www.majid.info/mylos/weblog/>",
41
"Aaron Swartz <http://aaronsw.com/>",
42
"Kevin Marks <http://epeus.blogspot.com/>"]
45
# HTTP "User-Agent" header to send to servers when downloading feeds.
46
# If you are embedding feedparser in a larger application, you should
47
# change this to your application name and URL.
48
USER_AGENT = "UniversalFeedParser/%s +http://feedparser.org/" % __version__
49
from miro import config
50
from miro import prefs
51
USER_AGENT += " %s/%s (%s)" % \
52
(config.get(prefs.SHORT_APP_NAME),
53
config.get(prefs.APP_VERSION),
54
config.get(prefs.PROJECT_URL))
56
# HTTP "Accept" header to send to servers when downloading feeds. If you don't
57
# want to send an Accept header, set this to None.
58
ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"
60
# List of preferred XML parsers, by SAX driver name. These will be tried first,
# but if they're not installed, Python will keep searching through its own list
# of pre-installed parsers until it finds one that supports everything we need.
PREFERRED_XML_PARSERS = ["drv_libxml2"]
65
# If you want feedparser to automatically run HTML markup through HTML Tidy, set
# this to 1. Requires mxTidy <http://www.egenix.com/files/python/mxTidy.html>
# or utidylib <http://utidylib.berlios.de/>.
TIDY_MARKUP = 0

# List of Python interfaces for HTML Tidy, in order of preference. Only useful
# if TIDY_MARKUP = 1
PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"]
74
# ---------- required modules (should come with any Python distribution) ----------
75
import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi, urllib, urllib2
77
from cStringIO import StringIO as _StringIO
79
from StringIO import StringIO as _StringIO
81
# ---------- optional modules (feedparser will work without these, but with reduced functionality) ----------
83
# gzip is included with most Python distributions, but may not be available if you compiled your own
93
# If a real XML parser is available, feedparser will attempt to use it. feedparser has
94
# been tested with the built-in SAX parser, PyXML, and libxml2. On platforms where the
95
# Python distribution does not come with an XML parser (such as Mac OS X 10.2 and some
96
# versions of FreeBSD), feedparser will quietly fall back on regex-based parsing.
99
xml.sax.make_parser(PREFERRED_XML_PARSERS) # test for valid parsers
100
from xml.sax.saxutils import escape as _xmlescape
104
def _xmlescape(data):
105
data = data.replace('&', '&')
106
data = data.replace('>', '>')
107
data = data.replace('<', '<')
110
# base64 support for Atom feeds that contain embedded binary data
112
import base64, binascii
114
base64 = binascii = None
116
# cjkcodecs and iconv_codec provide support for more character encodings.
117
# Both are available from http://cjkpython.i18n.org/
119
import cjkcodecs.aliases
127
# chardet library auto-detects character encodings
128
# Download from http://chardet.feedparser.org/
132
import chardet.constants
133
chardet.constants._debug = 1
137
# ---------- don't touch these ----------
# Internal warning/exception hierarchy.  The encoding-related classes share a
# common base so callers can catch all of them at once.
class ThingsNobodyCaresAboutButMe(Exception): pass
class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe): pass
class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe): pass
class NonXMLContentType(ThingsNobodyCaresAboutButMe): pass
class UndeclaredNamespace(Exception): pass
144
# Loosen sgmllib's lexer tables to cope with real-world feed markup:
# allow '.' and ':' in tag names, treat only '<!' (not '<!'-like runs) as a
# declaration opener, and accept hexadecimal character references.
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
sgmllib.special = re.compile('<!')
sgmllib.charref = re.compile('&#(x?[0-9A-Fa-f]+)[^0-9A-Fa-f]')
148
SUPPORTED_VERSIONS = {'': 'unknown',
149
'rss090': 'RSS 0.90',
150
'rss091n': 'RSS 0.91 (Netscape)',
151
'rss091u': 'RSS 0.91 (Userland)',
152
'rss092': 'RSS 0.92',
153
'rss093': 'RSS 0.93',
154
'rss094': 'RSS 0.94',
157
'rss': 'RSS (unknown version)',
158
'atom01': 'Atom 0.1',
159
'atom02': 'Atom 0.2',
160
'atom03': 'Atom 0.3',
161
'atom10': 'Atom 1.0',
162
'atom': 'Atom (unknown version)',
170
# Python 2.1 does not have dict
171
from UserDict import UserDict
178
def _entry_equal(a, b):
179
if type(a) == list and type(b) == list:
182
for i in xrange (len(a)):
183
if not _entry_equal(a[i], b[i]):
188
except (SystemExit, KeyboardInterrupt):
193
except (SystemExit, KeyboardInterrupt):
198
class FeedParserDict(UserDict):
199
# This is a complete hack to prevent problems if data is saved with a
200
# newer version of Miro and an older version of Miro tries to open it.
201
# See storedatabase.py for more info.
202
__module__ = 'feedparser'
204
# values of keymap are in order of preference. for example,
205
# in description, summary is preferred to subtitle.
206
keymap = {'channel': 'feed',
209
'length': 'filesize',
210
'image': 'thumbnail',
212
'date_parsed': 'updated_parsed',
213
'description': ('summary', 'subtitle'),
215
'modified': 'updated',
216
'modified_parsed': 'updated_parsed',
217
'issued': 'published',
218
'issued_parsed': 'published_parsed',
219
'copyright': 'rights',
220
'copyright_detail': 'rights_detail',
221
'tagline': 'subtitle',
222
'tagline_detail': 'subtitle_detail'}
226
if isinstance(keymap[key], tuple):
227
for k in keymap[key]:
228
reverse_keymap[k] = key
230
reverse_keymap[keymap[key]] = key
232
def __init__(self, initialData=None):
233
if isinstance(initialData, dict):
234
UserDict.__init__(self)
235
for key in initialData:
236
self[key] = initialData[key]
237
elif initialData is not None:
238
UserDict.__init__(self, initialData)
240
UserDict.__init__(self)
242
def reverse_key (self, key):
243
if self.reverse_keymap.has_key(key):
244
return self.reverse_keymap[key]
250
def __init__ (self, container):
251
self.container = container
252
self.subiter = UserDict.__iter__(container)
256
return self.container.reverse_key(self.subiter.next())
257
return ExtendedIter (self)
259
def equal(self, other):
261
iter = other.get_iter()
262
except StandardError:
263
iter = other.__iter__()
267
if not _entry_equal(self[key], other[key]):
270
for key in self.get_iter():
271
if not checked.has_key(key):
274
except StandardError:
277
def __getitem__(self, key):
278
if key == 'category':
279
return UserDict.__getitem__(self, 'tags')[0]['term']
280
if key == 'categories':
281
return [(tag['scheme'], tag['term']) for tag in UserDict.__getitem__(self, 'tags')]
282
realkey = self.keymap.get(key, key)
283
if isinstance(realkey, tuple):
285
if UserDict.has_key(self, k):
286
return UserDict.__getitem__(self, k)
287
if UserDict.has_key(self, key):
288
return UserDict.__getitem__(self, key)
289
return UserDict.__getitem__(self, realkey)
291
def __setitem__(self, key, value):
292
for k in self.keymap.keys():
295
if isinstance(key, tuple):
297
return UserDict.__setitem__(self, key, value)
299
def get(self, key, default=None):
300
if self.has_key(key):
305
def setdefault(self, key, value):
306
if not self.has_key(key):
310
def has_key(self, key):
312
return hasattr(self, key) or UserDict.has_key(self, key)
313
except AttributeError:
316
def __getattr__(self, key):
318
assert not key.startswith('_')
319
return self.__getitem__(key)
320
except (SystemExit, KeyboardInterrupt):
323
raise AttributeError, "object has no attribute '%s'" % key
325
def __setattr__(self, key, value):
326
if key.startswith('_') or key == 'data':
327
self.__dict__[key] = value
329
return self.__setitem__(key, value)
331
def __contains__(self, key):
332
return self.has_key(key)
334
def zopeCompatibilityHack():
335
global FeedParserDict
337
def FeedParserDict(aDict=None):
343
_ebcdic_to_ascii_map = None
344
def _ebcdic_to_ascii(s):
345
global _ebcdic_to_ascii_map
346
if not _ebcdic_to_ascii_map:
348
0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
349
16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
350
128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
351
144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
352
32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
353
38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
354
45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
355
186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
356
195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,201,
357
202,106,107,108,109,110,111,112,113,114,203,204,205,206,207,208,
358
209,126,115,116,117,118,119,120,121,122,210,211,212,213,214,215,
359
216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,
360
123,65,66,67,68,69,70,71,72,73,232,233,234,235,236,237,
361
125,74,75,76,77,78,79,80,81,82,238,239,240,241,242,243,
362
92,159,83,84,85,86,87,88,89,90,244,245,246,247,248,249,
363
48,49,50,51,52,53,54,55,56,57,250,251,252,253,254,255
366
_ebcdic_to_ascii_map = string.maketrans( \
367
''.join(map(chr, range(256))), ''.join(map(chr, emap)))
368
return s.translate(_ebcdic_to_ascii_map)
370
_urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)')
371
def _urljoin(base, uri):
372
uri = _urifixer.sub(r'\1\3', uri)
373
return urlparse.urljoin(base, uri)
375
class _FeedParserMixin:
376
namespaces = {'': '',
377
'http://backend.userland.com/rss': '',
378
'http://blogs.law.harvard.edu/tech/rss': '',
379
'http://purl.org/rss/1.0/': '',
380
'http://my.netscape.com/rdf/simple/0.9/': '',
381
'http://example.com/newformat#': '',
382
'http://example.com/necho': '',
383
'http://purl.org/echo/': '',
384
'uri/of/echo/namespace#': '',
385
'http://purl.org/pie/': '',
386
'http://purl.org/atom/ns#': '',
387
'http://www.w3.org/2005/Atom': '',
388
'http://purl.org/rss/1.0/modules/rss091#': '',
390
'http://webns.net/mvcb/': 'admin',
391
'http://purl.org/rss/1.0/modules/aggregation/': 'ag',
392
'http://purl.org/rss/1.0/modules/annotate/': 'annotate',
393
'http://media.tangent.org/rss/1.0/': 'audio',
394
'http://backend.userland.com/blogChannelModule': 'blogChannel',
395
'http://web.resource.org/cc/': 'cc',
396
'http://backend.userland.com/creativeCommonsRssModule': 'creativeCommons',
397
'http://purl.org/rss/1.0/modules/company': 'co',
398
'http://purl.org/rss/1.0/modules/content/': 'content',
399
'http://my.theinfo.org/changed/1.0/rss/': 'cp',
400
'http://purl.org/dc/elements/1.1/': 'dc',
401
'http://purl.org/dc/terms/': 'dcterms',
402
'http://purl.org/rss/1.0/modules/email/': 'email',
403
'http://purl.org/rss/1.0/modules/event/': 'ev',
404
'http://rssnamespace.org/feedburner/ext/1.0': 'feedburner',
405
'http://freshmeat.net/rss/fm/': 'fm',
406
'http://xmlns.com/foaf/0.1/': 'foaf',
407
'http://www.w3.org/2003/01/geo/wgs84_pos#': 'geo',
408
'http://postneo.com/icbm/': 'icbm',
409
'http://purl.org/rss/1.0/modules/image/': 'image',
410
'http://www.itunes.com/DTDs/PodCast-1.0.dtd': 'itunes',
411
'http://example.com/DTDs/PodCast-1.0.dtd': 'itunes',
412
'http://purl.org/rss/1.0/modules/link/': 'l',
413
'http://search.yahoo.com/mrss': 'media',
414
'http://search.yahoo.com/mrss/': 'media',
415
'http://docs.yahoo.com/mediaModule': 'media',
416
'http://tools.search.yahoo.com/mrss/': 'media',
417
'http://madskills.com/public/xml/rss/module/pingback/': 'pingback',
418
'http://prismstandard.org/namespaces/1.2/basic/': 'prism',
419
'http://www.w3.org/1999/02/22-rdf-syntax-ns#': 'rdf',
420
'http://www.w3.org/2000/01/rdf-schema#': 'rdfs',
421
'http://purl.org/rss/1.0/modules/reference/': 'ref',
422
'http://purl.org/rss/1.0/modules/richequiv/': 'reqv',
423
'http://purl.org/rss/1.0/modules/search/': 'search',
424
'http://purl.org/rss/1.0/modules/slash/': 'slash',
425
'http://schemas.xmlsoap.org/soap/envelope/': 'soap',
426
'http://purl.org/rss/1.0/modules/servicestatus/': 'ss',
427
'http://hacks.benhammersley.com/rss/streaming/': 'str',
428
'http://purl.org/rss/1.0/modules/subscription/': 'sub',
429
'http://purl.org/rss/1.0/modules/syndication/': 'sy',
430
'http://purl.org/rss/1.0/modules/taxonomy/': 'taxo',
431
'http://purl.org/rss/1.0/modules/threading/': 'thr',
432
'http://purl.org/rss/1.0/modules/textinput/': 'ti',
433
'http://madskills.com/public/xml/rss/module/trackback/':'trackback',
434
'http://wellformedweb.org/commentAPI/': 'wfw',
435
'http://purl.org/rss/1.0/modules/wiki/': 'wiki',
436
'http://www.w3.org/1999/xhtml': 'xhtml',
437
'http://www.w3.org/XML/1998/namespace': 'xml',
438
'http://schemas.pocketsoap.com/rss/myDescModule/': 'szf',
439
"http://participatoryculture.org/RSSModules/dtv/1.0": 'dtv'
441
_matchnamespaces = {}
443
can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'license', 'icon', 'logo']
444
can_contain_relative_uris = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
445
can_contain_dangerous_markup = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
446
html_types = ['text/html', 'application/xhtml+xml']
448
def __init__(self, baseuri=None, baselang=None, encoding='utf-8'):
449
if _debug: sys.stderr.write('initializing FeedParser\n')
450
if not self._matchnamespaces:
451
for k, v in self.namespaces.items():
452
self._matchnamespaces[k.lower()] = v
453
self.feeddata = FeedParserDict() # feed-level data
454
self.encoding = encoding # character encoding
455
self.entries = [] # list of entry-level data
456
self.version = '' # feed type/version, see SUPPORTED_VERSIONS
457
self.namespacesInUse = {} # dictionary of namespaces defined by the feed
459
# the following are used internally to track state;
460
# this is really out of control and should be refactored
467
self.incontributor = 0
471
self.sourcedata = FeedParserDict()
472
self.contentparams = FeedParserDict()
473
self._summaryKey = None
474
self.namespacemap = {}
475
self.elementstack = []
478
self.baseuri = baseuri or ''
479
self.lang = baselang or None
481
self.feeddata['language'] = baselang
483
def unknown_starttag(self, tag, attrs):
484
if _debug: sys.stderr.write('start %s with %s\n' % (tag, attrs))
486
attrs = [(k.lower(), v) for k, v in attrs]
487
attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
489
# track xml:base and xml:lang
490
attrsD = FeedParserDict(attrs)
491
baseuri = attrsD.get('xml:base', attrsD.get('base')) or self.baseuri
492
self.baseuri = _urljoin(self.baseuri, baseuri)
493
lang = attrsD.get('xml:lang', attrsD.get('lang'))
495
# xml:lang could be explicitly set to '', we need to capture that
498
# if no xml:lang is specified, use parent lang
501
if tag in ('feed', 'rss', 'rdf:RDF'):
502
self.feeddata['language'] = lang
504
self.basestack.append(self.baseuri)
505
self.langstack.append(lang)
508
for prefix, uri in attrs:
509
if prefix.startswith('xmlns:'):
510
self.trackNamespace(prefix[6:], uri)
511
elif prefix == 'xmlns':
512
self.trackNamespace(None, uri)
514
# track inline content
515
if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
516
# element declared itself as escaped markup, but it isn't really
517
self.contentparams['type'] = 'application/xhtml+xml'
518
if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
519
# Note: probably shouldn't simply recreate localname here, but
520
# our namespace handling isn't actually 100% correct in cases where
521
# the feed redefines the default namespace (which is actually
522
# the usual case for inline content, thanks Sam), so here we
523
# cheat and just reconstruct the element based on localname
524
# because that compensates for the bugs in our namespace handling.
525
# This will horribly munge inline content with non-empty qnames,
526
# but nobody actually does that, so I'm not fixing it.
527
tag = tag.split(':')[-1]
528
return self.handle_data('<%s%s>' % (tag, ''.join([' %s="%s"' % t for t in attrs])), escape=0)
531
if tag.find(':') <> -1:
532
prefix, suffix = tag.split(':', 1)
534
prefix, suffix = '', tag
535
prefix = self.namespacemap.get(prefix, prefix)
537
prefix = prefix + '_'
539
# special hack for better tracking of empty textinput/image elements in illformed feeds
540
if (not prefix) and tag not in ('title', 'link', 'description', 'name'):
542
if (not prefix) and tag not in ('title', 'link', 'description', 'url', 'href', 'width', 'height'):
545
# call special handler (if defined) or default handler
546
methodname = '_start_' + prefix + suffix
548
method = getattr(self, methodname)
549
return method(attrsD)
550
except AttributeError:
551
return self.push(prefix + suffix, 1)
553
def unknown_endtag(self, tag):
554
if _debug: sys.stderr.write('end %s\n' % tag)
556
if tag.find(':') <> -1:
557
prefix, suffix = tag.split(':', 1)
559
prefix, suffix = '', tag
560
prefix = self.namespacemap.get(prefix, prefix)
562
prefix = prefix + '_'
564
# call special handler (if defined) or default handler
565
methodname = '_end_' + prefix + suffix
567
method = getattr(self, methodname)
569
except AttributeError:
570
self.pop(prefix + suffix)
572
# track inline content
573
if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
574
# element declared itself as escaped markup, but it isn't really
575
self.contentparams['type'] = 'application/xhtml+xml'
576
if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
577
tag = tag.split(':')[-1]
578
self.handle_data('</%s>' % tag, escape=0)
580
# track xml:base and xml:lang going out of scope
583
if self.basestack and self.basestack[-1]:
584
self.baseuri = self.basestack[-1]
587
if self.langstack: # and (self.langstack[-1] is not None):
588
self.lang = self.langstack[-1]
590
def handle_charref(self, ref):
591
# called for each character reference, e.g. for ' ', ref will be '160'
592
if not self.elementstack: return
594
if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'):
601
text = unichr(c).encode('utf-8')
602
self.elementstack[-1][2].append(text)
604
def handle_entityref(self, ref):
605
# called for each entity reference, e.g. for '©', ref will be 'copy'
606
if not self.elementstack: return
607
if _debug: sys.stderr.write('entering handle_entityref with %s\n' % ref)
608
if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
611
# entity resolution graciously donated by Aaron Swartz
613
import htmlentitydefs
614
if hasattr(htmlentitydefs, 'name2codepoint'): # requires Python 2.3
615
return htmlentitydefs.name2codepoint[k]
616
k = htmlentitydefs.entitydefs[k]
617
if k.startswith('&#') and k.endswith(';'):
618
return int(k[2:-1]) # not in latin-1
621
except KeyError: text = '&%s;' % ref
622
else: text = unichr(name2cp(ref)).encode('utf-8')
623
self.elementstack[-1][2].append(text)
625
def handle_data(self, text, escape=1):
    # called for each block of plain text, i.e. outside of any tag and
    # not containing any character or entity references
    if not self.elementstack: return
    if escape and self.contentparams.get('type') == 'application/xhtml+xml':
        # inline XHTML is re-serialized later, so metacharacters must be escaped
        text = _xmlescape(text)
    # append to the text pieces of the innermost open element
    self.elementstack[-1][2].append(text)
633
def handle_comment(self, text):
    # called for each comment, e.g. <!-- insert message here -->
    pass

def handle_pi(self, text):
    # called for each processing instruction, e.g. <?instruction>
    pass

def handle_decl(self, text):
    # called for each declaration, e.g. <!DOCTYPE ...>; ignored
    pass
644
def parse_declaration(self, i):
645
# override internal declaration handler to handle CDATA blocks
646
if _debug: sys.stderr.write('entering parse_declaration\n')
647
if self.rawdata[i:i+9] == '<![CDATA[':
648
k = self.rawdata.find(']]>', i)
649
if k == -1: k = len(self.rawdata)
650
self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0)
653
k = self.rawdata.find('>', i)
656
def mapContentType(self, contentType):
    """Normalize Atom shorthand type names ('text', 'html', 'xhtml')
    to their full MIME types; any other value passes through lowercased."""
    contentType = contentType.lower()
    if contentType == 'text':
        contentType = 'text/plain'
    elif contentType == 'html':
        contentType = 'text/html'
    elif contentType == 'xhtml':
        contentType = 'application/xhtml+xml'
    return contentType
666
def trackNamespace(self, prefix, uri):
    """Record a namespace declaration and sniff the feed version from it."""
    loweruri = uri.lower()
    # certain namespace URIs identify the feed format outright
    if (prefix, loweruri) == (None, 'http://my.netscape.com/rdf/simple/0.9/') and not self.version:
        self.version = 'rss090'
    if loweruri == 'http://purl.org/rss/1.0/' and not self.version:
        self.version = 'rss10'
    if loweruri == 'http://www.w3.org/2005/atom' and not self.version:
        self.version = 'atom10'
    if loweruri.find('backend.userland.com/rss') != -1:
        # match any backend.userland.com namespace
        uri = 'http://backend.userland.com/rss'
    if self._matchnamespaces.has_key(loweruri):
        # known namespace: map the feed's prefix to our canonical prefix
        self.namespacemap[prefix] = self._matchnamespaces[loweruri]
        self.namespacesInUse[self._matchnamespaces[loweruri]] = uri
    else:
        self.namespacesInUse[prefix or ''] = uri
684
def resolveURI(self, uri):
    # resolve a possibly-relative URI against the current xml:base
    return _urljoin(self.baseuri or '', uri)

def decodeEntities(self, element, data):
    # hook for subclasses (the loose SGML parser) to decode entities;
    # the strict parser receives already-decoded data, so pass through
    return data

def push(self, element, expectingText):
    # open a new element: [name, whether text is expected, text pieces]
    self.elementstack.append([element, expectingText, []])
693
def pop(self, element, stripWhitespace=1):
694
if not self.elementstack: return
695
if self.elementstack[-1][0] != element: return
697
element, expectingText, pieces = self.elementstack.pop()
698
output = ''.join(pieces)
700
output = output.strip()
701
if not expectingText: return output
703
# decode base64 content
704
if base64 and self.contentparams.get('base64', 0):
706
output = base64.decodestring(output)
707
except binascii.Error:
709
except binascii.Incomplete:
712
# resolve relative URIs
713
if (element in self.can_be_relative_uri) and output:
714
output = self.resolveURI(output)
716
# decode entities within embedded markup
717
if not self.contentparams.get('base64', 0):
718
output = self.decodeEntities(element, output)
720
# remove temporary cruft from contentparams
722
del self.contentparams['mode']
726
del self.contentparams['base64']
730
# resolve relative URIs within embedded markup
731
if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types:
732
if element in self.can_contain_relative_uris:
733
output = _resolveRelativeURIs(output, self.baseuri, self.encoding)
735
# sanitize embedded markup
736
if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types:
737
if element in self.can_contain_dangerous_markup:
738
output = sanitizeHTML(output, self.encoding)
740
if self.encoding and type(output) != type(u''):
742
output = unicode(output, self.encoding)
743
except (SystemExit, KeyboardInterrupt):
748
# categories/tags/keywords/whatever are handled in _end_category
749
if element == 'category':
752
# store output in appropriate place(s)
753
if self.inentry and not self.insource:
754
if element == 'content':
755
self.entries[-1].setdefault(element, [])
756
contentparams = copy.deepcopy(self.contentparams)
757
contentparams['value'] = output
758
self.entries[-1][element].append(contentparams)
759
elif element == 'link':
760
self.entries[-1][element] = output
762
self.entries[-1]['links'][-1]['href'] = output
764
if element == 'description':
766
self.entries[-1][element] = output
768
contentparams = copy.deepcopy(self.contentparams)
769
contentparams['value'] = output
770
self.entries[-1][element + '_detail'] = contentparams
771
elif (self.infeed or self.insource) and (not self.intextinput) and (not self.inimage):
772
context = self._getContext()
773
if element == 'description':
775
context[element] = output
776
if element == 'link':
777
context['links'][-1]['href'] = output
779
contentparams = copy.deepcopy(self.contentparams)
780
contentparams['value'] = output
781
context[element + '_detail'] = contentparams
784
def pushContent(self, tag, attrsD, defaultContentType, expectingText):
786
self.contentparams = FeedParserDict({
787
'type': self.mapContentType(attrsD.get('type', defaultContentType)),
788
'language': self.lang,
789
'base': self.baseuri})
790
self.contentparams['base64'] = self._isBase64(attrsD, self.contentparams)
791
self.push(tag, expectingText)
793
def popContent(self, tag):
794
value = self.pop(tag)
796
self.contentparams.clear()
799
def _mapToStandardPrefix(self, name):
800
colonpos = name.find(':')
802
prefix = name[:colonpos]
803
suffix = name[colonpos+1:]
804
prefix = self.namespacemap.get(prefix, prefix)
805
name = prefix + ':' + suffix
808
def _getAttribute(self, attrsD, name):
809
return attrsD.get(self._mapToStandardPrefix(name))
811
def _isBase64(self, attrsD, contentparams):
812
if attrsD.get('mode', '') == 'base64':
814
# We should never assume text is base64 --NN
817
if self.contentparams['type'].startswith('text/'):
819
if self.contentparams['type'].endswith('+xml'):
821
if self.contentparams['type'].endswith('/xml'):
825
def _itsAnHrefDamnIt(self, attrsD):
826
href = attrsD.get('url', attrsD.get('uri', attrsD.get('href', None)))
836
attrsD['href'] = href
839
def _save(self, key, value):
840
context = self._getContext()
841
context.setdefault(key, value)
843
def _start_rss(self, attrsD):
844
versionmap = {'0.91': 'rss091u',
849
attr_version = attrsD.get('version', '')
850
version = versionmap.get(attr_version)
852
self.version = version
853
elif attr_version.startswith('2.'):
854
self.version = 'rss20'
858
def _start_dlhottitles(self, attrsD):
859
self.version = 'hotrss'
861
def _start_channel(self, attrsD):
863
self._cdf_common(attrsD)
864
_start_feedinfo = _start_channel
866
def _cdf_common(self, attrsD):
867
if attrsD.has_key('lastmod'):
868
self._start_modified({})
869
self.elementstack[-1][-1] = attrsD['lastmod']
871
if attrsD.has_key('href'):
873
self.elementstack[-1][-1] = attrsD['href']
876
def _start_feed(self, attrsD):
878
versionmap = {'0.1': 'atom01',
882
attr_version = attrsD.get('version')
883
version = versionmap.get(attr_version)
885
self.version = version
887
self.version = 'atom'
889
def _end_channel(self):
891
_end_feed = _end_channel
893
def _start_image(self, attrsD):
895
self.push('image', 0)
896
context = self._getContext()
897
context.setdefault('image', FeedParserDict())
899
def _end_image(self):
903
def _start_textinput(self, attrsD):
905
self.push('textinput', 0)
906
context = self._getContext()
907
context.setdefault('textinput', FeedParserDict())
908
_start_textInput = _start_textinput
910
def _end_textinput(self):
911
self.pop('textinput')
913
_end_textInput = _end_textinput
915
def _start_author(self, attrsD):
917
self.push('author', 1)
918
_start_managingeditor = _start_author
919
_start_dc_author = _start_author
920
_start_dc_creator = _start_author
921
_start_itunes_author = _start_author
923
def _end_author(self):
926
self._sync_author_detail()
927
_end_managingeditor = _end_author
928
_end_dc_author = _end_author
929
_end_dc_creator = _end_author
930
_end_itunes_author = _end_author
932
def _start_itunes_owner(self, attrsD):
934
self.push('publisher', 0)
936
def _end_itunes_owner(self):
937
self.pop('publisher')
939
self._sync_author_detail('publisher')
941
def _start_contributor(self, attrsD):
942
self.incontributor = 1
943
context = self._getContext()
944
context.setdefault('contributors', [])
945
context['contributors'].append(FeedParserDict())
946
self.push('contributor', 0)
948
def _end_contributor(self):
949
self.pop('contributor')
950
self.incontributor = 0
952
def _start_dc_contributor(self, attrsD):
953
self.incontributor = 1
954
context = self._getContext()
955
context.setdefault('contributors', [])
956
context['contributors'].append(FeedParserDict())
959
def _end_dc_contributor(self):
961
self.incontributor = 0
963
def _start_name(self, attrsD):
965
_start_itunes_name = _start_name
968
value = self.pop('name')
970
self._save_author('name', value, 'publisher')
972
self._save_author('name', value)
973
elif self.incontributor:
974
self._save_contributor('name', value)
975
elif self.intextinput:
976
context = self._getContext()
977
context['textinput']['name'] = value
978
_end_itunes_name = _end_name
980
def _start_width(self, attrsD):
981
self.push('width', 0)
983
def _end_width(self):
984
value = self.pop('width')
987
except (SystemExit, KeyboardInterrupt):
992
context = self._getContext()
993
context['image']['width'] = value
995
def _start_height(self, attrsD):
996
self.push('height', 0)
998
def _end_height(self):
999
value = self.pop('height')
1002
except (SystemExit, KeyboardInterrupt):
1007
context = self._getContext()
1008
context['image']['height'] = value
1010
def _start_url(self, attrsD):
1011
self.push('href', 1)
1012
_start_homepage = _start_url
1013
_start_uri = _start_url
1016
value = self.pop('href')
1018
self._save_author('href', value)
1019
elif self.incontributor:
1020
self._save_contributor('href', value)
1022
context = self._getContext()
1023
context['image']['href'] = value
1024
elif self.intextinput:
1025
context = self._getContext()
1026
context['textinput']['link'] = value
1027
_end_homepage = _end_url
1030
def _start_email(self, attrsD):
1031
self.push('email', 0)
1032
_start_itunes_email = _start_email
1034
def _end_email(self):
1035
value = self.pop('email')
1036
if self.inpublisher:
1037
self._save_author('email', value, 'publisher')
1039
self._save_author('email', value)
1040
elif self.incontributor:
1041
self._save_contributor('email', value)
1042
_end_itunes_email = _end_email
1044
def _getContext(self):
1046
context = self.sourcedata
1048
context = self.entries[-1]
1050
context = self.feeddata
1053
def _save_author(self, key, value, prefix='author'):
    # store one field of the author/publisher detail dict, then rebuild
    # the flat '<prefix>' string from the structured detail
    context = self._getContext()
    context.setdefault(prefix + '_detail', FeedParserDict())
    context[prefix + '_detail'][key] = value
    self._sync_author_detail()

def _save_contributor(self, key, value):
    # store one field of the most recently opened contributor record
    context = self._getContext()
    context.setdefault('contributors', [FeedParserDict()])
    context['contributors'][-1][key] = value
1064
def _sync_author_detail(self, key='author'):
1065
context = self._getContext()
1066
detail = context.get('%s_detail' % key)
1068
name = detail.get('name')
1069
email = detail.get('email')
1071
context[key] = '%s (%s)' % (name, email)
1075
context[key] = email
1077
author = context.get(key)
1078
if not author: return
1079
emailmatch = re.search(r'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))''', author)
1080
if not emailmatch: return
1081
email = emailmatch.group(0)
1082
# probably a better way to do the following, but it passes all the tests
1083
author = author.replace(email, '')
1084
author = author.replace('()', '')
1085
author = author.strip()
1086
if author and (author[0] == '('):
1088
if author and (author[-1] == ')'):
1089
author = author[:-1]
1090
author = author.strip()
1091
context.setdefault('%s_detail' % key, FeedParserDict())
1092
context['%s_detail' % key]['name'] = author
1093
context['%s_detail' % key]['email'] = email
1095
def _start_subtitle(self, attrsD):
1096
self.pushContent('subtitle', attrsD, 'text/plain', 1)
1097
_start_tagline = _start_subtitle
1098
_start_itunes_subtitle = _start_subtitle
1100
def _end_subtitle(self):
1101
self.popContent('subtitle')
1102
_end_tagline = _end_subtitle
1103
_end_itunes_subtitle = _end_subtitle
1105
def _start_rights(self, attrsD):
1106
self.pushContent('rights', attrsD, 'text/plain', 1)
1107
_start_dc_rights = _start_rights
1108
_start_copyright = _start_rights
1110
def _end_rights(self):
1111
self.popContent('rights')
1112
_end_dc_rights = _end_rights
1113
_end_copyright = _end_rights
1115
def _start_item(self, attrsD):
    '''Open an RSS <item> / Atom <entry> / CDF item: append a fresh
    entry dict and reset per-entry state.'''
    self.entries.append(FeedParserDict())
    self.push('item', 0)
    self.inentry = 1
    self.guidislink = 0
    id = self._getAttribute(attrsD, 'rdf:about')
    if id:
        context = self._getContext()
        context['id'] = id
    self._cdf_common(attrsD)
_start_entry = _start_item
_start_product = _start_item
def _end_item(self):
1131
_end_entry = _end_item
1133
def _start_dc_language(self, attrsD):
1134
self.push('language', 1)
1135
_start_language = _start_dc_language
1137
def _end_dc_language(self):
1138
self.lang = self.pop('language')
1139
_end_language = _end_dc_language
1141
def _start_dc_publisher(self, attrsD):
1142
self.push('publisher', 1)
1143
_start_webmaster = _start_dc_publisher
1145
def _end_dc_publisher(self):
1146
self.pop('publisher')
1147
self._sync_author_detail('publisher')
1148
_end_webmaster = _end_dc_publisher
1150
def _start_published(self, attrsD):
1151
self.push('published', 1)
1152
_start_dcterms_issued = _start_published
1153
_start_issued = _start_published
1155
def _end_published(self):
    '''Close a published/issued date and store its parsed form as well.'''
    raw = self.pop('published')
    self._save('published_parsed', _parse_date(raw))
_end_dcterms_issued = _end_published
_end_issued = _end_published
def _start_updated(self, attrsD):
1162
self.push('updated', 1)
1163
_start_modified = _start_updated
1164
_start_dcterms_modified = _start_updated
1165
_start_pubdate = _start_updated
1166
_start_dc_date = _start_updated
1168
def _end_updated(self):
    '''Close an updated/modified date; store both raw and parsed forms.'''
    raw = self.pop('updated')
    self._save('updated_parsed', _parse_date(raw))
_end_modified = _end_updated
_end_dcterms_modified = _end_updated
_end_pubdate = _end_updated
_end_dc_date = _end_updated
def _start_created(self, attrsD):
1178
self.push('created', 1)
1179
_start_dcterms_created = _start_created
1181
def _end_created(self):
    '''Close a created date and store its parsed form as well.'''
    raw = self.pop('created')
    self._save('created_parsed', _parse_date(raw))
_end_dcterms_created = _end_created
def _start_expirationdate(self, attrsD):
1187
self.push('expired', 1)
1189
def _end_expirationdate(self):
    '''Close an expirationDate element and store the parsed date.'''
    self._save('expired_parsed', _parse_date(self.pop('expired')))
def _start_cc_license(self, attrsD):
1193
self.push('license', 1)
1194
value = self._getAttribute(attrsD, 'rdf:resource')
1196
self.elementstack[-1][2].append(value)
1199
def _start_creativecommons_license(self, attrsD):
1200
self.push('license', 1)
1202
def _end_creativecommons_license(self):
1205
def _addTag(self, term, scheme, label):
    '''Append a category tag to the current context, skipping entirely
    empty tags and exact duplicates.'''
    context = self._getContext()
    tags = context.setdefault('tags', [])
    if (not term) and (not scheme) and (not label):
        return
    value = FeedParserDict({'term': term, 'scheme': scheme, 'label': label})
    if value not in tags:
        tags.append(value)
def _start_category(self, attrsD):
    '''Open a category element; term/scheme/label may come from
    attributes (Atom) or the domain attribute (RSS).'''
    if _debug: sys.stderr.write('entering _start_category with %s\n' % repr(attrsD))
    term = attrsD.get('term')
    scheme = attrsD.get('scheme', attrsD.get('domain'))
    label = attrsD.get('label')
    self._addTag(term, scheme, label)
    self.push('category', 1)
_start_dc_subject = _start_category
_start_keywords = _start_category
_start_media_category = _start_category
def _end_itunes_keywords(self):
1225
for term in self.pop('itunes_keywords').split():
1226
self._addTag(term, 'http://www.itunes.com/', None)
1228
def _start_itunes_category(self, attrsD):
1229
self._addTag(attrsD.get('text'), 'http://www.itunes.com/', None)
1230
self.push('category', 1)
1232
def _end_category(self):
1233
value = self.pop('category')
1234
if not value: return
1235
context = self._getContext()
1236
tags = context['tags']
1237
if value and len(tags) and not tags[-1]['term']:
1238
tags[-1]['term'] = value
1240
self._addTag(value, None, None)
1241
_end_dc_subject = _end_category
1242
_end_keywords = _end_category
1243
_end_itunes_category = _end_category
1244
_end_media_category = _end_category
1246
def _start_cloud(self, attrsD):
    '''Store the RSS <cloud> element's attributes on the context.'''
    self._getContext()['cloud'] = FeedParserDict(attrsD)
def _start_link(self, attrsD):
    '''Open a link element: record it in context['links'], treat
    rel=enclosure links as enclosures, and promote the alternate
    HTML link to context['link'].'''
    attrsD.setdefault('rel', 'alternate')
    attrsD.setdefault('type', 'text/html')
    attrsD = self._itsAnHrefDamnIt(attrsD)
    if attrsD.has_key('href'):
        attrsD['href'] = self.resolveURI(attrsD['href'])
    expectingText = self.infeed or self.inentry or self.insource
    context = self._getContext()
    context.setdefault('links', [])
    context['links'].append(FeedParserDict(attrsD))
    if attrsD['rel'] == 'enclosure':
        self._start_enclosure(attrsD)
    if attrsD.has_key('href'):
        # a link with an href carries no element text
        expectingText = 0
        if (attrsD.get('rel') == 'alternate') and (self.mapContentType(attrsD.get('type')) in self.html_types):
            context['link'] = attrsD['href']
    else:
        self.push('link', expectingText)
_start_producturl = _start_link
def _end_link(self):
1270
value = self.pop('link')
1271
context = self._getContext()
1272
if self.intextinput:
1273
context['textinput']['link'] = value
1275
context['image']['link'] = value
1276
_end_producturl = _end_link
1278
def _start_guid(self, attrsD):
1279
self.guidislink = (attrsD.get('ispermalink', 'true') == 'true')
1282
def _end_guid(self):
1283
value = self.pop('id')
1284
self._save('guidislink', self.guidislink and not self._getContext().has_key('link'))
1286
# guid acts as link, but only if 'ispermalink' is not present or is 'true',
1287
# and only if the item doesn't already have a link element
1288
self._save('link', value)
1290
def _start_title(self, attrsD):
1291
self.pushContent('title', attrsD, 'text/plain', self.infeed or self.inentry or self.insource)
1292
_start_dc_title = _start_title
1293
_start_media_title = _start_title
1295
def _end_title(self):
1296
value = self.popContent('title')
1297
context = self._getContext()
1298
if self.intextinput:
1299
context['textinput']['title'] = value
1301
context['image']['title'] = value
1302
_end_dc_title = _end_title
1303
_end_media_title = _end_title
1305
def _start_description(self, attrsD):
1306
context = self._getContext()
1307
if context.has_key('summary'):
1308
self._summaryKey = 'content'
1309
self._start_content(attrsD)
1311
self.pushContent('description', attrsD, 'text/html', self.infeed or self.inentry or self.insource)
1313
def _start_abstract(self, attrsD):
1314
self.pushContent('description', attrsD, 'text/plain', self.infeed or self.inentry or self.insource)
1316
def _end_description(self):
1317
if self._summaryKey == 'content':
1320
value = self.popContent('description')
1321
context = self._getContext()
1322
if self.intextinput:
1323
context['textinput']['description'] = value
1325
context['image']['description'] = value
1326
self._summaryKey = None
1327
_end_abstract = _end_description
1329
def _start_info(self, attrsD):
1330
self.pushContent('info', attrsD, 'text/plain', 1)
1331
_start_feedburner_browserfriendly = _start_info
1333
def _end_info(self):
1334
self.popContent('info')
1335
_end_feedburner_browserfriendly = _end_info
1337
def _start_generator(self, attrsD):
    '''Open a generator element; attributes (if any) become the
    generator_detail dict, with the href resolved.'''
    if attrsD:
        attrsD = self._itsAnHrefDamnIt(attrsD)
        if attrsD.has_key('href'):
            attrsD['href'] = self.resolveURI(attrsD['href'])
    self._getContext()['generator_detail'] = FeedParserDict(attrsD)
    self.push('generator', 1)
def _end_generator(self):
1346
value = self.pop('generator')
1347
context = self._getContext()
1348
if context.has_key('generator_detail'):
1349
context['generator_detail']['name'] = value
1351
def _start_admin_generatoragent(self, attrsD):
    '''Handle admin:generatorAgent; the value is on rdf:resource, so the
    element is pushed and immediately popped.'''
    self.push('generator', 1)
    value = self._getAttribute(attrsD, 'rdf:resource')
    if value:
        self.elementstack[-1][2].append(value)
    self.pop('generator')
    self._getContext()['generator_detail'] = FeedParserDict({'href': value})
def _start_admin_errorreportsto(self, attrsD):
1360
self.push('errorreportsto', 1)
1361
value = self._getAttribute(attrsD, 'rdf:resource')
1363
self.elementstack[-1][2].append(value)
1364
self.pop('errorreportsto')
1366
def _start_summary(self, attrsD):
1367
context = self._getContext()
1368
if context.has_key('summary'):
1369
self._summaryKey = 'content'
1370
self._start_content(attrsD)
1372
self._summaryKey = 'summary'
1373
self.pushContent(self._summaryKey, attrsD, 'text/plain', 1)
1374
_start_itunes_summary = _start_summary
1376
def _end_summary(self):
1377
if self._summaryKey == 'content':
1380
self.popContent(self._summaryKey or 'summary')
1381
self._summaryKey = None
1382
_end_itunes_summary = _end_summary
1384
def _start_enclosure(self, attrsD):
    '''Enter an enclosure/media:content element and record it in the
    context's enclosures list.'''
    self.inenclosure += 1
    attrsD = self._itsAnHrefDamnIt(attrsD)
    self._getContext().setdefault('enclosures', []).append(FeedParserDict(attrsD))
_start_media_content = _start_enclosure
def _end_enclosure(self):
1391
self.inenclosure -= 1
1392
_end_media_content = _end_enclosure
1394
def _start_media_thumbnail(self, attrsD):
    '''Attach a media:thumbnail to the current enclosure or entry.'''
    self.push('media:thumbnail', 1)
    if self.inenclosure:
        self.entries[-1]['enclosures'][-1]['thumbnail'] = FeedParserDict(attrsD)
    else:
        self.entries[-1]['thumbnail'] = FeedParserDict(attrsD)
def _end_media_thumbnail(self):
1403
self.pop('media:thumbnail')
1405
def _start_media_text(self,attrsD):
1406
self.push('media:text',1)
1408
def _end_media_text(self):
1409
value = self.pop('media:text')
1411
if self.inenclosure:
1412
self.entries[-1]['enclosures'][-1]['text'] = value
1414
self.entries[-1]['text'] = value
1416
def _start_media_people(self,attrsD):
1417
self.push('media:people',1)
1419
self.peoplerole = attrsD['role']
1420
except (SystemExit, KeyboardInterrupt):
1423
self.peoplerole = 'unknown'
1425
def _end_media_people(self):
1426
value = self.pop('media:people').split('|')
1428
if self.inenclosure:
1429
self.entries[-1]['enclosures'][-1].setdefault('roles', {})
1430
self.entries[-1]['enclosures'][-1].roles[self.peoplerole]=value
1432
self.entries[-1].setdefault('roles', {})
1433
self.entries[-1].roles[self.peoplerole]=value
1435
def _start_dtv_startnback(self,attrsD):
1436
self.push('dtv:startnback',1)
1438
def _end_dtv_startnback(self):
1439
self.feeddata['startnback'] = self.pop('dtv:startnback')
1441
def _start_dtv_librarylink(self,attrsD):
1442
self.push('dtv:librarylink',1)
1444
def _end_dtv_librarylink(self):
1445
self.feeddata['librarylink'] = self.pop('dtv:librarylink')
1447
def _start_dtv_releasedate(self,attrsD):
1448
self.push('dtv:releasedate',1)
1450
def _end_dtv_releasedate(self):
    '''Store dtv:releasedate (raw and parsed) on the current enclosure
    or entry.'''
    value = self.pop('dtv:releasedate')
    if self.inenclosure:
        self.entries[-1]['enclosures'][-1]['releasedate'] = value
        self.entries[-1]['enclosures'][-1]['releasedate_parsed'] = _parse_date(value)
    else:
        self.entries[-1]['releasedate'] = value
        self.entries[-1]['releasedate_parsed'] = _parse_date(value)
def _start_dtv_paymentlink(self,attrsD):
1462
self.contentparams['mode'] = 'xml'
1463
self.contentparams['type'] = 'application/xhtml+xml'
1464
self.push('dtv:paymentlink',1)
1466
if attrsD.has_key('url'):
1467
if self.inenclosure:
1468
self.entries[-1]['enclosures'][-1]['payment_url'] = attrsD['url']
1470
self.entries[-1]['payment_url'] = attrsD['url']
1472
def _end_dtv_paymentlink(self):
    '''Sanitize and store the dtv:paymentlink HTML on the current
    enclosure or entry.'''
    value = sanitizeHTML(self.pop('dtv:paymentlink'), self.encoding)
    self.contentparams.clear()
    if self.inenclosure:
        self.entries[-1]['enclosures'][-1]['payment_html'] = value
    else:
        self.entries[-1]['payment_html'] = value
def _start_source(self, attrsD):
1485
def _end_source(self):
1487
self._getContext()['source'] = copy.deepcopy(self.sourcedata)
1488
self.sourcedata.clear()
1490
def _start_content(self, attrsD):
1491
self.pushContent('content', attrsD, 'text/plain', 1)
1492
src = attrsD.get('src')
1494
self.contentparams['src'] = src
1495
self.push('content', 1)
1497
def _start_prodlink(self, attrsD):
1498
self.pushContent('content', attrsD, 'text/html', 1)
1500
def _start_body(self, attrsD):
1501
self.pushContent('content', attrsD, 'application/xhtml+xml', 1)
1502
_start_xhtml_body = _start_body
1504
def _start_content_encoded(self, attrsD):
1505
self.pushContent('content', attrsD, 'text/html', 1)
1506
_start_fullitem = _start_content_encoded
1508
def _end_content(self):
1509
copyToDescription = self.mapContentType(self.contentparams.get('type')) in (['text/plain'] + self.html_types)
1510
value = self.popContent('content')
1511
if copyToDescription:
1512
self._save('description', value)
1513
_end_body = _end_content
1514
_end_xhtml_body = _end_content
1515
_end_content_encoded = _end_content
1516
_end_fullitem = _end_content
1517
_end_prodlink = _end_content
1519
def _start_itunes_image(self, attrsD):
    '''Store an iTunes image href on the current context.'''
    self.push('itunes_image', 0)
    self._getContext()['image'] = FeedParserDict({'href': attrsD.get('href')})
def _start_itunes_link(self, attrsD):
    '''Store an iTunes link href on the current context.'''
    self.push('itunes_link', 0)
    self._getContext()['link'] = FeedParserDict({'href': attrsD.get('href')})
def _end_itunes_block(self):
1528
value = self.pop('itunes_block', 0)
1529
self._getContext()['itunes_block'] = (value == 'yes') and 1 or 0
1531
def _end_itunes_explicit(self):
1532
value = self.pop('itunes_explicit', 0)
1533
self._getContext()['itunes_explicit'] = (value == 'yes') and 1 or 0
1536
class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler):
    '''XML-parser-backed (strict) feed parser; falls back to the loose
    parser elsewhere when the document is not well-formed.'''

    def __init__(self, baseuri, baselang, encoding):
        if _debug: sys.stderr.write('trying StrictFeedParser\n')
        xml.sax.handler.ContentHandler.__init__(self)
        _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
        self.bozo = 0
        self.exc = None

    def startPrefixMapping(self, prefix, uri):
        self.trackNamespace(prefix, uri)

    def startElementNS(self, name, qname, attrs):
        namespace, localname = name
        lowernamespace = str(namespace or '').lower()
        if lowernamespace.find('backend.userland.com/rss') != -1:
            # match any backend.userland.com namespace
            namespace = 'http://backend.userland.com/rss'
            lowernamespace = namespace
        if qname and qname.find(':') > 0:
            givenprefix = qname.split(':')[0]
        else:
            givenprefix = None
        prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
        if givenprefix and (prefix == None or (prefix == '' and lowernamespace == '')) and not self.namespacesInUse.has_key(givenprefix):
            raise UndeclaredNamespace("'%s' is not associated with a namespace" % givenprefix)
        if prefix:
            localname = prefix + ':' + localname
        localname = str(localname).lower()
        if _debug: sys.stderr.write('startElementNS: qname = %s, namespace = %s, givenprefix = %s, prefix = %s, attrs = %s, localname = %s\n' % (qname, namespace, givenprefix, prefix, attrs.items(), localname))

        # qname implementation is horribly broken in Python 2.1 (it
        # doesn't report any), and slightly broken in Python 2.2 (it
        # doesn't report the xml: namespace). So we match up namespaces
        # with a known list first, and then possibly override them with
        # the qnames the SAX parser gives us (if indeed it gives us any
        # at all). Thanks to MatejC for helping me test this and
        # tirelessly telling me that it didn't work yet.
        attrsD = {}
        for (namespace, attrlocalname), attrvalue in attrs._attrs.items():
            lowernamespace = (namespace or '').lower()
            prefix = self._matchnamespaces.get(lowernamespace, '')
            if prefix:
                attrlocalname = prefix + ':' + attrlocalname
            attrsD[str(attrlocalname).lower()] = attrvalue
        for qname in attrs.getQNames():
            attrsD[str(qname).lower()] = attrs.getValueByQName(qname)
        self.unknown_starttag(localname, attrsD.items())

    def characters(self, text):
        self.handle_data(text)

    def endElementNS(self, name, qname):
        namespace, localname = name
        lowernamespace = str(namespace or '').lower()
        if qname and qname.find(':') > 0:
            givenprefix = qname.split(':')[0]
        else:
            givenprefix = ''
        prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
        if prefix:
            localname = prefix + ':' + localname
        localname = str(localname).lower()
        self.unknown_endtag(localname)

    def error(self, exc):
        # non-fatal parse error: mark the feed as bozo but keep going
        self.bozo = 1
        self.exc = exc

    def fatalError(self, exc):
        self.error(exc)
        raise exc
class _BaseHTMLProcessor(sgmllib.SGMLParser):
    '''SGML-based HTML walker that reconstructs the markup it is fed.

    Subclasses override the handle_*/unknown_* callbacks to filter or
    rewrite markup; output() returns the accumulated result.'''

    # tags that may not have an end tag in HTML
    elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
      'img', 'input', 'isindex', 'link', 'meta', 'param']

    def __init__(self, encoding):
        self.encoding = encoding
        if _debug: sys.stderr.write('entering BaseHTMLProcessor, encoding=%s\n' % self.encoding)
        sgmllib.SGMLParser.__init__(self)

    def reset(self):
        self.pieces = []
        sgmllib.SGMLParser.reset(self)

    def _shorttag_replace(self, match):
        tag = match.group(1)
        if tag in self.elements_no_end_tag:
            return '<' + tag + ' />'
        else:
            return '<' + tag + '></' + tag + '>'

    def feed(self, data):
        data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'&lt;!\1', data)
        #data = re.sub(r'<(\S+?)\s*?/>', self._shorttag_replace, data) # bug [ 1399464 ] Bad regexp for _shorttag_replace
        data = re.sub(r'<([^<\s]+?)\s*/>', self._shorttag_replace, data)
        data = data.replace('&#39;', "'")
        data = data.replace('&#34;', '"')
        if self.encoding and type(data) == type(u''):
            data = data.encode(self.encoding)
        sgmllib.SGMLParser.feed(self, data)

    def normalize_attrs(self, attrs):
        # utility method to be called by descendants
        attrs = [(k.lower(), v) for k, v in attrs]
        attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
        return attrs

    def parse_starttag(self, i):
        retval = sgmllib.SGMLParser.parse_starttag(self, i)
        try:
            # treat xhtml-style empty tags ('<br/>') as start + end tag
            if self.get_starttag_text()[-2:] == "/>":
                self.finish_endtag(self.lasttag)
        except (SystemExit, KeyboardInterrupt):
            raise
        except:
            pass
        return retval

    def unknown_starttag(self, tag, attrs):
        # called for each start tag
        # attrs is a list of (attr, value) tuples
        # e.g. for <pre class='screen'>, tag='pre', attrs=[('class', 'screen')]
        if _debug: sys.stderr.write('_BaseHTMLProcessor, unknown_starttag, tag=%s\n' % tag)
        uattrs = []
        # thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds
        for key, value in attrs:
            if type(value) != type(u''):
                value = unicode(value, self.encoding)
            uattrs.append((unicode(key, self.encoding), value))
        strattrs = u''.join([u' %s="%s"' % (key, value) for key, value in uattrs]).encode(self.encoding)
        if tag in self.elements_no_end_tag:
            self.pieces.append('<%(tag)s%(strattrs)s />' % locals())
        else:
            self.pieces.append('<%(tag)s%(strattrs)s>' % locals())

    def unknown_endtag(self, tag):
        # called for each end tag, e.g. for </pre>, tag will be 'pre'
        # Reconstruct the original end tag.
        if tag not in self.elements_no_end_tag:
            self.pieces.append("</%(tag)s>" % locals())

    def handle_charref(self, ref):
        # called for each character reference, e.g. for '&#160;', ref will be '160'
        # Reconstruct the original character reference.
        self.pieces.append('&#%(ref)s;' % locals())

    def handle_entityref(self, ref):
        # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
        # Reconstruct the original entity reference.
        self.pieces.append('&%(ref)s;' % locals())

    def handle_data(self, text):
        # called for each block of plain text, i.e. outside of any tag and
        # not containing any character or entity references
        # Store the original text verbatim.
        if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_text, text=%s\n' % text)
        self.pieces.append(text)

    def handle_comment(self, text):
        # called for each HTML comment, e.g. <!-- insert Javascript code here -->
        # Reconstruct the original comment.
        self.pieces.append('<!--%(text)s-->' % locals())

    def handle_pi(self, text):
        # called for each processing instruction, e.g. <?instruction>
        # Reconstruct original processing instruction.
        self.pieces.append('<?%(text)s>' % locals())

    def handle_decl(self, text):
        # called for the DOCTYPE, if present, e.g.
        # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
        #     "http://www.w3.org/TR/html4/loose.dtd">
        # Reconstruct original DOCTYPE
        self.pieces.append('<!%(text)s>' % locals())

    _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
    def _scan_name(self, i, declstartpos):
        rawdata = self.rawdata
        n = len(rawdata)
        if i == n:
            return None, -1
        m = self._new_declname_match(rawdata, i)
        if m:
            s = m.group()
            name = s.strip()
            if (i + len(s)) == n:
                return None, -1 # end of buffer
            return name.lower(), m.end()
        else:
            self.handle_data(rawdata)
#            self.updatepos(declstartpos, i)
            return None, -1

    def output(self):
        '''Return processed HTML as a single string'''
        return ''.join([str(p) for p in self.pieces])
class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor):
    '''sgmllib-based (loose) feed parser, used when strict XML parsing
    fails.'''

    def __init__(self, baseuri, baselang, encoding):
        sgmllib.SGMLParser.__init__(self)
        _FeedParserMixin.__init__(self, baseuri, baselang, encoding)

    def decodeEntities(self, element, data):
        # normalize numeric character references to their named forms
        data = data.replace('&#60;', '&lt;')
        data = data.replace('&#x3C;', '&lt;')
        data = data.replace('&#62;', '&gt;')
        data = data.replace('&#x3E;', '&gt;')
        data = data.replace('&#38;', '&amp;')
        data = data.replace('&#x26;', '&amp;')
        data = data.replace('&#34;', '&quot;')
        data = data.replace('&#x22;', '&quot;')
        data = data.replace('&#39;', '&apos;')
        data = data.replace('&#x27;', '&apos;')
        # for non-XML content types, fully decode the named entities too
        if self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
            data = data.replace('&lt;', '<')
            data = data.replace('&gt;', '>')
            data = data.replace('&amp;', '&')
            data = data.replace('&quot;', '"')
            data = data.replace('&apos;', "'")
        return data
class _RelativeURIResolver(_BaseHTMLProcessor):
    '''HTML processor that resolves relative URIs in (tag, attribute)
    pairs known to contain URIs.'''

    relative_uris = [('a', 'href'),
                     ('applet', 'codebase'),
                     ('area', 'href'),
                     ('blockquote', 'cite'),
                     ('body', 'background'),
                     ('del', 'cite'),
                     ('form', 'action'),
                     ('frame', 'longdesc'),
                     ('frame', 'src'),
                     ('iframe', 'longdesc'),
                     ('iframe', 'src'),
                     ('head', 'profile'),
                     ('img', 'longdesc'),
                     ('img', 'src'),
                     ('img', 'usemap'),
                     ('input', 'src'),
                     ('input', 'usemap'),
                     ('ins', 'cite'),
                     ('link', 'href'),
                     ('object', 'classid'),
                     ('object', 'codebase'),
                     ('object', 'data'),
                     ('object', 'usemap'),
                     ('q', 'cite'),
                     ('script', 'src')]

    def __init__(self, baseuri, encoding):
        _BaseHTMLProcessor.__init__(self, encoding)
        self.baseuri = baseuri

    def resolveURI(self, uri):
        return _urljoin(self.baseuri, uri)

    def unknown_starttag(self, tag, attrs):
        attrs = self.normalize_attrs(attrs)
        attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs]
        _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)
def _resolveRelativeURIs(htmlSource, baseURI, encoding):
    '''Resolve all relative URIs in an HTML fragment against baseURI.'''
    if _debug: sys.stderr.write('entering _resolveRelativeURIs\n')
    p = _RelativeURIResolver(baseURI, encoding)
    p.feed(htmlSource)
    return p.output()
class _HTMLSanitizer(_BaseHTMLProcessor):
    '''HTML processor that drops all elements/attributes not on the
    whitelist, and all data inside script/applet elements.'''

    acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', 'big',
      'blockquote', 'br', 'button', 'caption', 'center', 'cite', 'code', 'col',
      'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'fieldset',
      'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input',
      'ins', 'kbd', 'label', 'legend', 'li', 'map', 'menu', 'ol', 'optgroup',
      'option', 'p', 'pre', 'q', 's', 'samp', 'select', 'small', 'span', 'strike',
      'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'tfoot', 'th',
      'thead', 'tr', 'tt', 'u', 'ul', 'var']

    acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
      'action', 'align', 'alt', 'axis', 'border', 'cellpadding', 'cellspacing',
      'char', 'charoff', 'charset', 'checked', 'cite', 'class', 'clear', 'cols',
      'colspan', 'color', 'compact', 'coords', 'datetime', 'dir', 'disabled',
      'enctype', 'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace',
      'id', 'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method',
      'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly',
      'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
      'span', 'src', 'start', 'summary', 'tabindex', 'title', 'type',
      'usemap', 'valign', 'value', 'vspace', 'width']

    unacceptable_elements_with_end_tag = ['script', 'applet']

    def reset(self):
        _BaseHTMLProcessor.reset(self)
        self.unacceptablestack = 0

    def unknown_starttag(self, tag, attrs):
        if not tag in self.acceptable_elements:
            if tag in self.unacceptable_elements_with_end_tag:
                self.unacceptablestack += 1
            return
        attrs = self.normalize_attrs(attrs)
        attrs = [(key, value) for key, value in attrs if key in self.acceptable_attributes]
        _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)

    def unknown_endtag(self, tag):
        if not tag in self.acceptable_elements:
            if tag in self.unacceptable_elements_with_end_tag:
                self.unacceptablestack -= 1
            return
        _BaseHTMLProcessor.unknown_endtag(self, tag)

    def handle_pi(self, text):
        pass

    def handle_decl(self, text):
        pass

    def handle_data(self, text):
        if not self.unacceptablestack:
            _BaseHTMLProcessor.handle_data(self, text)
def sanitizeHTML(htmlSource, encoding):
    '''Sanitize an HTML fragment through the whitelist sanitizer, then
    optionally clean it up with an installed Tidy interface.'''
    p = _HTMLSanitizer(encoding)
    p.feed(htmlSource)
    data = p.output()
    if TIDY_MARKUP:
        # loop through list of preferred Tidy interfaces looking for one that's installed,
        # then set up a common _tidy function to wrap the interface-specific API.
        _tidy = None
        for tidy_interface in PREFERRED_TIDY_INTERFACES:
            try:
                if tidy_interface == "uTidy":
                    from tidy import parseString as _utidy
                    def _tidy(data, **kwargs):
                        return str(_utidy(data, **kwargs))
                    break
                elif tidy_interface == "mxTidy":
                    from mx.Tidy import Tidy as _mxtidy
                    def _tidy(data, **kwargs):
                        nerrors, nwarnings, data, errordata = _mxtidy.tidy(data, **kwargs)
                        return data
                    break
            except (SystemExit, KeyboardInterrupt):
                raise
            except:
                pass
        if _tidy:
            utf8 = type(data) == type(u'')
            if utf8:
                data = data.encode('utf-8')
            data = _tidy(data, output_xhtml=1, numeric_entities=1, wrap=0, char_encoding="utf8")
            if utf8:
                data = unicode(data, 'utf-8')
            # strip everything outside the body element
            if data.count('<body'):
                data = data.split('<body', 1)[1]
                if data.count('>'):
                    data = data.split('>', 1)[1]
            if data.count('</body'):
                data = data.split('</body', 1)[0]
    data = data.strip().replace('\r\n', '\n')
    return data
class _FeedURLHandler(urllib2.HTTPDigestAuthHandler, urllib2.HTTPRedirectHandler, urllib2.HTTPDefaultErrorHandler):
    '''urllib2 handler that preserves the HTTP status code on the
    returned stream and retries basic-auth failures with digest auth.'''

    def http_error_default(self, req, fp, code, msg, headers):
        if ((code / 100) == 3) and (code != 304):
            return self.http_error_302(req, fp, code, msg, headers)
        infourl = urllib.addinfourl(fp, headers, req.get_full_url())
        infourl.status = code
        return infourl

    def http_error_302(self, req, fp, code, msg, headers):
        if headers.dict.has_key('location'):
            infourl = urllib2.HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers)
        else:
            infourl = urllib.addinfourl(fp, headers, req.get_full_url())
        if not hasattr(infourl, 'status'):
            infourl.status = code
        return infourl

    def http_error_301(self, req, fp, code, msg, headers):
        if headers.dict.has_key('location'):
            infourl = urllib2.HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, headers)
        else:
            infourl = urllib.addinfourl(fp, headers, req.get_full_url())
        if not hasattr(infourl, 'status'):
            infourl.status = code
        return infourl

    http_error_300 = http_error_302
    http_error_303 = http_error_302
    http_error_307 = http_error_302

    def http_error_401(self, req, fp, code, msg, headers):
        # Check if
        # - server requires digest auth, AND
        # - we tried (unsuccessfully) with basic auth, AND
        # - we're using Python 2.3.3 or later (digest auth is irreparably broken in earlier versions)
        # If all conditions hold, parse authentication information
        # out of the Authorization header we sent the first time
        # (for the username and password) and the WWW-Authenticate
        # header the server sent back (for the realm) and retry
        # the request with the appropriate digest auth headers instead.
        # This evil genius hack has been brought to you by Aaron Swartz.
        host = urlparse.urlparse(req.get_full_url())[1]
        try:
            assert sys.version.split()[0] >= '2.3.3'
            assert base64 != None
            user, passw = base64.decodestring(req.headers['Authorization'].split(' ')[1]).split(':')
            realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0]
            self.add_password(realm, host, user, passw)
            retry = self.http_error_auth_reqed('www-authenticate', host, req, headers)
            self.reset_retry_count()
            return retry
        except (SystemExit, KeyboardInterrupt):
            raise
        except:
            return self.http_error_default(req, fp, code, msg, headers)
def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers):
    """URL, filename, or string --> stream

    This function lets you define parsers that take any input source
    (URL, pathname to local or network file, or actual data as a string)
    and deal with it in a uniform manner.  Returned object is guaranteed
    to have all the basic stdio read methods (read, readline, readlines).
    Just .close() the object when you're done with it.

    If the etag argument is supplied, it will be used as the value of an
    If-None-Match request header.

    If the modified argument is supplied, it must be a tuple of 9 integers
    as returned by gmtime() in the standard Python time module. This MUST
    be in GMT (Greenwich Mean Time). The formatted date/time will be used
    as the value of an If-Modified-Since request header.

    If the agent argument is supplied, it will be used as the value of a
    User-Agent request header.

    If the referrer argument is supplied, it will be used as the value of a
    Referer[sic] request header.

    If handlers is supplied, it is a list of handlers used to build a
    urllib2 opener.
    """

    if hasattr(url_file_stream_or_string, 'read'):
        return url_file_stream_or_string

    if url_file_stream_or_string == '-':
        return sys.stdin

    if urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp'):
        if not agent:
            agent = USER_AGENT
        # test for inline user:password for basic auth
        auth = None
        if base64:
            urltype, rest = urllib.splittype(url_file_stream_or_string)
            realhost, rest = urllib.splithost(rest)
            if realhost:
                user_passwd, realhost = urllib.splituser(realhost)
                if user_passwd:
                    url_file_stream_or_string = '%s://%s%s' % (urltype, realhost, rest)
                    auth = base64.encodestring(user_passwd).strip()
        # try to open with urllib2 (to use optional headers)
        request = urllib2.Request(url_file_stream_or_string)
        request.add_header('User-Agent', agent)
        if etag:
            request.add_header('If-None-Match', etag)
        if modified:
            # format into an RFC 1123-compliant timestamp. We can't use
            # time.strftime() since the %a and %b directives can be affected
            # by the current locale, but RFC 2616 states that dates must be
            # in English.
            short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
            months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
            request.add_header('If-Modified-Since', '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5]))
        if referrer:
            request.add_header('Referer', referrer)
        if gzip and zlib:
            request.add_header('Accept-encoding', 'gzip, deflate')
        elif gzip:
            request.add_header('Accept-encoding', 'gzip')
        elif zlib:
            request.add_header('Accept-encoding', 'deflate')
        else:
            request.add_header('Accept-encoding', '')
        if auth:
            request.add_header('Authorization', 'Basic %s' % auth)
        if ACCEPT_HEADER:
            request.add_header('Accept', ACCEPT_HEADER)
        request.add_header('A-IM', 'feed') # RFC 3229 support
        opener = apply(urllib2.build_opener, tuple([_FeedURLHandler()] + handlers))
        opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent
        try:
            return opener.open(request)
        finally:
            opener.close() # JohnD

    # try to open with native open function (if url_file_stream_or_string is a filename)
    try:
        return open(url_file_stream_or_string)
    except (SystemExit, KeyboardInterrupt):
        raise
    except:
        pass

    # treat url_file_stream_or_string as string
    return _StringIO(str(url_file_stream_or_string))
def registerDateHandler(func):
    '''Register a date handler function (takes string, returns 9-tuple date in GMT)

    Handlers are tried in reverse registration order (most recently
    registered first), which is why new handlers go to the front.
    '''
    _date_handlers.insert(0, func)
2050
# ISO-8601 date parsing routines written by Fazal Majid.
2051
# The ISO 8601 standard is very convoluted and irregular - a full ISO 8601
2052
# parser is beyond the scope of feedparser and would be a worthwhile addition
2053
# to the Python library.
2054
# A single regular expression cannot parse ISO 8601 date formats into groups
2055
# as the standard is highly irregular (for instance is 030104 2003-01-04 or
2056
# 0301-04-01), so we use templates instead.
2057
# Please note the order in templates is significant because we need a
2059
_iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-MM', 'YYYY-?OOO',
2060
'YY-?MM-?DD', 'YY-?OOO', 'YYYY',
2061
'-YY-?MM', '-OOO', '-YY',
2067
'YYYY', r'(?P<year>\d{4})').replace(
2068
'YY', r'(?P<year>\d\d)').replace(
2069
'MM', r'(?P<month>[01]\d)').replace(
2070
'DD', r'(?P<day>[0123]\d)').replace(
2071
'OOO', r'(?P<ordinal>[0123]\d\d)').replace(
2072
'CC', r'(?P<century>\d\d$)')
2073
+ r'(T?(?P<hour>\d{2}):(?P<minute>\d{2})'
2074
+ r'(:(?P<second>\d{2}))?'
2075
+ r'(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?'
2076
for tmpl in _iso8601_tmpl]
2078
_iso8601_matches = [re.compile(regex).match for regex in _iso8601_re]
2080
def _parse_date_iso8601(dateString):
    '''Parse a variety of ISO-8601-compatible formats like 20040105'''
    m = None
    for _iso8601_match in _iso8601_matches:
        m = _iso8601_match(dateString)
        if m: break
    if not m: return
    if m.span() == (0, 0): return
    params = m.groupdict()
    ordinal = params.get('ordinal', 0)
    if ordinal:
        ordinal = int(ordinal)
    else:
        ordinal = 0
    year = params.get('year', '--')
    if not year or year == '--':
        year = time.gmtime()[0]
    elif len(year) == 2:
        # ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993
        year = 100 * int(time.gmtime()[0] / 100) + int(year)
    else:
        year = int(year)
    month = params.get('month', '-')
    if not month or month == '-':
        # ordinals are month-less dates
        if ordinal:
            month = 1
        else:
            month = time.gmtime()[1]
    month = int(month)
    day = params.get('day', 0)
    if not day:
        # ordinals are NOT normalized by mktime, we simulate them
        # by setting month=1, day=ordinal
        if ordinal:
            day = ordinal
        elif params.get('century', 0) or \
                 params.get('year', 0) or params.get('month', 0):
            day = 1
        else:
            day = time.gmtime()[2]
    else:
        day = int(day)
    # special case of the century - is the first year of the 21st century
    # 2000 or 2001 ? The debate goes on...
    if 'century' in params.keys():
        year = (int(params['century']) - 1) * 100 + 1
    # in ISO 8601 most fields are optional
    for field in ['hour', 'minute', 'second', 'tzhour', 'tzmin']:
        if not params.get(field, None):
            params[field] = 0
    hour = int(params.get('hour', 0))
    minute = int(params.get('minute', 0))
    second = int(params.get('second', 0))
    # weekday is normalized by mktime(), we can ignore it
    weekday = 0
    # daylight savings is complex, but not needed for feedparser's purposes
    # as time zones, if specified, include mention of whether it is active
    # (e.g. PST vs. PDT, CET). Using -1 is implementation-dependent and
    # and most implementations have DST bugs
    daylight_savings_flag = 0
    tm = [year, month, day, hour, minute, second, weekday,
          ordinal, daylight_savings_flag]
    # ISO 8601 time zone adjustments
    tz = params.get('tz')
    if tz and tz != 'Z':
        if tz[0] == '-':
            tm[3] += int(params.get('tzhour', 0))
            tm[4] += int(params.get('tzmin', 0))
        elif tz[0] == '+':
            tm[3] -= int(params.get('tzhour', 0))
            tm[4] -= int(params.get('tzmin', 0))
        else:
            return None
    # Python's time.mktime() is a wrapper around the ANSI C mktime(3c)
    # which is guaranteed to normalize d/m/y/h/m/s.
    # Many implementations have bugs, but we'll pretend they don't.
    return time.localtime(time.mktime(tm))
registerDateHandler(_parse_date_iso8601)
2160
# 8-bit date handling routines written by ytrewq1.
2161
_korean_year = u'\ub144' # b3e2 in euc-kr
2162
_korean_month = u'\uc6d4' # bff9 in euc-kr
2163
_korean_day = u'\uc77c' # c0cf in euc-kr
2164
_korean_am = u'\uc624\uc804' # bfc0 c0fc in euc-kr
2165
_korean_pm = u'\uc624\ud6c4' # bfc0 c8c4 in euc-kr
2167
_korean_onblog_date_re = \
2168
re.compile('(\d{4})%s\s+(\d{2})%s\s+(\d{2})%s\s+(\d{2}):(\d{2}):(\d{2})' % \
2169
(_korean_year, _korean_month, _korean_day))
2170
_korean_nate_date_re = \
2171
re.compile(u'(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})' % \
2172
(_korean_am, _korean_pm))
2173
def _parse_date_onblog(dateString):
    '''Parse a string according to the OnBlog 8-bit date format'''
    m = _korean_onblog_date_re.match(dateString)
    if not m: return
    # OnBlog timestamps are Korean local time; rewrite as W3DTF with +09:00
    # and delegate to the W3DTF handler.
    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
                {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
                 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\
                 'zonediff': '+09:00'}
    if _debug: sys.stderr.write('OnBlog date parsed as: %s\n' % w3dtfdate)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_onblog)
2185
def _parse_date_nate(dateString):
    '''Parse a string according to the Nate 8-bit date format'''
    m = _korean_nate_date_re.match(dateString)
    if not m: return
    hour = int(m.group(5))
    ampm = m.group(4)
    # convert 12-hour clock with Korean am/pm marker to 24-hour clock
    if (ampm == _korean_pm):
        hour += 12
    hour = str(hour)
    if len(hour) == 1:
        hour = '0' + hour
    # Nate timestamps are Korean local time; rewrite as W3DTF with +09:00
    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
                {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
                 'hour': hour, 'minute': m.group(6), 'second': m.group(7),\
                 'zonediff': '+09:00'}
    if _debug: sys.stderr.write('Nate date parsed as: %s\n' % w3dtfdate)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_nate)
2205
# 'YYYY-MM-DD HH:MM:SS[.fff]' as produced by MS SQL Server
_mssql_date_re = \
    re.compile('(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})(\.\d+)?')
def _parse_date_mssql(dateString):
    '''Parse a string according to the MS SQL date format'''
    m = _mssql_date_re.match(dateString)
    if not m: return
    # no zone info in the source string; assume +09:00 like the other
    # 8-bit handlers in this family, then delegate to the W3DTF handler
    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
                {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
                 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\
                 'zonediff': '+09:00'}
    if _debug: sys.stderr.write('MS SQL date parsed as: %s\n' % w3dtfdate)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_mssql)
2218
# Unicode strings for Greek date strings
2221
u'\u0399\u03b1\u03bd': u'Jan', # c9e1ed in iso-8859-7
2222
u'\u03a6\u03b5\u03b2': u'Feb', # d6e5e2 in iso-8859-7
2223
u'\u039c\u03ac\u03ce': u'Mar', # ccdcfe in iso-8859-7
2224
u'\u039c\u03b1\u03ce': u'Mar', # cce1fe in iso-8859-7
2225
u'\u0391\u03c0\u03c1': u'Apr', # c1f0f1 in iso-8859-7
2226
u'\u039c\u03ac\u03b9': u'May', # ccdce9 in iso-8859-7
2227
u'\u039c\u03b1\u03ca': u'May', # cce1fa in iso-8859-7
2228
u'\u039c\u03b1\u03b9': u'May', # cce1e9 in iso-8859-7
2229
u'\u0399\u03bf\u03cd\u03bd': u'Jun', # c9effded in iso-8859-7
2230
u'\u0399\u03bf\u03bd': u'Jun', # c9efed in iso-8859-7
2231
u'\u0399\u03bf\u03cd\u03bb': u'Jul', # c9effdeb in iso-8859-7
2232
u'\u0399\u03bf\u03bb': u'Jul', # c9f9eb in iso-8859-7
2233
u'\u0391\u03cd\u03b3': u'Aug', # c1fde3 in iso-8859-7
2234
u'\u0391\u03c5\u03b3': u'Aug', # c1f5e3 in iso-8859-7
2235
u'\u03a3\u03b5\u03c0': u'Sep', # d3e5f0 in iso-8859-7
2236
u'\u039f\u03ba\u03c4': u'Oct', # cfeaf4 in iso-8859-7
2237
u'\u039d\u03bf\u03ad': u'Nov', # cdefdd in iso-8859-7
2238
u'\u039d\u03bf\u03b5': u'Nov', # cdefe5 in iso-8859-7
2239
u'\u0394\u03b5\u03ba': u'Dec', # c4e5ea in iso-8859-7
2244
u'\u039a\u03c5\u03c1': u'Sun', # caf5f1 in iso-8859-7
2245
u'\u0394\u03b5\u03c5': u'Mon', # c4e5f5 in iso-8859-7
2246
u'\u03a4\u03c1\u03b9': u'Tue', # d4f1e9 in iso-8859-7
2247
u'\u03a4\u03b5\u03c4': u'Wed', # d4e5f4 in iso-8859-7
2248
u'\u03a0\u03b5\u03bc': u'Thu', # d0e5ec in iso-8859-7
2249
u'\u03a0\u03b1\u03c1': u'Fri', # d0e1f1 in iso-8859-7
2250
u'\u03a3\u03b1\u03b2': u'Sat', # d3e1e2 in iso-8859-7
2253
_greek_date_format_re = \
2254
re.compile(u'([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)')
2256
def _parse_date_greek(dateString):
    '''Parse a string according to a Greek 8-bit date format.'''
    m = _greek_date_format_re.match(dateString)
    if not m: return
    try:
        # translate Greek weekday/month names to English
        wday = _greek_wdays[m.group(1)]
        month = _greek_months[m.group(3)]
    except (SystemExit, KeyboardInterrupt):
        raise
    except:
        # unknown weekday/month spelling; let another handler try
        return
    # rebuild as an RFC 822 date and delegate to the RFC 822 handler
    rfc822date = '%(wday)s, %(day)s %(month)s %(year)s %(hour)s:%(minute)s:%(second)s %(zonediff)s' % \
                 {'wday': wday, 'day': m.group(2), 'month': month, 'year': m.group(4),\
                  'hour': m.group(5), 'minute': m.group(6), 'second': m.group(7),\
                  'zonediff': m.group(8)}
    if _debug: sys.stderr.write('Greek date parsed as: %s\n' % rfc822date)
    return _parse_date_rfc822(rfc822date)
registerDateHandler(_parse_date_greek)
2275
# Unicode strings for Hungarian date strings
2276
_hungarian_months = \
2278
u'janu\u00e1r': u'01', # e1 in iso-8859-2
2279
u'febru\u00e1ri': u'02', # e1 in iso-8859-2
2280
u'm\u00e1rcius': u'03', # e1 in iso-8859-2
2281
u'\u00e1prilis': u'04', # e1 in iso-8859-2
2282
u'm\u00e1ujus': u'05', # e1 in iso-8859-2
2283
u'j\u00fanius': u'06', # fa in iso-8859-2
2284
u'j\u00falius': u'07', # fa in iso-8859-2
2285
u'augusztus': u'08',
2286
u'szeptember': u'09',
2287
u'okt\u00f3ber': u'10', # f3 in iso-8859-2
2292
_hungarian_date_format_re = \
2293
re.compile(u'(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})((\+|-)(\d{,2}:\d{2}))')
2295
def _parse_date_hungarian(dateString):
    '''Parse a string according to a Hungarian 8-bit date format.'''
    m = _hungarian_date_format_re.match(dateString)
    if not m: return
    try:
        # translate the month name and zero-pad single-digit day/hour
        month = _hungarian_months[m.group(2)]
        day = m.group(3)
        if len(day) == 1:
            day = '0' + day
        hour = m.group(4)
        if len(hour) == 1:
            hour = '0' + hour
    except (SystemExit, KeyboardInterrupt):
        raise
    except:
        # unknown month name; let another handler try
        return
    # rebuild as W3DTF (no seconds in the source format) and delegate
    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s%(zonediff)s' % \
                {'year': m.group(1), 'month': month, 'day': day,\
                 'hour': hour, 'minute': m.group(5),\
                 'zonediff': m.group(6)}
    if _debug: sys.stderr.write('Hungarian date parsed as: %s\n' % w3dtfdate)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_hungarian)
2319
# W3DTF-style date parsing adapted from PyXML xml.utils.iso8601, written by
2320
# Drake and licensed under the Python license. Removed all range checking
2321
# for month, day, hour, minute, and second, since mktime will normalize
2323
def _parse_date_w3dtf(dateString):
2324
def __extract_date(m):
2325
year = int(m.group('year'))
2327
year = 100 * int(time.gmtime()[0] / 100) + int(year)
2330
julian = m.group('julian')
2332
julian = int(julian)
2333
month = julian / 30 + 1
2334
day = julian % 30 + 1
2336
while jday != julian:
2337
t = time.mktime((year, month, day, 0, 0, 0, 0, 0, 0))
2338
jday = time.gmtime(t)[-2]
2339
diff = abs(jday - julian)
2351
return year, month, day
2352
month = m.group('month')
2358
day = m.group('day')
2363
return year, month, day
2365
def __extract_time(m):
2368
hours = m.group('hours')
2372
minutes = int(m.group('minutes'))
2373
seconds = m.group('seconds')
2375
seconds = int(seconds)
2378
return hours, minutes, seconds
2380
def __extract_tzd(m):
2381
'''Return the Time Zone Designator as an offset in seconds from UTC.'''
2384
tzd = m.group('tzd')
2389
hours = int(m.group('tzdhours'))
2390
minutes = m.group('tzdminutes')
2392
minutes = int(minutes)
2395
offset = (hours*60 + minutes) * 60
2400
__date_re = ('(?P<year>\d\d\d\d)'
2402
'(?:(?P<julian>\d\d\d)'
2403
'|(?P<month>\d\d)(?:(?P=dsep)(?P<day>\d\d))?))?')
2404
__tzd_re = '(?P<tzd>[-+](?P<tzdhours>\d\d)(?::?(?P<tzdminutes>\d\d))|Z)'
2405
__tzd_rx = re.compile(__tzd_re)
2406
__time_re = ('(?P<hours>\d\d)(?P<tsep>:|)(?P<minutes>\d\d)'
2407
'(?:(?P=tsep)(?P<seconds>\d\d(?:[.,]\d+)?))?'
2409
__datetime_re = '%s(?:T%s)?' % (__date_re, __time_re)
2410
__datetime_rx = re.compile(__datetime_re)
2411
m = __datetime_rx.match(dateString)
2412
if (m is None) or (m.group() != dateString): return
2413
gmt = __extract_date(m) + __extract_time(m) + (0, 0, 0)
2414
if gmt[0] == 0: return
2415
return time.gmtime(time.mktime(gmt) + __extract_tzd(m) - time.timezone)
2416
registerDateHandler(_parse_date_w3dtf)
2418
def _parse_date_rfc822(dateString):
    '''Parse an RFC822, RFC1123, RFC2822, or asctime-style date'''
    data = dateString.split()
    # drop a leading weekday name ('Sun,', 'Sun.', or bare 'sun')
    if data[0][-1] in (',', '.') or data[0].lower() in rfc822._daynames:
        del data[0]
    if len(data) == 4:
        s = data[3]
        i = s.find('+')
        if i > 0:
            # time and zone offset are fused ('10:14:55+0000'); split them
            data[3:] = [s[:i], s[i+1:]]
        else:
            data.append('')
        dateString = " ".join(data)
    if len(data) < 5:
        # date-only string; assume midnight GMT
        dateString += ' 00:00:00 GMT'
    tm = rfc822.parsedate_tz(dateString)
    if tm:
        return time.gmtime(rfc822.mktime_tz(tm))
# rfc822.py defines several time zones, but we define some extra ones.
# 'ET' is equivalent to 'EST', etc.
_additional_timezones = {'AT': -400, 'ET': -500, 'CT': -600, 'MT': -700, 'PT': -800}
rfc822._timezones.update(_additional_timezones)
registerDateHandler(_parse_date_rfc822)
2442
def _parse_date(dateString):
2443
'''Parses a variety of date formats into a 9-tuple in GMT'''
2444
for handler in _date_handlers:
2446
date9tuple = handler(dateString)
2447
if not date9tuple: continue
2448
if len(date9tuple) != 9:
2449
if _debug: sys.stderr.write('date handler function must return 9-tuple\n')
2451
map(int, date9tuple)
2453
except Exception, e:
2454
if _debug: sys.stderr.write('%s raised %s\n' % (handler.__name__, repr(e)))
2458
def _getCharacterEncoding(http_headers, xml_data):
    '''Get the character encoding of the XML document

    http_headers is a dictionary
    xml_data is a raw string (not Unicode)

    This is so much trickier than it sounds, it's not even funny.
    According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type
    is application/xml, application/*+xml,
    application/xml-external-parsed-entity, or application/xml-dtd,
    the encoding given in the charset parameter of the HTTP Content-Type
    takes precedence over the encoding given in the XML prefix within the
    document, and defaults to 'utf-8' if neither are specified.  But, if
    the HTTP Content-Type is text/xml, text/*+xml, or
    text/xml-external-parsed-entity, the encoding given in the XML prefix
    within the document is ALWAYS IGNORED and only the encoding given in
    the charset parameter of the HTTP Content-Type header should be
    respected, and it defaults to 'us-ascii' if not specified.

    Furthermore, discussion on the atom-syntax mailing list with the
    author of RFC 3023 leads me to the conclusion that any document
    served with a Content-Type of text/* and no charset parameter
    must be treated as us-ascii.  (We now do this.)  And also that it
    must always be flagged as non-well-formed.  (We now do this too.)

    If Content-Type is unspecified (input was local file or non-HTTP source)
    or unrecognized (server just got it totally wrong), then go by the
    encoding given in the XML prefix of the document and default to
    'iso-8859-1' as per the HTTP specification (RFC 2616).

    Then, assuming we didn't find a character encoding in the HTTP headers
    (and the HTTP Content-type allowed us to look in the body), we need
    to sniff the first few bytes of the XML data and try to determine
    whether the encoding is ASCII-compatible.  Section F of the XML
    specification shows the way here:
    http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info

    If the sniffed encoding is not ASCII-compatible, we need to make it
    ASCII compatible so that we can sniff further into the XML declaration
    to find the encoding attribute, which will tell us the true encoding.

    Of course, none of this guarantees that we will be able to parse the
    feed in the declared character encoding (assuming it was declared
    correctly, which many are not).  CJKCodecs and iconv_codec help a lot;
    you should definitely install them if you can.
    http://cjkpython.i18n.org/
    '''

    def _parseHTTPContentType(content_type):
        '''takes HTTP Content-Type header and returns (content type, charset)

        If no charset is specified, returns (content type, '')
        If no content type is specified, returns ('', '')
        Both return parameters are guaranteed to be lowercase strings
        '''
        content_type = content_type or ''
        content_type, params = cgi.parse_header(content_type)
        return content_type, params.get('charset', '').replace("'", '')

    sniffed_xml_encoding = ''
    xml_encoding = ''
    true_encoding = ''
    http_content_type, http_encoding = _parseHTTPContentType(http_headers.get('content-type'))
    # Must sniff for non-ASCII-compatible character encodings before
    # searching for XML declaration.  This heuristic is defined in
    # section F of the XML specification:
    # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
    try:
        if xml_data[:4] == '\x4c\x6f\xa7\x94':
            # EBCDIC
            xml_data = _ebcdic_to_ascii(xml_data)
        elif xml_data[:4] == '\x00\x3c\x00\x3f':
            # UTF-16BE
            sniffed_xml_encoding = 'utf-16be'
            xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
        elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') and (xml_data[2:4] != '\x00\x00'):
            # UTF-16BE with BOM
            sniffed_xml_encoding = 'utf-16be'
            xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
        elif xml_data[:4] == '\x3c\x00\x3f\x00':
            # UTF-16LE
            sniffed_xml_encoding = 'utf-16le'
            xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
        elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and (xml_data[2:4] != '\x00\x00'):
            # UTF-16LE with BOM
            sniffed_xml_encoding = 'utf-16le'
            xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
        elif xml_data[:4] == '\x00\x00\x00\x3c':
            # UTF-32BE
            sniffed_xml_encoding = 'utf-32be'
            xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
        elif xml_data[:4] == '\x3c\x00\x00\x00':
            # UTF-32LE
            sniffed_xml_encoding = 'utf-32le'
            xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
        elif xml_data[:4] == '\x00\x00\xfe\xff':
            # UTF-32BE with BOM
            sniffed_xml_encoding = 'utf-32be'
            xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
        elif xml_data[:4] == '\xff\xfe\x00\x00':
            # UTF-32LE with BOM
            sniffed_xml_encoding = 'utf-32le'
            xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
        elif xml_data[:3] == '\xef\xbb\xbf':
            # UTF-8 with BOM
            sniffed_xml_encoding = 'utf-8'
            xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
        else:
            # ASCII-compatible
            pass
        xml_encoding_match = re.compile('^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
    except (SystemExit, KeyboardInterrupt):
        raise
    except:
        xml_encoding_match = None
    if xml_encoding_match:
        xml_encoding = xml_encoding_match.groups()[0].lower()
        # a multi-byte family name in the declaration is less precise than
        # what we sniffed from the BOM; prefer the sniffed encoding
        if sniffed_xml_encoding and (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16')):
            xml_encoding = sniffed_xml_encoding
    acceptable_content_type = 0
    application_content_types = ('application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity')
    text_content_types = ('text/xml', 'text/xml-external-parsed-entity')
    if (http_content_type in application_content_types) or \
       (http_content_type.startswith('application/') and http_content_type.endswith('+xml')):
        acceptable_content_type = 1
        true_encoding = http_encoding or xml_encoding or 'utf-8'
    elif (http_content_type in text_content_types) or \
         (http_content_type.startswith('text/')) and http_content_type.endswith('+xml'):
        acceptable_content_type = 1
        true_encoding = http_encoding or 'us-ascii'
    elif http_content_type.startswith('text/'):
        true_encoding = http_encoding or 'us-ascii'
    elif http_headers and (not http_headers.has_key('content-type')):
        true_encoding = xml_encoding or 'iso-8859-1'
    else:
        true_encoding = xml_encoding or 'utf-8'
    return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type
2596
def _toUTF8(data, encoding):
    '''Changes an XML data stream on the fly to specify a new encoding

    data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already
    encoding is a string recognized by encodings.aliases
    '''
    if _debug: sys.stderr.write('entering _toUTF8, trying encoding %s\n' % encoding)
    # strip Byte Order Mark (if present); a BOM overrides the caller's
    # declared encoding because it is unambiguous
    if (len(data) >= 4) and (data[:2] == '\xfe\xff') and (data[2:4] != '\x00\x00'):
        if _debug:
            sys.stderr.write('stripping BOM\n')
            if encoding != 'utf-16be':
                sys.stderr.write('trying utf-16be instead\n')
        encoding = 'utf-16be'
        data = data[2:]
    elif (len(data) >= 4) and (data[:2] == '\xff\xfe') and (data[2:4] != '\x00\x00'):
        if _debug:
            sys.stderr.write('stripping BOM\n')
            if encoding != 'utf-16le':
                sys.stderr.write('trying utf-16le instead\n')
        encoding = 'utf-16le'
        data = data[2:]
    elif data[:3] == '\xef\xbb\xbf':
        if _debug:
            sys.stderr.write('stripping BOM\n')
            if encoding != 'utf-8':
                sys.stderr.write('trying utf-8 instead\n')
        encoding = 'utf-8'
        data = data[3:]
    elif data[:4] == '\x00\x00\xfe\xff':
        if _debug:
            sys.stderr.write('stripping BOM\n')
            if encoding != 'utf-32be':
                sys.stderr.write('trying utf-32be instead\n')
        encoding = 'utf-32be'
        data = data[4:]
    elif data[:4] == '\xff\xfe\x00\x00':
        if _debug:
            sys.stderr.write('stripping BOM\n')
            if encoding != 'utf-32le':
                sys.stderr.write('trying utf-32le instead\n')
        encoding = 'utf-32le'
        data = data[4:]
    # decode (raises if the data is not actually in this encoding) ...
    newdata = unicode(data, encoding)
    if _debug: sys.stderr.write('successfully converted %s data to unicode\n' % encoding)
    # ... then re-encode as UTF-8 with a matching XML declaration
    declmatch = re.compile('^<\?xml[^>]*?>')
    newdecl = '''<?xml version='1.0' encoding='utf-8'?>'''
    if declmatch.search(newdata):
        newdata = declmatch.sub(newdecl, newdata)
    else:
        newdata = newdecl + u'\n' + newdata
    return newdata.encode('utf-8')
2649
def _stripDoctype(data):
2650
'''Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
2652
rss_version may be 'rss091n' or None
2653
stripped_data is the same XML document, minus the DOCTYPE
2655
entity_pattern = re.compile(r'<!ENTITY([^>]*?)>', re.MULTILINE)
2656
data = entity_pattern.sub('', data)
2657
doctype_pattern = re.compile(r'<!DOCTYPE([^>]*?)>', re.MULTILINE)
2658
doctype_results = doctype_pattern.findall(data)
2659
doctype = doctype_results and doctype_results[0] or ''
2660
if doctype.lower().count('netscape'):
2664
data = doctype_pattern.sub('', data)
2665
return version, data
2667
def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=[]):
2668
'''Parse a feed from a URL, file, stream, or string'''
2669
result = FeedParserDict()
2670
result['feed'] = FeedParserDict()
2671
result['entries'] = []
2674
if type(handlers) == types.InstanceType:
2675
handlers = [handlers]
2677
f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers)
2679
except Exception, e:
2681
result['bozo_exception'] = e
2685
# if feed is gzip-compressed, decompress it
2686
if f and data and hasattr(f, 'headers'):
2687
if gzip and f.headers.get('content-encoding', '') == 'gzip':
2689
data = gzip.GzipFile(fileobj=_StringIO(data)).read()
2690
except Exception, e:
2691
# Some feeds claim to be gzipped but they're not, so
2692
# we get garbage. Ideally, we should re-request the
2693
# feed without the 'Accept-encoding: gzip' header,
2696
result['bozo_exception'] = e
2698
elif zlib and f.headers.get('content-encoding', '') == 'deflate':
2700
data = zlib.decompress(data, -zlib.MAX_WBITS)
2701
except Exception, e:
2703
result['bozo_exception'] = e
2707
if hasattr(f, 'info'):
2709
result['etag'] = info.getheader('ETag')
2710
last_modified = info.getheader('Last-Modified')
2712
result['modified'] = _parse_date(last_modified)
2713
if hasattr(f, 'url'):
2714
result['href'] = f.url
2715
result['status'] = 200
2716
if hasattr(f, 'status'):
2717
result['status'] = f.status
2718
if hasattr(f, 'headers'):
2719
result['headers'] = f.headers.dict
2720
if hasattr(f, 'close'):
2723
# there are four encodings to keep track of:
2724
# - http_encoding is the encoding declared in the Content-Type HTTP header
2725
# - xml_encoding is the encoding declared in the <?xml declaration
2726
# - sniffed_encoding is the encoding sniffed from the first 4 bytes of the XML data
2727
# - result['encoding'] is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
2728
http_headers = result.get('headers', {})
2729
result['encoding'], http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type = \
2730
_getCharacterEncoding(http_headers, data)
2731
if http_headers and (not acceptable_content_type):
2732
if http_headers.has_key('content-type'):
2733
bozo_message = '%s is not an XML media type' % http_headers['content-type']
2735
bozo_message = 'no Content-type specified'
2737
result['bozo_exception'] = NonXMLContentType(bozo_message)
2739
result['version'], data = _stripDoctype(data)
2741
baseuri = http_headers.get('content-location', result.get('href'))
2742
baselang = http_headers.get('content-language', None)
2744
# if server sent 304, we're done
2745
if result.get('status', 0) == 304:
2746
result['version'] = ''
2747
result['debug_message'] = 'The feed has not changed since you last checked, ' + \
2748
'so the server sent no data. This is a feature, not a bug!'
2751
# if there was a problem downloading, we're done
2755
# determine character encoding
2756
use_strict_parser = 0
2758
tried_encodings = []
2759
# try: HTTP encoding, declared XML encoding, encoding sniffed from BOM
2760
for proposed_encoding in (result['encoding'], xml_encoding, sniffed_xml_encoding):
2761
if not proposed_encoding: continue
2762
if proposed_encoding in tried_encodings: continue
2763
tried_encodings.append(proposed_encoding)
2765
data = _toUTF8(data, proposed_encoding)
2766
known_encoding = use_strict_parser = 1
2768
except (SystemExit, KeyboardInterrupt):
2772
# if no luck and we have auto-detection library, try that
2773
if (not known_encoding) and chardet:
2775
proposed_encoding = chardet.detect(data)['encoding']
2776
if proposed_encoding and (proposed_encoding not in tried_encodings):
2777
tried_encodings.append(proposed_encoding)
2778
data = _toUTF8(data, proposed_encoding)
2779
known_encoding = use_strict_parser = 1
2780
except (SystemExit, KeyboardInterrupt):
2784
# if still no luck and we haven't tried utf-8 yet, try that
2785
if (not known_encoding) and ('utf-8' not in tried_encodings):
2787
proposed_encoding = 'utf-8'
2788
tried_encodings.append(proposed_encoding)
2789
data = _toUTF8(data, proposed_encoding)
2790
known_encoding = use_strict_parser = 1
2791
except (SystemExit, KeyboardInterrupt):
2795
# if still no luck and we haven't tried windows-1252 yet, try that
2796
if (not known_encoding) and ('windows-1252' not in tried_encodings):
2798
proposed_encoding = 'windows-1252'
2799
tried_encodings.append(proposed_encoding)
2800
data = _toUTF8(data, proposed_encoding)
2801
known_encoding = use_strict_parser = 1
2802
except (SystemExit, KeyboardInterrupt):
2806
# if still no luck, give up
2807
if not known_encoding:
2809
result['bozo_exception'] = CharacterEncodingUnknown( \
2810
'document encoding unknown, I tried ' + \
2811
'%s, %s, utf-8, and windows-1252 but nothing worked' % \
2812
(result['encoding'], xml_encoding))
2813
result['encoding'] = ''
2814
elif proposed_encoding != result['encoding']:
2816
result['bozo_exception'] = CharacterEncodingOverride( \
2817
'documented declared as %s, but parsed as %s' % \
2818
(result['encoding'], proposed_encoding))
2819
result['encoding'] = proposed_encoding
2821
if not _XML_AVAILABLE:
2822
use_strict_parser = 0
2823
if use_strict_parser:
2824
# initialize the SAX parser
2825
feedparser = _StrictFeedParser(baseuri, baselang, 'utf-8')
2826
saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS)
2827
saxparser.setFeature(xml.sax.handler.feature_namespaces, 1)
2828
saxparser.setContentHandler(feedparser)
2829
saxparser.setErrorHandler(feedparser)
2830
source = xml.sax.xmlreader.InputSource()
2831
source.setByteStream(_StringIO(data))
2832
if hasattr(saxparser, '_ns_stack'):
2833
# work around bug in built-in SAX parser (doesn't recognize xml: namespace)
2834
# PyXML doesn't have this problem, and it doesn't have _ns_stack either
2835
saxparser._ns_stack.append({'http://www.w3.org/XML/1998/namespace':'xml'})
2837
saxparser.parse(source)
2838
except Exception, e:
2841
traceback.print_stack()
2842
traceback.print_exc()
2843
sys.stderr.write('xml parsing failed\n')
2845
result['bozo_exception'] = feedparser.exc or e
2846
use_strict_parser = 0
2847
if not use_strict_parser:
2848
feedparser = _LooseFeedParser(baseuri, baselang, known_encoding and 'utf-8' or '')
2849
feedparser.feed(data)
2850
result['feed'] = feedparser.feeddata
2851
result['entries'] = feedparser.entries
2852
result['version'] = result['version'] or feedparser.version
2853
result['namespaces'] = feedparser.namespacesInUse
2856
if __name__ == '__main__':
2857
if not sys.argv[1:]:
2862
zopeCompatibilityHack()
2863
from pprint import pprint
2872
#1.0 - 9/27/2002 - MAP - fixed namespace processing on prefixed RSS 2.0 elements,
2873
# added Simon Fell's test suite
2874
#1.1 - 9/29/2002 - MAP - fixed infinite loop on incomplete CDATA sections
2876
# JD - use inchannel to watch out for image and textinput elements which can
2877
# also contain title, link, and description elements
2878
# JD - check for isPermaLink='false' attribute on guid elements
2879
# JD - replaced openAnything with open_resource supporting ETag and
2880
# If-Modified-Since request headers
2881
# JD - parse now accepts etag, modified, agent, and referrer optional
2883
# JD - modified parse to return a dictionary instead of a tuple so that any
2884
# etag or modified information can be returned and cached by the caller
2885
#2.0.1 - 10/21/2002 - MAP - changed parse() so that if we don't get anything
# because of etag/modified, return the old etag/modified to the caller to
# indicate why nothing is being returned
#2.0.2 - 10/21/2002 - JB - added the inchannel to the if statement, otherwise it's
# useless. Fixes the problem JD was addressing by adding it.
#2.1 - 11/14/2002 - MAP - added gzip support
#2.2 - 1/27/2003 - MAP - added attribute support, admin:generatorAgent.
# start_admingeneratoragent is an example of how to handle elements with
# only attributes, no content.
#2.3 - 6/11/2003 - MAP - added USER_AGENT for default (if caller doesn't specify);
# also, make sure we send the User-Agent even if urllib2 isn't available.
# Match any variation of backend.userland.com/rss namespace.
#2.3.1 - 6/12/2003 - MAP - if item has both link and guid, return both as-is.
#2.4 - 7/9/2003 - MAP - added preliminary Pie/Atom/Echo support based on Sam Ruby's
# snapshot of July 1 <http://www.intertwingly.net/blog/1506.html>; changed
# project name
#2.5 - 7/25/2003 - MAP - changed to Python license (all contributors agree);
# removed unnecessary urllib code -- urllib2 should always be available anyway;
# return actual url, status, and full HTTP headers (as result['url'],
# result['status'], and result['headers']) if parsing a remote feed over HTTP --
# this should pass all the HTTP tests at <http://diveintomark.org/tests/client/http/>;
# added the latest namespace-of-the-week for RSS 2.0
#2.5.1 - 7/26/2003 - RMK - clear opener.addheaders so we only send our custom
# User-Agent (otherwise urllib2 sends two, which confuses some servers)
#2.5.2 - 7/28/2003 - MAP - entity-decode inline xml properly; added support for
# inline <xhtml:body> and <xhtml:div> as used in some RSS 2.0 feeds
#2.5.3 - 8/6/2003 - TvdV - patch to track whether we're inside an image or
# textInput, and also to return the character encoding (if specified)
#2.6 - 1/1/2004 - MAP - dc:author support (MarekK); fixed bug tracking
# nested divs within content (JohnD); fixed missing sys import (JohanS);
# fixed regular expression to capture XML character encoding (Andrei);
# added support for Atom 0.3-style links; fixed bug with textInput tracking;
# added support for cloud (MartijnP); added support for multiple
# category/dc:subject (MartijnP); normalize content model: 'description' gets
# description (which can come from description, summary, or full content if no
# description), 'content' gets dict of base/language/type/value (which can come
# from content:encoded, xhtml:body, content, or fullitem);
# fixed bug matching arbitrary Userland namespaces; added xml:base and xml:lang
# tracking; fixed bug tracking unknown tags; fixed bug tracking content when
# <content> element is not in default namespace (like Pocketsoap feed);
# resolve relative URLs in link, guid, docs, url, comments, wfw:comment,
# wfw:commentRSS; resolve relative URLs within embedded HTML markup in
# description, xhtml:body, content, content:encoded, title, subtitle,
# summary, info, tagline, and copyright; added support for pingback and
# trackback namespaces
#2.7 - 1/5/2004 - MAP - really added support for trackback and pingback
# namespaces, as opposed to 2.6 when I said I did but didn't really;
# sanitize HTML markup within some elements; added mxTidy support (if
# installed) to tidy HTML markup within some elements; fixed indentation
# bug in _parse_date (FazalM); use socket.setdefaulttimeout if available
# (FazalM); universal date parsing and normalization (FazalM): 'created', 'modified',
# 'issued' are parsed into 9-tuple date format and stored in 'created_parsed',
# 'modified_parsed', and 'issued_parsed'; 'date' is duplicated in 'modified'
# and vice-versa; 'date_parsed' is duplicated in 'modified_parsed' and vice-versa
#2.7.1 - 1/9/2004 - MAP - fixed bug handling &quot; and &apos;. fixed memory
# leak not closing url opener (JohnD); added dc:publisher support (MarekK);
# added admin:errorReportsTo support (MarekK); Python 2.1 dict support (MarekK)
#2.7.4 - 1/14/2004 - MAP - added workaround for improperly formed <br/> tags in
# encoded HTML (skadz); fixed unicode handling in normalize_attrs (ChrisL);
# fixed relative URI processing for guid (skadz); added ICBM support; added
# lat/long support (Andrew Grumet)
#2.7.5 - 1/15/2004 - MAP - added workaround for malformed DOCTYPE (seen on many
# blogspot.com sites); added _debug variable
#2.7.6 - 1/16/2004 - MAP - fixed bug with StringIO importing
#3.0b3 - 1/23/2004 - MAP - parse entire feed with real XML parser (if available);
# added several new supported namespaces; fixed bug tracking naked markup in
# description; added support for enclosure; added support for source; re-added
# support for cloud which got dropped somehow; added support for expirationDate
#3.0b4 - 1/26/2004 - MAP - fixed xml:lang inheritance; fixed multiple bugs tracking
# xml:base URI, one for documents that don't define one explicitly and one for
# documents that define an outer and an inner xml:base that goes out of scope
# before the end of the document
#3.0b5 - 1/26/2004 - MAP - fixed bug parsing multiple links at feed level
#3.0b6 - 1/27/2004 - MAP - added feed type and version detection, result['version']
# will be one of SUPPORTED_VERSIONS.keys() or empty string if unrecognized;
# added support for creativeCommons:license and cc:license; added support for
# full Atom content model in title, tagline, info, copyright, summary; fixed bug
# with gzip encoding (not always telling server we support it when we do)
#3.0b7 - 1/28/2004 - MAP - support Atom-style author element in author_detail
# (dictionary of 'name', 'url', 'email'); map author to author_detail if author
# contains name + email address
#3.0b8 - 1/28/2004 - MAP - added support for contributor
#3.0b9 - 1/29/2004 - MAP - fixed check for presence of dict function; added
# support for summary
#3.0b10 - 1/31/2004 - MAP - incorporated ISO-8601 date parsing routines from
# xml.util.iso8601
#3.0b11 - 2/2/2004 - MAP - added 'rights' to list of elements that can contain
# dangerous markup; fiddled with decodeEntities (not right); liberalized
# date parsing even further
#3.0b12 - 2/6/2004 - MAP - fiddled with decodeEntities (still not right);
# added support to Atom 0.2 subtitle; added support for Atom content model
# in copyright; better sanitizing of dangerous HTML elements with end tags
# (script, frameset)
#3.0b13 - 2/8/2004 - MAP - better handling of empty HTML tags (br, hr, img,
# etc.) in embedded markup, in either HTML or XHTML form (<br>, <br/>, <br />)
#3.0b14 - 2/8/2004 - MAP - fixed CDATA handling in non-wellformed feeds under
# Python 2.1
#3.0b15 - 2/11/2004 - MAP - fixed bug resolving relative links in wfw:commentRSS;
# fixed bug capturing author and contributor URL; fixed bug resolving relative
# links in author and contributor URL; fixed bug resolving relative links in
# generator URL; added support for recognizing RSS 1.0; passed Simon Fell's
# namespace tests, and included them permanently in the test suite with his
# permission; fixed namespace handling under Python 2.1
#3.0b16 - 2/12/2004 - MAP - fixed support for RSS 0.90 (broken in b15)
#3.0b17 - 2/13/2004 - MAP - determine character encoding as per RFC 3023
#3.0b18 - 2/17/2004 - MAP - always map description to summary_detail (Andrei);
# use libxml2 (if available)
#3.0b19 - 3/15/2004 - MAP - fixed bug exploding author information when author
# name was in parentheses; removed ultra-problematic mxTidy support; patch to
# workaround crash in PyXML/expat when encountering invalid entities
# (MarkMoraes); support for textinput/textInput
#3.0b20 - 4/7/2004 - MAP - added CDF support
#3.0b21 - 4/14/2004 - MAP - added Hot RSS support
#3.0b22 - 4/19/2004 - MAP - changed 'channel' to 'feed', 'item' to 'entries' in
# results dict; changed results dict to allow getting values with results.key
# as well as results[key]; work around embedded illformed HTML with half
# a DOCTYPE; work around malformed Content-Type header; if character encoding
# is wrong, try several common ones before falling back to regexes (if this
# works, bozo_exception is set to CharacterEncodingOverride); fixed character
# encoding issues in BaseHTMLProcessor by tracking encoding and converting
# from Unicode to raw strings before feeding data to sgmllib.SGMLParser;
# convert each value in results to Unicode (if possible), even if using
# regex-based parsing
#3.0b23 - 4/21/2004 - MAP - fixed UnicodeDecodeError for feeds that contain
# high-bit characters in attributes in embedded HTML in description (thanks
# Thijs van de Vossen); moved guid, date, and date_parsed to mapped keys in
# FeedParserDict; tweaked FeedParserDict.has_key to return True if asking
# about a mapped key
#3.0fc1 - 4/23/2004 - MAP - made results.entries[0].links[0] and
# results.entries[0].enclosures[0] into FeedParserDict; fixed typo that could
# cause the same encoding to be tried twice (even if it failed the first time);
# fixed DOCTYPE stripping when DOCTYPE contained entity declarations;
# better textinput and image tracking in illformed RSS 1.0 feeds
#3.0fc2 - 5/10/2004 - MAP - added and passed Sam's amp tests; added and passed
# my blink tag tests
#3.0fc3 - 6/18/2004 - MAP - fixed bug in _changeEncodingDeclaration that
# failed to parse utf-16 encoded feeds; made source into a FeedParserDict;
# duplicate admin:generatorAgent/@rdf:resource in generator_detail.url;
# added support for image; refactored parse() fallback logic to try other
# encodings if SAX parsing fails (previously it would only try other encodings
# if re-encoding failed); remove unichr madness in normalize_attrs now that
# we're properly tracking encoding in and out of BaseHTMLProcessor; set
# feed.language from root-level xml:lang; set entry.id from rdf:about;
# send Accept header
#3.0 - 6/21/2004 - MAP - don't try iso-8859-1 (can't distinguish between
# iso-8859-1 and windows-1252 anyway, and most incorrectly marked feeds are
# windows-1252); fixed regression that could cause the same encoding to be
# tried twice (even if it failed the first time)
#3.0.1 - 6/22/2004 - MAP - default to us-ascii for all text/* content types;
# recover from malformed content-type header parameter with no equals sign
# ('text/xml; charset:iso-8859-1')
#3.1 - 6/28/2004 - MAP - added and passed tests for converting HTML entities
# to Unicode equivalents in illformed feeds (aaronsw); added and
# passed tests for converting character entities to Unicode equivalents
# in illformed feeds (aaronsw); test for valid parsers when setting
# XML_AVAILABLE; make version and encoding available when server returns
# a 304; add handlers parameter to pass arbitrary urllib2 handlers (like
# digest auth or proxy support); add code to parse username/password
# out of url and send as basic authentication; expose downloading-related
# exceptions in bozo_exception (aaronsw); added __contains__ method to
# FeedParserDict (aaronsw); added publisher_detail (aaronsw)
#3.2 - 7/3/2004 - MAP - use cjkcodecs and iconv_codec if available; always
# convert feed to UTF-8 before passing to XML parser; completely revamped
# logic for determining character encoding and attempting XML parsing
# (much faster); increased default timeout to 20 seconds; test for presence
# of Location header on redirects; added tests for many alternate character
# encodings; support various EBCDIC encodings; support UTF-16BE and
# UTF16-LE with or without a BOM; support UTF-8 with a BOM; support
# UTF-32BE and UTF-32LE with or without a BOM; fixed crashing bug if no
# XML parsers are available; added support for 'Content-encoding: deflate';
# send blank 'Accept-encoding: ' header if neither gzip nor zlib modules
# are available
#3.3 - 7/15/2004 - MAP - optimize EBCDIC to ASCII conversion; fix obscure
# problem tracking xml:base and xml:lang if element declares it, child
# doesn't, first grandchild redeclares it, and second grandchild doesn't;
# refactored date parsing; defined public registerDateHandler so callers
# can add support for additional date formats at runtime; added support
# for OnBlog, Nate, MSSQL, Greek, and Hungarian dates (ytrewq1); added
# zopeCompatibilityHack() which turns FeedParserDict into a regular
# dictionary, required for Zope compatibility, and also makes command-
# line debugging easier because pprint module formats real dictionaries
# better than dictionary-like objects; added NonXMLContentType exception,
# which is stored in bozo_exception when a feed is served with a non-XML
# media type such as 'text/plain'; respect Content-Language as default
# language if no xml:lang is present; cloud dict is now FeedParserDict;
# generator dict is now FeedParserDict; better tracking of xml:lang,
# including support for xml:lang='' to unset the current language;
# recognize RSS 1.0 feeds even when RSS 1.0 namespace is not the default
# namespace; don't overwrite final status on redirects (scenarios:
# redirecting to a URL that returns 304, redirecting to a URL that
# redirects to another URL with a different type of redirect); add
# support for HTTP 303 redirects
#4.0 - MAP - support for relative URIs in xml:base attribute; fixed
# encoding issue with mxTidy (phopkins); preliminary support for RFC 3229;
# support for Atom 1.0; support for iTunes extensions; new 'tags' for
# categories/keywords/etc. as array of dict
# {'term': term, 'scheme': scheme, 'label': label} to match Atom 1.0
# terminology; parse RFC 822-style dates with no time; lots of other
# bug fixes
#4.1 - MAP - removed socket timeout; added support for chardet library