#!/usr/bin/env python
"""Universal feed parser

Handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds

Visit http://feedparser.org/ for the latest version
Visit http://feedparser.org/docs/ for the latest documentation

Required: Python 2.1 or later
Recommended: Python 2.3 or later
Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
"""

__version__ = "4.1"# + "$Revision: 1.92 $"[11:15] + "-cvs"
__license__ = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice,
  this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE."""
__author__ = "Mark Pilgrim <http://diveintomark.org/>"
__contributors__ = ["Jason Diamond <http://injektilo.org/>",
                    "John Beimler <http://john.beimler.org/>",
                    "Fazal Majid <http://www.majid.info/mylos/weblog/>",
                    "Aaron Swartz <http://aaronsw.com/>",
                    "Kevin Marks <http://epeus.blogspot.com/>"]
_debug = 0
# HTTP "User-Agent" header to send to servers when downloading feeds.
# If you are embedding feedparser in a larger application, you should
# change this to your application name and URL.
USER_AGENT = "UniversalFeedParser/%s +http://feedparser.org/" % __version__
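
# Illustrative sketch (not part of the original module): an embedding
# application would typically override USER_AGENT before parsing; the
# application name and URL below are hypothetical.
#
#   import feedparser
#   feedparser.USER_AGENT = 'MyFeedReader/1.0 +http://example.com/myfeedreader/'
#   d = feedparser.parse('http://feedparser.org/docs/examples/atom10.xml')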

# HTTP "Accept" header to send to servers when downloading feeds. If you don't
# want to send an Accept header, set this to None.
ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"

# List of preferred XML parsers, by SAX driver name. These will be tried first,
# but if they're not installed, Python will keep searching through its own list
# of pre-installed parsers until it finds one that supports everything we need.
PREFERRED_XML_PARSERS = ["drv_libxml2"]

# If you want feedparser to automatically run HTML markup through HTML Tidy, set
# this to 1. Requires mxTidy <http://www.egenix.com/files/python/mxTidy.html>
# or utidylib <http://utidylib.berlios.de/>.
TIDY_MARKUP = 0

# List of Python interfaces for HTML Tidy, in order of preference. Only useful
# if TIDY_MARKUP = 1
PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"]

# ---------- required modules (should come with any Python distribution) ----------
import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi, urllib, urllib2
try:
    from cStringIO import StringIO as _StringIO
except:
    from StringIO import StringIO as _StringIO

# ---------- optional modules (feedparser will work without these, but with reduced functionality) ----------

# gzip is included with most Python distributions, but may not be available if you compiled your own
try:
    import gzip
except:
    gzip = None
try:
    import zlib
except:
    zlib = None

# If a real XML parser is available, feedparser will attempt to use it. feedparser has
# been tested with the built-in SAX parser, PyXML, and libxml2. On platforms where the
# Python distribution does not come with an XML parser (such as Mac OS X 10.2 and some
# versions of FreeBSD), feedparser will quietly fall back on regex-based parsing.
try:
    import xml.sax
    xml.sax.make_parser(PREFERRED_XML_PARSERS) # test for valid parsers
    from xml.sax.saxutils import escape as _xmlescape
    _XML_AVAILABLE = 1
except:
    _XML_AVAILABLE = 0
    def _xmlescape(data):
        data = data.replace('&', '&amp;')
        data = data.replace('>', '&gt;')
        data = data.replace('<', '&lt;')
        return data

# base64 support for Atom feeds that contain embedded binary data
try:
    import base64, binascii
except:
    base64 = binascii = None

# cjkcodecs and iconv_codec provide support for more character encodings.
# Both are available from http://cjkpython.i18n.org/
try:
    import cjkcodecs.aliases
except:
    pass

# chardet library auto-detects character encodings
# Download from http://chardet.feedparser.org/
try:
    import chardet
    if _debug:
        import chardet.constants
        chardet.constants._debug = 1
except:
    chardet = None

# ---------- don't touch these ----------
class ThingsNobodyCaresAboutButMe(Exception): pass
class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe): pass
class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe): pass
class NonXMLContentType(ThingsNobodyCaresAboutButMe): pass
class UndeclaredNamespace(Exception): pass

sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
sgmllib.special = re.compile('<!')
sgmllib.charref = re.compile('&#(x?[0-9A-Fa-f]+)[^0-9A-Fa-f]')

SUPPORTED_VERSIONS = {'': 'unknown',
                      'rss090': 'RSS 0.90',
                      'rss091n': 'RSS 0.91 (Netscape)',
                      'rss091u': 'RSS 0.91 (Userland)',
                      'rss092': 'RSS 0.92',
                      'rss093': 'RSS 0.93',
                      'rss094': 'RSS 0.94',
                      'rss20': 'RSS 2.0',
                      'rss10': 'RSS 1.0',
                      'rss': 'RSS (unknown version)',
                      'atom01': 'Atom 0.1',
                      'atom02': 'Atom 0.2',
                      'atom03': 'Atom 0.3',
                      'atom10': 'Atom 1.0',
                      'atom': 'Atom (unknown version)',
                      'cdf': 'CDF',
                      'hotrss': 'Hot RSS'
                      }

try:
    UserDict = dict
except NameError:
    # Python 2.1 does not have dict
    from UserDict import UserDict

class FeedParserDict(UserDict):
    keymap = {'channel': 'feed',
              'date_parsed': 'updated_parsed',
              'description': ['subtitle', 'summary'],
              'modified': 'updated',
              'modified_parsed': 'updated_parsed',
              'issued': 'published',
              'issued_parsed': 'published_parsed',
              'copyright': 'rights',
              'copyright_detail': 'rights_detail',
              'tagline': 'subtitle',
              'tagline_detail': 'subtitle_detail'}

    def __getitem__(self, key):
        if key == 'category':
            return UserDict.__getitem__(self, 'tags')[0]['term']
        if key == 'categories':
            return [(tag['scheme'], tag['term']) for tag in UserDict.__getitem__(self, 'tags')]
        realkey = self.keymap.get(key, key)
        if type(realkey) == types.ListType:
            for k in realkey:
                if UserDict.has_key(self, k):
                    return UserDict.__getitem__(self, k)
        if UserDict.has_key(self, key):
            return UserDict.__getitem__(self, key)
        return UserDict.__getitem__(self, realkey)

    def __setitem__(self, key, value):
        for k in self.keymap.keys():
            if key == k:
                key = self.keymap[k]
                if type(key) == types.ListType:
                    key = key[0]
        return UserDict.__setitem__(self, key, value)

    def get(self, key, default=None):
        if self.has_key(key):
            return self[key]
        else:
            return default

    def setdefault(self, key, value):
        if not self.has_key(key):
            self[key] = value
        return self[key]

    def has_key(self, key):
        try:
            return hasattr(self, key) or UserDict.has_key(self, key)
        except AttributeError:
            return False

    def __getattr__(self, key):
        try:
            return self.__dict__[key]
        except KeyError:
            pass
        try:
            assert not key.startswith('_')
            return self.__getitem__(key)
        except:
            raise AttributeError, "object has no attribute '%s'" % key

    def __setattr__(self, key, value):
        if key.startswith('_') or key == 'data':
            self.__dict__[key] = value
        else:
            return self.__setitem__(key, value)

    def __contains__(self, key):
        return self.has_key(key)
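
# Illustrative sketch (not part of the original module): FeedParserDict maps
# old key names onto their current equivalents, so pre-4.0 client code keeps
# working, and attribute access is routed through the same lookup.
#
#   d = FeedParserDict()
#   d['copyright'] = 'Copyright 2006'    # stored under the new key, 'rights'
#   assert d['rights'] == 'Copyright 2006'
#   assert d.rights == 'Copyright 2006'  # attribute-style access works too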

def zopeCompatibilityHack():
    global FeedParserDict
    del FeedParserDict
    def FeedParserDict(aDict=None):
        rc = {}
        if aDict:
            rc.update(aDict)
        return rc

_ebcdic_to_ascii_map = None
def _ebcdic_to_ascii(s):
    global _ebcdic_to_ascii_map
    if not _ebcdic_to_ascii_map:
        emap = (
            0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
            16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
            128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
            144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
            32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
            38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
            45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
            186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
            195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,201,
            202,106,107,108,109,110,111,112,113,114,203,204,205,206,207,208,
            209,126,115,116,117,118,119,120,121,122,210,211,212,213,214,215,
            216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,
            123,65,66,67,68,69,70,71,72,73,232,233,234,235,236,237,
            125,74,75,76,77,78,79,80,81,82,238,239,240,241,242,243,
            92,159,83,84,85,86,87,88,89,90,244,245,246,247,248,249,
            48,49,50,51,52,53,54,55,56,57,250,251,252,253,254,255
            )
        import string
        _ebcdic_to_ascii_map = string.maketrans( \
            ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
    return s.translate(_ebcdic_to_ascii_map)

_urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)$')
def _urljoin(base, uri):
    uri = _urifixer.sub(r'\1\3', uri)
    return urlparse.urljoin(base, uri)
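
# Illustrative sketch (not part of the original module): _urljoin resolves a
# possibly-relative URI against a base, after normalizing extra slashes.
#
#   >>> _urljoin('http://example.com/feeds/atom.xml', '/images/logo.png')
#   'http://example.com/images/logo.png'
#   >>> _urljoin('http://example.com/feeds/', 'entry1.html')
#   'http://example.com/feeds/entry1.html'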

class _FeedParserMixin:
    namespaces = {'': '',
                  'http://backend.userland.com/rss': '',
                  'http://blogs.law.harvard.edu/tech/rss': '',
                  'http://purl.org/rss/1.0/': '',
                  'http://my.netscape.com/rdf/simple/0.9/': '',
                  'http://example.com/newformat#': '',
                  'http://example.com/necho': '',
                  'http://purl.org/echo/': '',
                  'uri/of/echo/namespace#': '',
                  'http://purl.org/pie/': '',
                  'http://purl.org/atom/ns#': '',
                  'http://www.w3.org/2005/Atom': '',
                  'http://purl.org/rss/1.0/modules/rss091#': '',

                  'http://webns.net/mvcb/': 'admin',
                  'http://purl.org/rss/1.0/modules/aggregation/': 'ag',
                  'http://purl.org/rss/1.0/modules/annotate/': 'annotate',
                  'http://media.tangent.org/rss/1.0/': 'audio',
                  'http://backend.userland.com/blogChannelModule': 'blogChannel',
                  'http://web.resource.org/cc/': 'cc',
                  'http://backend.userland.com/creativeCommonsRssModule': 'creativeCommons',
                  'http://purl.org/rss/1.0/modules/company': 'co',
                  'http://purl.org/rss/1.0/modules/content/': 'content',
                  'http://my.theinfo.org/changed/1.0/rss/': 'cp',
                  'http://purl.org/dc/elements/1.1/': 'dc',
                  'http://purl.org/dc/terms/': 'dcterms',
                  'http://purl.org/rss/1.0/modules/email/': 'email',
                  'http://purl.org/rss/1.0/modules/event/': 'ev',
                  'http://rssnamespace.org/feedburner/ext/1.0': 'feedburner',
                  'http://freshmeat.net/rss/fm/': 'fm',
                  'http://xmlns.com/foaf/0.1/': 'foaf',
                  'http://www.w3.org/2003/01/geo/wgs84_pos#': 'geo',
                  'http://postneo.com/icbm/': 'icbm',
                  'http://purl.org/rss/1.0/modules/image/': 'image',
                  'http://www.itunes.com/DTDs/PodCast-1.0.dtd': 'itunes',
                  'http://example.com/DTDs/PodCast-1.0.dtd': 'itunes',
                  'http://purl.org/rss/1.0/modules/link/': 'l',
                  'http://search.yahoo.com/mrss': 'media',
                  'http://madskills.com/public/xml/rss/module/pingback/': 'pingback',
                  'http://prismstandard.org/namespaces/1.2/basic/': 'prism',
                  'http://www.w3.org/1999/02/22-rdf-syntax-ns#': 'rdf',
                  'http://www.w3.org/2000/01/rdf-schema#': 'rdfs',
                  'http://purl.org/rss/1.0/modules/reference/': 'ref',
                  'http://purl.org/rss/1.0/modules/richequiv/': 'reqv',
                  'http://purl.org/rss/1.0/modules/search/': 'search',
                  'http://purl.org/rss/1.0/modules/slash/': 'slash',
                  'http://schemas.xmlsoap.org/soap/envelope/': 'soap',
                  'http://purl.org/rss/1.0/modules/servicestatus/': 'ss',
                  'http://hacks.benhammersley.com/rss/streaming/': 'str',
                  'http://purl.org/rss/1.0/modules/subscription/': 'sub',
                  'http://purl.org/rss/1.0/modules/syndication/': 'sy',
                  'http://purl.org/rss/1.0/modules/taxonomy/': 'taxo',
                  'http://purl.org/rss/1.0/modules/threading/': 'thr',
                  'http://purl.org/rss/1.0/modules/textinput/': 'ti',
                  'http://madskills.com/public/xml/rss/module/trackback/': 'trackback',
                  'http://wellformedweb.org/commentAPI/': 'wfw',
                  'http://purl.org/rss/1.0/modules/wiki/': 'wiki',
                  'http://www.w3.org/1999/xhtml': 'xhtml',
                  'http://www.w3.org/XML/1998/namespace': 'xml',
                  'http://schemas.pocketsoap.com/rss/myDescModule/': 'szf'
                  }
    _matchnamespaces = {}

    can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'license', 'icon', 'logo']
    can_contain_relative_uris = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
    can_contain_dangerous_markup = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
    html_types = ['text/html', 'application/xhtml+xml']

    def __init__(self, baseuri=None, baselang=None, encoding='utf-8'):
        if _debug: sys.stderr.write('initializing FeedParser\n')
        if not self._matchnamespaces:
            for k, v in self.namespaces.items():
                self._matchnamespaces[k.lower()] = v
        self.feeddata = FeedParserDict() # feed-level data
        self.encoding = encoding # character encoding
        self.entries = [] # list of entry-level data
        self.version = '' # feed type/version, see SUPPORTED_VERSIONS
        self.namespacesInUse = {} # dictionary of namespaces defined by the feed

        # the following are used internally to track state;
        # this is really out of control and should be refactored
        self.infeed = 0
        self.inentry = 0
        self.incontent = 0
        self.intextinput = 0
        self.inimage = 0
        self.inauthor = 0
        self.incontributor = 0
        self.inpublisher = 0
        self.insource = 0
        self.sourcedata = FeedParserDict()
        self.contentparams = FeedParserDict()
        self._summaryKey = None
        self.namespacemap = {}
        self.elementstack = []
        self.basestack = []
        self.langstack = []
        self.baseuri = baseuri or ''
        self.lang = baselang or None
        if baselang:
            self.feeddata['language'] = baselang

    def unknown_starttag(self, tag, attrs):
        if _debug: sys.stderr.write('start %s with %s\n' % (tag, attrs))
        # normalize attrs
        attrs = [(k.lower(), v) for k, v in attrs]
        attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]

        # track xml:base and xml:lang
        attrsD = dict(attrs)
        baseuri = attrsD.get('xml:base', attrsD.get('base')) or self.baseuri
        self.baseuri = _urljoin(self.baseuri, baseuri)
        lang = attrsD.get('xml:lang', attrsD.get('lang'))
        if lang == '':
            # xml:lang could be explicitly set to '', we need to capture that
            lang = None
        elif lang is None:
            # if no xml:lang is specified, use parent lang
            lang = self.lang
        if lang:
            if tag in ('feed', 'rss', 'rdf:RDF'):
                self.feeddata['language'] = lang
        self.lang = lang
        self.basestack.append(self.baseuri)
        self.langstack.append(lang)

        # track namespaces
        for prefix, uri in attrs:
            if prefix.startswith('xmlns:'):
                self.trackNamespace(prefix[6:], uri)
            elif prefix == 'xmlns':
                self.trackNamespace(None, uri)

        # track inline content
        if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
            # element declared itself as escaped markup, but it isn't really
            self.contentparams['type'] = 'application/xhtml+xml'
        if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
            # Note: probably shouldn't simply recreate localname here, but
            # our namespace handling isn't actually 100% correct in cases where
            # the feed redefines the default namespace (which is actually
            # the usual case for inline content, thanks Sam), so here we
            # cheat and just reconstruct the element based on localname
            # because that compensates for the bugs in our namespace handling.
            # This will horribly munge inline content with non-empty qnames,
            # but nobody actually does that, so I'm not fixing it.
            tag = tag.split(':')[-1]
            return self.handle_data('<%s%s>' % (tag, ''.join([' %s="%s"' % t for t in attrs])), escape=0)

        # match namespaces
        if tag.find(':') != -1:
            prefix, suffix = tag.split(':', 1)
        else:
            prefix, suffix = '', tag
        prefix = self.namespacemap.get(prefix, prefix)
        if prefix:
            prefix = prefix + '_'

        # special hack for better tracking of empty textinput/image elements in illformed feeds
        if (not prefix) and tag not in ('title', 'link', 'description', 'name'):
            self.intextinput = 0
        if (not prefix) and tag not in ('title', 'link', 'description', 'url', 'href', 'width', 'height'):
            self.inimage = 0

        # call special handler (if defined) or default handler
        methodname = '_start_' + prefix + suffix
        try:
            method = getattr(self, methodname)
            return method(attrsD)
        except AttributeError:
            return self.push(prefix + suffix, 1)

    def unknown_endtag(self, tag):
        if _debug: sys.stderr.write('end %s\n' % tag)
        # match namespaces
        if tag.find(':') != -1:
            prefix, suffix = tag.split(':', 1)
        else:
            prefix, suffix = '', tag
        prefix = self.namespacemap.get(prefix, prefix)
        if prefix:
            prefix = prefix + '_'

        # call special handler (if defined) or default handler
        methodname = '_end_' + prefix + suffix
        try:
            method = getattr(self, methodname)
            method()
        except AttributeError:
            self.pop(prefix + suffix)

        # track inline content
        if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
            # element declared itself as escaped markup, but it isn't really
            self.contentparams['type'] = 'application/xhtml+xml'
        if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
            tag = tag.split(':')[-1]
            self.handle_data('</%s>' % tag, escape=0)

        # track xml:base and xml:lang going out of scope
        if self.basestack:
            self.basestack.pop()
            if self.basestack and self.basestack[-1]:
                self.baseuri = self.basestack[-1]
        if self.langstack:
            self.langstack.pop()
            if self.langstack: # and (self.langstack[-1] is not None):
                self.lang = self.langstack[-1]

    def handle_charref(self, ref):
        # called for each character reference, e.g. for '&#160;', ref will be '160'
        if not self.elementstack: return
        ref = ref.lower()
        if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'):
            text = '&#%s;' % ref
        else:
            if ref[0] == 'x':
                c = int(ref[1:], 16)
            else:
                c = int(ref)
            text = unichr(c).encode('utf-8')
        self.elementstack[-1][2].append(text)

    def handle_entityref(self, ref):
        # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
        if not self.elementstack: return
        if _debug: sys.stderr.write('entering handle_entityref with %s\n' % ref)
        if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
            text = '&%s;' % ref
        else:
            # entity resolution graciously donated by Aaron Swartz
            def name2cp(k):
                import htmlentitydefs
                if hasattr(htmlentitydefs, 'name2codepoint'): # requires Python 2.3
                    return htmlentitydefs.name2codepoint[k]
                k = htmlentitydefs.entitydefs[k]
                if k.startswith('&#') and k.endswith(';'):
                    return int(k[2:-1]) # not in latin-1
                return ord(k)
            try: name2cp(ref)
            except KeyError: text = '&%s;' % ref
            else: text = unichr(name2cp(ref)).encode('utf-8')
        self.elementstack[-1][2].append(text)

    def handle_data(self, text, escape=1):
        # called for each block of plain text, i.e. outside of any tag and
        # not containing any character or entity references
        if not self.elementstack: return
        if escape and self.contentparams.get('type') == 'application/xhtml+xml':
            text = _xmlescape(text)
        self.elementstack[-1][2].append(text)

    def handle_comment(self, text):
        # called for each comment, e.g. <!-- insert message here -->
        pass

    def handle_pi(self, text):
        # called for each processing instruction, e.g. <?instruction>
        pass

    def handle_decl(self, text):
        pass

    def parse_declaration(self, i):
        # override internal declaration handler to handle CDATA blocks
        if _debug: sys.stderr.write('entering parse_declaration\n')
        if self.rawdata[i:i+9] == '<![CDATA[':
            k = self.rawdata.find(']]>', i)
            if k == -1: k = len(self.rawdata)
            self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0)
            return k+3
        else:
            k = self.rawdata.find('>', i)
            return k+1

    def mapContentType(self, contentType):
        contentType = contentType.lower()
        if contentType == 'text':
            contentType = 'text/plain'
        elif contentType == 'html':
            contentType = 'text/html'
        elif contentType == 'xhtml':
            contentType = 'application/xhtml+xml'
        return contentType

    def trackNamespace(self, prefix, uri):
        loweruri = uri.lower()
        if (prefix, loweruri) == (None, 'http://my.netscape.com/rdf/simple/0.9/') and not self.version:
            self.version = 'rss090'
        if loweruri == 'http://purl.org/rss/1.0/' and not self.version:
            self.version = 'rss10'
        if loweruri == 'http://www.w3.org/2005/atom' and not self.version:
            self.version = 'atom10'
        if loweruri.find('backend.userland.com/rss') != -1:
            # match any backend.userland.com namespace
            uri = 'http://backend.userland.com/rss'
            loweruri = uri
        if self._matchnamespaces.has_key(loweruri):
            self.namespacemap[prefix] = self._matchnamespaces[loweruri]
            self.namespacesInUse[self._matchnamespaces[loweruri]] = uri
        else:
            self.namespacesInUse[prefix or ''] = uri

    def resolveURI(self, uri):
        return _urljoin(self.baseuri or '', uri)

    def decodeEntities(self, element, data):
        return data

    def push(self, element, expectingText):
        self.elementstack.append([element, expectingText, []])

    def pop(self, element, stripWhitespace=1):
        if not self.elementstack: return
        if self.elementstack[-1][0] != element: return

        element, expectingText, pieces = self.elementstack.pop()
        output = ''.join(pieces)
        if stripWhitespace:
            output = output.strip()
        if not expectingText: return output

        # decode base64 content
        if base64 and self.contentparams.get('base64', 0):
            try:
                output = base64.decodestring(output)
            except binascii.Error:
                pass
            except binascii.Incomplete:
                pass

        # resolve relative URIs
        if (element in self.can_be_relative_uri) and output:
            output = self.resolveURI(output)

        # decode entities within embedded markup
        if not self.contentparams.get('base64', 0):
            output = self.decodeEntities(element, output)

        # remove temporary cruft from contentparams
        try:
            del self.contentparams['mode']
        except KeyError:
            pass
        try:
            del self.contentparams['base64']
        except KeyError:
            pass

        # resolve relative URIs within embedded markup
        if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types:
            if element in self.can_contain_relative_uris:
                output = _resolveRelativeURIs(output, self.baseuri, self.encoding)

        # sanitize embedded markup
        if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types:
            if element in self.can_contain_dangerous_markup:
                output = _sanitizeHTML(output, self.encoding)

        if self.encoding and type(output) != type(u''):
            try:
                output = unicode(output, self.encoding)
            except:
                pass

        # categories/tags/keywords/whatever are handled in _end_category
        if element == 'category':
            return output

        # store output in appropriate place(s)
        if self.inentry and not self.insource:
            if element == 'content':
                self.entries[-1].setdefault(element, [])
                contentparams = copy.deepcopy(self.contentparams)
                contentparams['value'] = output
                self.entries[-1][element].append(contentparams)
            elif element == 'link':
                self.entries[-1][element] = output
                if output:
                    self.entries[-1]['links'][-1]['href'] = output
            else:
                if element == 'description':
                    element = 'summary'
                self.entries[-1][element] = output
                if self.incontent:
                    contentparams = copy.deepcopy(self.contentparams)
                    contentparams['value'] = output
                    self.entries[-1][element + '_detail'] = contentparams
        elif (self.infeed or self.insource) and (not self.intextinput) and (not self.inimage):
            context = self._getContext()
            if element == 'description':
                element = 'subtitle'
            context[element] = output
            if element == 'link':
                context['links'][-1]['href'] = output
            elif self.incontent:
                contentparams = copy.deepcopy(self.contentparams)
                contentparams['value'] = output
                context[element + '_detail'] = contentparams
        return output

    def pushContent(self, tag, attrsD, defaultContentType, expectingText):
        self.incontent += 1
        self.contentparams = FeedParserDict({
            'type': self.mapContentType(attrsD.get('type', defaultContentType)),
            'language': self.lang,
            'base': self.baseuri})
        self.contentparams['base64'] = self._isBase64(attrsD, self.contentparams)
        self.push(tag, expectingText)

    def popContent(self, tag):
        value = self.pop(tag)
        self.incontent -= 1
        self.contentparams.clear()
        return value

    def _mapToStandardPrefix(self, name):
        colonpos = name.find(':')
        if colonpos != -1:
            prefix = name[:colonpos]
            suffix = name[colonpos+1:]
            prefix = self.namespacemap.get(prefix, prefix)
            name = prefix + ':' + suffix
        return name

    def _getAttribute(self, attrsD, name):
        return attrsD.get(self._mapToStandardPrefix(name))

    def _isBase64(self, attrsD, contentparams):
        if attrsD.get('mode', '') == 'base64':
            return 1
        if self.contentparams['type'].startswith('text/'):
            return 0
        if self.contentparams['type'].endswith('+xml'):
            return 0
        if self.contentparams['type'].endswith('/xml'):
            return 0
        return 1

    def _itsAnHrefDamnIt(self, attrsD):
        href = attrsD.get('url', attrsD.get('uri', attrsD.get('href', None)))
        if href:
            try: del attrsD['url']
            except KeyError: pass
            try: del attrsD['uri']
            except KeyError: pass
            attrsD['href'] = href
        return attrsD

    def _save(self, key, value):
        context = self._getContext()
        context.setdefault(key, value)

    def _start_rss(self, attrsD):
        versionmap = {'0.91': 'rss091u',
                      '0.92': 'rss092',
                      '0.93': 'rss093',
                      '0.94': 'rss094'}
        if not self.version:
            attr_version = attrsD.get('version', '')
            version = versionmap.get(attr_version)
            if version:
                self.version = version
            elif attr_version.startswith('2.'):
                self.version = 'rss20'
            else:
                self.version = 'rss'

    def _start_dlhottitles(self, attrsD):
        self.version = 'hotrss'

    def _start_channel(self, attrsD):
        self.infeed = 1
        self._cdf_common(attrsD)
    _start_feedinfo = _start_channel

    def _cdf_common(self, attrsD):
        if attrsD.has_key('lastmod'):
            self._start_modified({})
            self.elementstack[-1][-1] = attrsD['lastmod']
            self._end_modified()
        if attrsD.has_key('href'):
            self._start_link({})
            self.elementstack[-1][-1] = attrsD['href']
            self._end_link()

    def _start_feed(self, attrsD):
        self.infeed = 1
        versionmap = {'0.1': 'atom01',
                      '0.2': 'atom02',
                      '0.3': 'atom03'}
        if not self.version:
            attr_version = attrsD.get('version')
            version = versionmap.get(attr_version)
            if version:
                self.version = version
            else:
                self.version = 'atom'

    def _end_channel(self):
        self.infeed = 0
    _end_feed = _end_channel

    def _start_image(self, attrsD):
        self.inimage = 1
        self.push('image', 0)
        context = self._getContext()
        context.setdefault('image', FeedParserDict())

    def _end_image(self):
        self.pop('image')
        self.inimage = 0

    def _start_textinput(self, attrsD):
        self.intextinput = 1
        self.push('textinput', 0)
        context = self._getContext()
        context.setdefault('textinput', FeedParserDict())
    _start_textInput = _start_textinput

    def _end_textinput(self):
        self.pop('textinput')
        self.intextinput = 0
    _end_textInput = _end_textinput

    def _start_author(self, attrsD):
        self.inauthor = 1
        self.push('author', 1)
    _start_managingeditor = _start_author
    _start_dc_author = _start_author
    _start_dc_creator = _start_author
    _start_itunes_author = _start_author

    def _end_author(self):
        self.pop('author')
        self.inauthor = 0
        self._sync_author_detail()
    _end_managingeditor = _end_author
    _end_dc_author = _end_author
    _end_dc_creator = _end_author
    _end_itunes_author = _end_author

    def _start_itunes_owner(self, attrsD):
        self.inpublisher = 1
        self.push('publisher', 0)

    def _end_itunes_owner(self):
        self.pop('publisher')
        self.inpublisher = 0
        self._sync_author_detail('publisher')

    def _start_contributor(self, attrsD):
        self.incontributor = 1
        context = self._getContext()
        context.setdefault('contributors', [])
        context['contributors'].append(FeedParserDict())
        self.push('contributor', 0)

    def _end_contributor(self):
        self.pop('contributor')
        self.incontributor = 0

    def _start_dc_contributor(self, attrsD):
        self.incontributor = 1
        context = self._getContext()
        context.setdefault('contributors', [])
        context['contributors'].append(FeedParserDict())
        self.push('name', 0)

    def _end_dc_contributor(self):
        self._end_name()
        self.incontributor = 0

    def _start_name(self, attrsD):
        self.push('name', 0)
    _start_itunes_name = _start_name

    def _end_name(self):
        value = self.pop('name')
        if self.inpublisher:
            self._save_author('name', value, 'publisher')
        elif self.inauthor:
            self._save_author('name', value)
        elif self.incontributor:
            self._save_contributor('name', value)
        elif self.intextinput:
            context = self._getContext()
            context['textinput']['name'] = value
    _end_itunes_name = _end_name

    def _start_width(self, attrsD):
        self.push('width', 0)

    def _end_width(self):
        value = self.pop('width')
        try:
            value = int(value)
        except:
            value = 0
        if self.inimage:
            context = self._getContext()
            context['image']['width'] = value

    def _start_height(self, attrsD):
        self.push('height', 0)

    def _end_height(self):
        value = self.pop('height')
        try:
            value = int(value)
        except:
            value = 0
        if self.inimage:
            context = self._getContext()
            context['image']['height'] = value

    def _start_url(self, attrsD):
        self.push('href', 1)
    _start_homepage = _start_url
    _start_uri = _start_url

    def _end_url(self):
        value = self.pop('href')
        if self.inauthor:
            self._save_author('href', value)
        elif self.incontributor:
            self._save_contributor('href', value)
        elif self.inimage:
            context = self._getContext()
            context['image']['href'] = value
        elif self.intextinput:
            context = self._getContext()
            context['textinput']['link'] = value
    _end_homepage = _end_url
    _end_uri = _end_url

    def _start_email(self, attrsD):
        self.push('email', 0)
    _start_itunes_email = _start_email

    def _end_email(self):
        value = self.pop('email')
        if self.inpublisher:
            self._save_author('email', value, 'publisher')
        elif self.inauthor:
            self._save_author('email', value)
        elif self.incontributor:
            self._save_contributor('email', value)
    _end_itunes_email = _end_email

    def _getContext(self):
        if self.insource:
            context = self.sourcedata
        elif self.inentry:
            context = self.entries[-1]
        else:
            context = self.feeddata
        return context

    def _save_author(self, key, value, prefix='author'):
        context = self._getContext()
        context.setdefault(prefix + '_detail', FeedParserDict())
        context[prefix + '_detail'][key] = value
        self._sync_author_detail()

    def _save_contributor(self, key, value):
        context = self._getContext()
        context.setdefault('contributors', [FeedParserDict()])
        context['contributors'][-1][key] = value

    def _sync_author_detail(self, key='author'):
        context = self._getContext()
        detail = context.get('%s_detail' % key)
        if detail:
            name = detail.get('name')
            email = detail.get('email')
            if name and email:
                context[key] = '%s (%s)' % (name, email)
            elif name:
                context[key] = name
            elif email:
                context[key] = email
        else:
            author = context.get(key)
            if not author: return
            emailmatch = re.search(r'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))''', author)
            if not emailmatch: return
            email = emailmatch.group(0)
            # probably a better way to do the following, but it passes all the tests
            author = author.replace(email, '')
            author = author.replace('()', '')
            author = author.strip()
            if author and (author[0] == '('):
                author = author[1:]
            if author and (author[-1] == ')'):
                author = author[:-1]
            author = author.strip()
            context.setdefault('%s_detail' % key, FeedParserDict())
            context['%s_detail' % key]['name'] = author
            context['%s_detail' % key]['email'] = email

    def _start_subtitle(self, attrsD):
        self.pushContent('subtitle', attrsD, 'text/plain', 1)
    _start_tagline = _start_subtitle
    _start_itunes_subtitle = _start_subtitle

    def _end_subtitle(self):
        self.popContent('subtitle')
    _end_tagline = _end_subtitle
    _end_itunes_subtitle = _end_subtitle

    def _start_rights(self, attrsD):
        self.pushContent('rights', attrsD, 'text/plain', 1)
    _start_dc_rights = _start_rights
    _start_copyright = _start_rights

    def _end_rights(self):
        self.popContent('rights')
    _end_dc_rights = _end_rights
    _end_copyright = _end_rights

    def _start_item(self, attrsD):
        self.entries.append(FeedParserDict())
        self.push('item', 0)
        self.inentry = 1
        self.guidislink = 0
        id = self._getAttribute(attrsD, 'rdf:about')
        if id:
            context = self._getContext()
            context['id'] = id
        self._cdf_common(attrsD)
    _start_entry = _start_item
    _start_product = _start_item

    def _end_item(self):
        self.pop('item')
        self.inentry = 0
    _end_entry = _end_item

    def _start_dc_language(self, attrsD):
        self.push('language', 1)
    _start_language = _start_dc_language

    def _end_dc_language(self):
        self.lang = self.pop('language')
    _end_language = _end_dc_language

    def _start_dc_publisher(self, attrsD):
        self.push('publisher', 1)
    _start_webmaster = _start_dc_publisher

    def _end_dc_publisher(self):
        self.pop('publisher')
        self._sync_author_detail('publisher')
    _end_webmaster = _end_dc_publisher

    def _start_published(self, attrsD):
        self.push('published', 1)
    _start_dcterms_issued = _start_published
    _start_issued = _start_published

    def _end_published(self):
        value = self.pop('published')
        self._save('published_parsed', _parse_date(value))
    _end_dcterms_issued = _end_published
    _end_issued = _end_published

    def _start_updated(self, attrsD):
        self.push('updated', 1)
    _start_modified = _start_updated
    _start_dcterms_modified = _start_updated
    _start_pubdate = _start_updated
    _start_dc_date = _start_updated

    def _end_updated(self):
        value = self.pop('updated')
        parsed_value = _parse_date(value)
        self._save('updated_parsed', parsed_value)
    _end_modified = _end_updated
    _end_dcterms_modified = _end_updated
    _end_pubdate = _end_updated
    _end_dc_date = _end_updated

    def _start_created(self, attrsD):
        self.push('created', 1)
    _start_dcterms_created = _start_created

    def _end_created(self):
        value = self.pop('created')
        self._save('created_parsed', _parse_date(value))
    _end_dcterms_created = _end_created

    def _start_expirationdate(self, attrsD):
        self.push('expired', 1)

    def _end_expirationdate(self):
        self._save('expired_parsed', _parse_date(self.pop('expired')))

    def _start_cc_license(self, attrsD):
        self.push('license', 1)
        value = self._getAttribute(attrsD, 'rdf:resource')
        if value:
            self.elementstack[-1][2].append(value)
        self.pop('license')

    def _start_creativecommons_license(self, attrsD):
        self.push('license', 1)

    def _end_creativecommons_license(self):
        self.pop('license')

    def _addTag(self, term, scheme, label):
        context = self._getContext()
        tags = context.setdefault('tags', [])
        if (not term) and (not scheme) and (not label): return
        value = FeedParserDict({'term': term, 'scheme': scheme, 'label': label})
        if value not in tags:
            tags.append(FeedParserDict({'term': term, 'scheme': scheme, 'label': label}))

    def _start_category(self, attrsD):
        if _debug: sys.stderr.write('entering _start_category with %s\n' % repr(attrsD))
        term = attrsD.get('term')
        scheme = attrsD.get('scheme', attrsD.get('domain'))
        label = attrsD.get('label')
        self._addTag(term, scheme, label)
        self.push('category', 1)
    _start_dc_subject = _start_category
    _start_keywords = _start_category

    def _end_itunes_keywords(self):
        for term in self.pop('itunes_keywords').split():
            self._addTag(term, 'http://www.itunes.com/', None)

    def _start_itunes_category(self, attrsD):
        self._addTag(attrsD.get('text'), 'http://www.itunes.com/', None)
        self.push('category', 1)

    def _end_category(self):
        value = self.pop('category')
        if not value: return
        context = self._getContext()
        tags = context['tags']
        if value and len(tags) and not tags[-1]['term']:
            tags[-1]['term'] = value
        else:
            self._addTag(value, None, None)
    _end_dc_subject = _end_category
    _end_keywords = _end_category
    _end_itunes_category = _end_category

    def _start_cloud(self, attrsD):
        self._getContext()['cloud'] = FeedParserDict(attrsD)

    def _start_link(self, attrsD):
        attrsD.setdefault('rel', 'alternate')
        attrsD.setdefault('type', 'text/html')
        attrsD = self._itsAnHrefDamnIt(attrsD)
        if attrsD.has_key('href'):
            attrsD['href'] = self.resolveURI(attrsD['href'])
        expectingText = self.infeed or self.inentry or self.insource
        context = self._getContext()
        context.setdefault('links', [])
        context['links'].append(FeedParserDict(attrsD))
        if attrsD['rel'] == 'enclosure':
            self._start_enclosure(attrsD)
        if attrsD.has_key('href'):
            expectingText = 0
            if (attrsD.get('rel') == 'alternate') and (self.mapContentType(attrsD.get('type')) in self.html_types):
                context['link'] = attrsD['href']
        else:
            self.push('link', expectingText)
    _start_producturl = _start_link

    def _end_link(self):
        value = self.pop('link')
        context = self._getContext()
        if self.intextinput:
            context['textinput']['link'] = value
        if self.inimage:
            context['image']['link'] = value
    _end_producturl = _end_link

    def _start_guid(self, attrsD):
        self.guidislink = (attrsD.get('ispermalink', 'true') == 'true')
        self.push('id', 1)

    def _end_guid(self):
        value = self.pop('id')
        self._save('guidislink', self.guidislink and not self._getContext().has_key('link'))
        self._save('id', value)
        if self.guidislink:
            # guid acts as link, but only if 'ispermalink' is not present or is 'true',
            # and only if the item doesn't already have a link element
            self._save('link', value)

    def _start_title(self, attrsD):
        self.pushContent('title', attrsD, 'text/plain', self.infeed or self.inentry or self.insource)
    _start_dc_title = _start_title
    _start_media_title = _start_title

    def _end_title(self):
        value = self.popContent('title')
        context = self._getContext()
        if self.intextinput:
            context['textinput']['title'] = value
        elif self.inimage:
            context['image']['title'] = value
    _end_dc_title = _end_title
    _end_media_title = _end_title

    def _start_description(self, attrsD):
        context = self._getContext()
        if context.has_key('summary'):
            self._summaryKey = 'content'
            self._start_content(attrsD)
        else:
            self.pushContent('description', attrsD, 'text/html', self.infeed or self.inentry or self.insource)

    def _start_abstract(self, attrsD):
        self.pushContent('description', attrsD, 'text/plain', self.infeed or self.inentry or self.insource)

    def _end_description(self):
        if self._summaryKey == 'content':
            self._end_content()
        else:
            value = self.popContent('description')
            context = self._getContext()
            if self.intextinput:
                context['textinput']['description'] = value
            elif self.inimage:
                context['image']['description'] = value
        self._summaryKey = None
    _end_abstract = _end_description

    def _start_info(self, attrsD):
        self.pushContent('info', attrsD, 'text/plain', 1)
    _start_feedburner_browserfriendly = _start_info

    def _end_info(self):
        self.popContent('info')
    _end_feedburner_browserfriendly = _end_info

    def _start_generator(self, attrsD):
        if attrsD:
            attrsD = self._itsAnHrefDamnIt(attrsD)
            if attrsD.has_key('href'):
                attrsD['href'] = self.resolveURI(attrsD['href'])
        self._getContext()['generator_detail'] = FeedParserDict(attrsD)
        self.push('generator', 1)

    def _end_generator(self):
        value = self.pop('generator')
        context = self._getContext()
        if context.has_key('generator_detail'):
            context['generator_detail']['name'] = value

    def _start_admin_generatoragent(self, attrsD):
        self.push('generator', 1)
        value = self._getAttribute(attrsD, 'rdf:resource')
        if value:
            self.elementstack[-1][2].append(value)
        self.pop('generator')
        self._getContext()['generator_detail'] = FeedParserDict({'href': value})

    def _start_admin_errorreportsto(self, attrsD):
        self.push('errorreportsto', 1)
        value = self._getAttribute(attrsD, 'rdf:resource')
        if value:
            self.elementstack[-1][2].append(value)
        self.pop('errorreportsto')

    def _start_summary(self, attrsD):
        context = self._getContext()
        if context.has_key('summary'):
            self._summaryKey = 'content'
            self._start_content(attrsD)
        else:
            self._summaryKey = 'summary'
            self.pushContent(self._summaryKey, attrsD, 'text/plain', 1)
    _start_itunes_summary = _start_summary

    def _end_summary(self):
        if self._summaryKey == 'content':
            self._end_content()
        else:
            self.popContent(self._summaryKey or 'summary')
        self._summaryKey = None
    _end_itunes_summary = _end_summary

    def _start_enclosure(self, attrsD):
        attrsD = self._itsAnHrefDamnIt(attrsD)
        self._getContext().setdefault('enclosures', []).append(FeedParserDict(attrsD))
        href = attrsD.get('href')
        if href:
            context = self._getContext()
            if not context.get('id'):
                context['id'] = href

    def _start_source(self, attrsD):
        self.insource = 1

    def _end_source(self):
        self.insource = 0
        self._getContext()['source'] = copy.deepcopy(self.sourcedata)
        self.sourcedata.clear()

    def _start_content(self, attrsD):
        self.pushContent('content', attrsD, 'text/plain', 1)
        src = attrsD.get('src')
        if src:
            self.contentparams['src'] = src
        self.push('content', 1)

    def _start_prodlink(self, attrsD):
        self.pushContent('content', attrsD, 'text/html', 1)

    def _start_body(self, attrsD):
        self.pushContent('content', attrsD, 'application/xhtml+xml', 1)
    _start_xhtml_body = _start_body

    def _start_content_encoded(self, attrsD):
        self.pushContent('content', attrsD, 'text/html', 1)
    _start_fullitem = _start_content_encoded

    def _end_content(self):
        copyToDescription = self.mapContentType(self.contentparams.get('type')) in (['text/plain'] + self.html_types)
        value = self.popContent('content')
        if copyToDescription:
            self._save('description', value)
    _end_body = _end_content
    _end_xhtml_body = _end_content
    _end_content_encoded = _end_content
    _end_fullitem = _end_content
    _end_prodlink = _end_content

    def _start_itunes_image(self, attrsD):
        self.push('itunes_image', 0)
        self._getContext()['image'] = FeedParserDict({'href': attrsD.get('href')})
    _start_itunes_link = _start_itunes_image

    def _end_itunes_block(self):
        value = self.pop('itunes_block', 0)
        self._getContext()['itunes_block'] = (value == 'yes') and 1 or 0

    def _end_itunes_explicit(self):
        value = self.pop('itunes_explicit', 0)
        self._getContext()['itunes_explicit'] = (value == 'yes') and 1 or 0

class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler):
    def __init__(self, baseuri, baselang, encoding):
        if _debug: sys.stderr.write('trying StrictFeedParser\n')
        xml.sax.handler.ContentHandler.__init__(self)
        _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
        self.bozo = 0
        self.exc = None

    def startPrefixMapping(self, prefix, uri):
        self.trackNamespace(prefix, uri)

    def startElementNS(self, name, qname, attrs):
        namespace, localname = name
        lowernamespace = str(namespace or '').lower()
        if lowernamespace.find('backend.userland.com/rss') != -1:
            # match any backend.userland.com namespace
            namespace = 'http://backend.userland.com/rss'
            lowernamespace = namespace
        if qname and qname.find(':') > 0:
            givenprefix = qname.split(':')[0]
        else:
            givenprefix = None
        prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
        if givenprefix and (prefix == None or (prefix == '' and lowernamespace == '')) and not self.namespacesInUse.has_key(givenprefix):
            raise UndeclaredNamespace, "'%s' is not associated with a namespace" % givenprefix
        if prefix:
            localname = prefix + ':' + localname
        localname = str(localname).lower()
        if _debug: sys.stderr.write('startElementNS: qname = %s, namespace = %s, givenprefix = %s, prefix = %s, attrs = %s, localname = %s\n' % (qname, namespace, givenprefix, prefix, attrs.items(), localname))

        # qname implementation is horribly broken in Python 2.1 (it
        # doesn't report any), and slightly broken in Python 2.2 (it
        # doesn't report the xml: namespace). So we match up namespaces
        # with a known list first, and then possibly override them with
        # the qnames the SAX parser gives us (if indeed it gives us any
        # at all). Thanks to MatejC for helping me test this and
        # tirelessly telling me that it didn't work yet.
        attrsD = {}
        for (namespace, attrlocalname), attrvalue in attrs._attrs.items():
            lowernamespace = (namespace or '').lower()
            prefix = self._matchnamespaces.get(lowernamespace, '')
            if prefix:
                attrlocalname = prefix + ':' + attrlocalname
            attrsD[str(attrlocalname).lower()] = attrvalue
        for qname in attrs.getQNames():
            attrsD[str(qname).lower()] = attrs.getValueByQName(qname)
        self.unknown_starttag(localname, attrsD.items())

    def characters(self, text):
        self.handle_data(text)

    def endElementNS(self, name, qname):
        namespace, localname = name
        lowernamespace = str(namespace or '').lower()
        if qname and qname.find(':') > 0:
            givenprefix = qname.split(':')[0]
        else:
            givenprefix = ''
        prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
        if prefix:
            localname = prefix + ':' + localname
        localname = str(localname).lower()
        self.unknown_endtag(localname)

    def error(self, exc):
        self.bozo = 1
        self.exc = exc

    def fatalError(self, exc):
        self.error(exc)
        raise exc

class _BaseHTMLProcessor(sgmllib.SGMLParser):
    elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
      'img', 'input', 'isindex', 'link', 'meta', 'param']

    def __init__(self, encoding):
        self.encoding = encoding
        if _debug: sys.stderr.write('entering BaseHTMLProcessor, encoding=%s\n' % self.encoding)
        sgmllib.SGMLParser.__init__(self)

    def reset(self):
        self.pieces = []
        sgmllib.SGMLParser.reset(self)

    def _shorttag_replace(self, match):
        tag = match.group(1)
        if tag in self.elements_no_end_tag:
            return '<' + tag + ' />'
        else:
            return '<' + tag + '></' + tag + '>'

    def feed(self, data):
        data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'&lt;!\1', data)
        #data = re.sub(r'<(\S+?)\s*?/>', self._shorttag_replace, data) # bug [ 1399464 ] Bad regexp for _shorttag_replace
        data = re.sub(r'<([^<\s]+?)\s*/>', self._shorttag_replace, data)
        data = data.replace('&#39;', "'")
        data = data.replace('&#34;', '"')
        if self.encoding and type(data) == type(u''):
            data = data.encode(self.encoding)
        sgmllib.SGMLParser.feed(self, data)

    def normalize_attrs(self, attrs):
        # utility method to be called by descendants
        attrs = [(k.lower(), v) for k, v in attrs]
        attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
        return attrs

    def unknown_starttag(self, tag, attrs):
        # called for each start tag
        # attrs is a list of (attr, value) tuples
        # e.g. for <pre class='screen'>, tag='pre', attrs=[('class', 'screen')]
        if _debug: sys.stderr.write('_BaseHTMLProcessor, unknown_starttag, tag=%s\n' % tag)
        uattrs = []
        # thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds
        for key, value in attrs:
            if type(value) != type(u''):
                value = unicode(value, self.encoding)
            uattrs.append((unicode(key, self.encoding), value))
        strattrs = u''.join([u' %s="%s"' % (key, value) for key, value in uattrs]).encode(self.encoding)
        if tag in self.elements_no_end_tag:
            self.pieces.append('<%(tag)s%(strattrs)s />' % locals())
        else:
            self.pieces.append('<%(tag)s%(strattrs)s>' % locals())

    def unknown_endtag(self, tag):
        # called for each end tag, e.g. for </pre>, tag will be 'pre'
        # Reconstruct the original end tag.
        if tag not in self.elements_no_end_tag:
            self.pieces.append("</%(tag)s>" % locals())

    def handle_charref(self, ref):
        # called for each character reference, e.g. for '&#160;', ref will be '160'
        # Reconstruct the original character reference.
        self.pieces.append('&#%(ref)s;' % locals())

    def handle_entityref(self, ref):
        # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
        # Reconstruct the original entity reference.
        self.pieces.append('&%(ref)s;' % locals())

    def handle_data(self, text):
        # called for each block of plain text, i.e. outside of any tag and
        # not containing any character or entity references
        # Store the original text verbatim.
        if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_text, text=%s\n' % text)
        self.pieces.append(text)

    def handle_comment(self, text):
        # called for each HTML comment, e.g. <!-- insert Javascript code here -->
        # Reconstruct the original comment.
        self.pieces.append('<!--%(text)s-->' % locals())

    def handle_pi(self, text):
        # called for each processing instruction, e.g. <?instruction>
        # Reconstruct original processing instruction.
        self.pieces.append('<?%(text)s>' % locals())

    def handle_decl(self, text):
        # called for the DOCTYPE, if present, e.g.
        # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
        #     "http://www.w3.org/TR/html4/loose.dtd">
        # Reconstruct original DOCTYPE
        self.pieces.append('<!%(text)s>' % locals())

    _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
    def _scan_name(self, i, declstartpos):
        rawdata = self.rawdata
        n = len(rawdata)
        if i == n:
            return None, -1
        m = self._new_declname_match(rawdata, i)
        if m:
            s = m.group()
            name = s.strip()
            if (i + len(s)) == n:
                return None, -1 # end of buffer
            return name.lower(), m.end()
        else:
            self.handle_data(rawdata)
#            self.updatepos(declstartpos, i)
            return None, -1

    def output(self):
        '''Return processed HTML as a single string'''
        return ''.join([str(p) for p in self.pieces])
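
# Illustrative sketch (not part of the original module): _BaseHTMLProcessor
# round-trips HTML through sgmllib, normalizing empty elements along the way.
#
#   p = _BaseHTMLProcessor('utf-8')
#   p.feed('<img src="a.png"/><p>hello</p>')
#   print p.output()   # '<img src="a.png" /><p>hello</p>'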

class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor):
    def __init__(self, baseuri, baselang, encoding):
        sgmllib.SGMLParser.__init__(self)
        _FeedParserMixin.__init__(self, baseuri, baselang, encoding)

    def decodeEntities(self, element, data):
        data = data.replace('&#60;', '&lt;')
        data = data.replace('&#x3C;', '&lt;')
        data = data.replace('&#62;', '&gt;')
        data = data.replace('&#x3E;', '&gt;')
        data = data.replace('&#38;', '&amp;')
        data = data.replace('&#x26;', '&amp;')
        data = data.replace('&#34;', '&quot;')
        data = data.replace('&#x22;', '&quot;')
        data = data.replace('&#39;', '&apos;')
        data = data.replace('&#x27;', '&apos;')
        if self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
            data = data.replace('&lt;', '<')
            data = data.replace('&gt;', '>')
            data = data.replace('&amp;', '&')
            data = data.replace('&quot;', '"')
            data = data.replace('&apos;', "'")
        return data
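
# Illustrative sketch (not part of the original module): in the loose parser,
# escaped markup inside non-XML content is decoded one level, so a feed that
# escaped its embedded HTML comes out as real markup.
#
#   p = _LooseFeedParser('', None, 'utf-8')
#   p.contentparams = FeedParserDict({'type': 'text/html'})
#   p.decodeEntities('summary', '&lt;b&gt;bold&lt;/b&gt;')   # -> '<b>bold</b>'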

class _RelativeURIResolver(_BaseHTMLProcessor):
    relative_uris = [('a', 'href'),
                     ('applet', 'codebase'),
                     ('blockquote', 'cite'),
                     ('body', 'background'),
                     ('frame', 'longdesc'),
                     ('iframe', 'longdesc'),
                     ('head', 'profile'),
                     ('img', 'longdesc'),
                     ('input', 'usemap'),
                     ('object', 'classid'),
                     ('object', 'codebase'),
                     ('object', 'usemap')]

    def __init__(self, baseuri, encoding):
        _BaseHTMLProcessor.__init__(self, encoding)
        self.baseuri = baseuri

    def resolveURI(self, uri):
        return _urljoin(self.baseuri, uri)

    def unknown_starttag(self, tag, attrs):
        attrs = self.normalize_attrs(attrs)
        attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs]
        _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)

def _resolveRelativeURIs(htmlSource, baseURI, encoding):
    if _debug: sys.stderr.write('entering _resolveRelativeURIs\n')
    p = _RelativeURIResolver(baseURI, encoding)
    p.feed(htmlSource)
    return p.output()
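
# Illustrative sketch (not part of the original module): relative links in
# embedded markup are rewritten against the feed's base URI.
#
#   print _resolveRelativeURIs('<a href="/about">about</a>',
#                              'http://example.com/feed/', 'utf-8')
#   # '<a href="http://example.com/about">about</a>'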

class _HTMLSanitizer(_BaseHTMLProcessor):
    acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', 'big',
      'blockquote', 'br', 'button', 'caption', 'center', 'cite', 'code', 'col',
      'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'fieldset',
      'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input',
      'ins', 'kbd', 'label', 'legend', 'li', 'map', 'menu', 'ol', 'optgroup',
      'option', 'p', 'pre', 'q', 's', 'samp', 'select', 'small', 'span', 'strike',
      'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'tfoot', 'th',
      'thead', 'tr', 'tt', 'u', 'ul', 'var']

    acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
      'action', 'align', 'alt', 'axis', 'border', 'cellpadding', 'cellspacing',
      'char', 'charoff', 'charset', 'checked', 'cite', 'class', 'clear', 'cols',
      'colspan', 'color', 'compact', 'coords', 'datetime', 'dir', 'disabled',
      'enctype', 'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace',
      'id', 'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method',
      'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly',
      'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
      'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title', 'type',
      'usemap', 'valign', 'value', 'vspace', 'width']

    unacceptable_elements_with_end_tag = ['script', 'applet']

    def reset(self):
        _BaseHTMLProcessor.reset(self)
        self.unacceptablestack = 0

    def unknown_starttag(self, tag, attrs):
        if not tag in self.acceptable_elements:
            if tag in self.unacceptable_elements_with_end_tag:
                self.unacceptablestack += 1
            return
        attrs = self.normalize_attrs(attrs)
        attrs = [(key, value) for key, value in attrs if key in self.acceptable_attributes]
        _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)

    def unknown_endtag(self, tag):
        if not tag in self.acceptable_elements:
            if tag in self.unacceptable_elements_with_end_tag:
                self.unacceptablestack -= 1
            return
        _BaseHTMLProcessor.unknown_endtag(self, tag)

    def handle_pi(self, text):
        pass

    def handle_decl(self, text):
        pass

    def handle_data(self, text):
        if not self.unacceptablestack:
            _BaseHTMLProcessor.handle_data(self, text)

def _sanitizeHTML(htmlSource, encoding):
    p = _HTMLSanitizer(encoding)
    p.feed(htmlSource)
    data = p.output()
    if TIDY_MARKUP:
        # loop through list of preferred Tidy interfaces looking for one that's installed,
        # then set up a common _tidy function to wrap the interface-specific API.
        _tidy = None
        for tidy_interface in PREFERRED_TIDY_INTERFACES:
            try:
                if tidy_interface == "uTidy":
                    from tidy import parseString as _utidy
                    def _tidy(data, **kwargs):
                        return str(_utidy(data, **kwargs))
                    break
                elif tidy_interface == "mxTidy":
                    from mx.Tidy import Tidy as _mxtidy
                    def _tidy(data, **kwargs):
                        nerrors, nwarnings, data, errordata = _mxtidy.tidy(data, **kwargs)
                        return data
                    break
            except:
                pass
        if _tidy:
            utf8 = type(data) == type(u'')
            if utf8:
                data = data.encode('utf-8')
            data = _tidy(data, output_xhtml=1, numeric_entities=1, wrap=0, char_encoding="utf8")
            if utf8:
                data = unicode(data, 'utf-8')
            if data.count('<body'):
                data = data.split('<body', 1)[1]
                if data.count('>'):
                    data = data.split('>', 1)[1]
            if data.count('</body'):
                data = data.split('</body', 1)[0]
    data = data.strip().replace('\r\n', '\n')
    return data
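
# Illustrative sketch (not part of the original module): elements not on the
# whitelist are dropped, and script/applet bodies are suppressed entirely.
#
#   print _sanitizeHTML('<p>hi</p><script>alert("evil")</script>', 'utf-8')
#   # '<p>hi</p>'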

class _FeedURLHandler(urllib2.HTTPDigestAuthHandler, urllib2.HTTPRedirectHandler, urllib2.HTTPDefaultErrorHandler):
    def http_error_default(self, req, fp, code, msg, headers):
        if ((code / 100) == 3) and (code != 304):
            return self.http_error_302(req, fp, code, msg, headers)
        infourl = urllib.addinfourl(fp, headers, req.get_full_url())
        infourl.status = code
        return infourl

    def http_error_302(self, req, fp, code, msg, headers):
        if headers.dict.has_key('location'):
            infourl = urllib2.HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers)
        else:
            infourl = urllib.addinfourl(fp, headers, req.get_full_url())
        if not hasattr(infourl, 'status'):
            infourl.status = code
        return infourl

    def http_error_301(self, req, fp, code, msg, headers):
        if headers.dict.has_key('location'):
            infourl = urllib2.HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, headers)
        else:
            infourl = urllib.addinfourl(fp, headers, req.get_full_url())
        if not hasattr(infourl, 'status'):
            infourl.status = code
        return infourl

    http_error_300 = http_error_302
    http_error_303 = http_error_302
    http_error_307 = http_error_302

    def http_error_401(self, req, fp, code, msg, headers):
        # Check if
        # - server requires digest auth, AND
        # - we tried (unsuccessfully) with basic auth, AND
        # - we're using Python 2.3.3 or later (digest auth is irreparably broken in earlier versions)
        # If all conditions hold, parse authentication information
        # out of the Authorization header we sent the first time
        # (for the username and password) and the WWW-Authenticate
        # header the server sent back (for the realm) and retry
        # the request with the appropriate digest auth headers instead.
        # This evil genius hack has been brought to you by Aaron Swartz.
        host = urlparse.urlparse(req.get_full_url())[1]
        try:
            assert sys.version.split()[0] >= '2.3.3'
            assert base64 != None
            user, passw = base64.decodestring(req.headers['Authorization'].split(' ')[1]).split(':')
            realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0]
            self.add_password(realm, host, user, passw)
            retry = self.http_error_auth_reqed('www-authenticate', host, req, headers)
            self.reset_retry_count()
            return retry
        except:
            return self.http_error_default(req, fp, code, msg, headers)
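
# Illustrative sketch (not part of the original module): extra urllib2
# handlers (proxies, authentication, ...) are combined with _FeedURLHandler
# when a feed is fetched; parse() passes them through in its handlers
# argument. The proxy URL below is hypothetical.
#
#   proxy = urllib2.ProxyHandler({'http': 'http://proxy.example.com:3128'})
#   d = parse('http://feedparser.org/docs/examples/atom10.xml', handlers=[proxy])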

def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers):
    """URL, filename, or string --> stream

    This function lets you define parsers that take any input source
    (URL, pathname to local or network file, or actual data as a string)
    and deal with it in a uniform manner. Returned object is guaranteed
    to have all the basic stdio read methods (read, readline, readlines).
    Just .close() the object when you're done with it.

    If the etag argument is supplied, it will be used as the value of an
    If-None-Match request header.

    If the modified argument is supplied, it must be a tuple of 9 integers
    as returned by gmtime() in the standard Python time module. This MUST
    be in GMT (Greenwich Mean Time). The formatted date/time will be used
    as the value of an If-Modified-Since request header.

    If the agent argument is supplied, it will be used as the value of a
    User-Agent request header.

    If the referrer argument is supplied, it will be used as the value of a
    Referer[sic] request header.

    If handlers is supplied, it is a list of handlers used to build a
    urllib2 opener.
    """
1770
if hasattr(url_file_stream_or_string, 'read'):
1771
return url_file_stream_or_string
1773
if url_file_stream_or_string == '-':
1776
if urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp'):
1779
# test for inline user:password for basic auth
1782
urltype, rest = urllib.splittype(url_file_stream_or_string)
1783
realhost, rest = urllib.splithost(rest)
1785
user_passwd, realhost = urllib.splituser(realhost)
1787
url_file_stream_or_string = '%s://%s%s' % (urltype, realhost, rest)
1788
auth = base64.encodestring(user_passwd).strip()
1789
# try to open with urllib2 (to use optional headers)
1790
request = urllib2.Request(url_file_stream_or_string)
1791
request.add_header('User-Agent', agent)
1793
request.add_header('If-None-Match', etag)
1795
# format into an RFC 1123-compliant timestamp. We can't use
1796
# time.strftime() since the %a and %b directives can be affected
1797
# by the current locale, but RFC 2616 states that dates must be
1799
short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
1800
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
1801
request.add_header('If-Modified-Since', '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5]))
1803
request.add_header('Referer', referrer)
1805
request.add_header('Accept-encoding', 'gzip, deflate')
1807
request.add_header('Accept-encoding', 'gzip')
1809
request.add_header('Accept-encoding', 'deflate')
1811
request.add_header('Accept-encoding', '')
1813
request.add_header('Authorization', 'Basic %s' % auth)
1815
request.add_header('Accept', ACCEPT_HEADER)
1816
request.add_header('A-IM', 'feed') # RFC 3229 support
1817
opener = apply(urllib2.build_opener, tuple([_FeedURLHandler()] + handlers))
1818
opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent
1820
return opener.open(request)
1822
opener.close() # JohnD
1824
# try to open with native open function (if url_file_stream_or_string is a filename)
1826
return open(url_file_stream_or_string)
1830
# treat url_file_stream_or_string as string
1831
return _StringIO(str(url_file_stream_or_string))
1834
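# Example (an illustrative sketch): _open_resource() accepts a URL, a local
# filename, an already-open file-like object, or the document itself as a
# string, and always hands back something with the stdio read methods:
#
#     f = _open_resource('http://feedparser.org/docs/examples/atom10.xml',
#                        None, None, USER_AGENT, None, [])
#     data = f.read()
#     f.close()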
_date_handlers = []
def registerDateHandler(func):
    '''Register a date handler function (takes string, returns 9-tuple date in GMT)'''
    _date_handlers.insert(0, func)
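# Example (an illustrative sketch): callers can teach the parser new date
# formats at runtime.  A handler takes the raw string and returns a 9-tuple
# in GMT, or None if it doesn't recognize the format.  The 'YYYY.MM.DD'
# format and handler name below are hypothetical:
#
#     import calendar
#     def _parse_date_dotted(dateString):
#         m = re.match(r'(\d{4})\.(\d{2})\.(\d{2})$', dateString)
#         if not m: return None
#         return time.gmtime(calendar.timegm((int(m.group(1)),
#             int(m.group(2)), int(m.group(3)), 0, 0, 0, 0, 0, 0)))
#     registerDateHandler(_parse_date_dotted)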
# ISO-8601 date parsing routines written by Fazal Majid.
# The ISO 8601 standard is very convoluted and irregular - a full ISO 8601
# parser is beyond the scope of feedparser and would be a worthwhile addition
# to the Python library.
# A single regular expression cannot parse ISO 8601 date formats into groups
# as the standard is highly irregular (for instance, 030104 is ambiguous
# between 2003-01-04 and 0301-04-01), so we use templates instead.
# Please note the order in templates is significant because we need a
# greedy match.
_iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-MM', 'YYYY-?OOO',
                'YY-?MM-?DD', 'YY-?OOO', 'YYYY',
                '-YY-?MM', '-OOO', '-YY',
                '--MM-?DD', '--MM',
                '---DD',
                'CC', '']
_iso8601_re = [
    tmpl.replace(
    'YYYY', r'(?P<year>\d{4})').replace(
    'YY', r'(?P<year>\d\d)').replace(
    'MM', r'(?P<month>[01]\d)').replace(
    'DD', r'(?P<day>[0123]\d)').replace(
    'OOO', r'(?P<ordinal>[0123]\d\d)').replace(
    'CC', r'(?P<century>\d\d$)')
    + r'(T?(?P<hour>\d{2}):(?P<minute>\d{2})'
    + r'(:(?P<second>\d{2}))?'
    + r'(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?'
    for tmpl in _iso8601_tmpl]
_iso8601_matches = [re.compile(regex).match for regex in _iso8601_re]
def _parse_date_iso8601(dateString):
    '''Parse a variety of ISO-8601-compatible formats like 20040105'''
    m = None
    for _iso8601_match in _iso8601_matches:
        m = _iso8601_match(dateString)
        if m: break
    if not m: return
    if m.span() == (0, 0): return
    params = m.groupdict()
    ordinal = params.get('ordinal', 0)
    if ordinal:
        ordinal = int(ordinal)
    else:
        ordinal = 0
    year = params.get('year', '--')
    if not year or year == '--':
        year = time.gmtime()[0]
    elif len(year) == 2:
        # ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993
        year = 100 * int(time.gmtime()[0] / 100) + int(year)
    else:
        year = int(year)
    month = params.get('month', '-')
    if not month or month == '-':
        # ordinals are NOT normalized by mktime, we simulate them
        # by setting month=1, day=ordinal
        if ordinal:
            month = 1
        else:
            month = time.gmtime()[1]
    month = int(month)
    day = params.get('day', 0)
    if not day:
        # see above
        if ordinal:
            day = ordinal
        elif params.get('century', 0) or \
            params.get('year', 0) or params.get('month', 0):
            day = 1
        else:
            day = time.gmtime()[2]
    else:
        day = int(day)
    # special case of the century - is the first year of the 21st century
    # 2000 or 2001 ? The debate goes on...
    if 'century' in params.keys():
        year = (int(params['century']) - 1) * 100 + 1
    # in ISO 8601 most fields are optional
    for field in ['hour', 'minute', 'second', 'tzhour', 'tzmin']:
        if not params.get(field, None):
            params[field] = 0
    hour = int(params.get('hour', 0))
    minute = int(params.get('minute', 0))
    second = int(params.get('second', 0))
    # weekday is normalized by mktime(), we can ignore it
    weekday = 0
    # daylight savings is complex, but not needed for feedparser's purposes
    # as time zones, if specified, include mention of whether it is active
    # (e.g. PST vs. PDT, CET).  Using -1 is implementation-dependent and
    # most implementations have DST bugs
    daylight_savings_flag = 0
    tm = [year, month, day, hour, minute, second, weekday,
          ordinal, daylight_savings_flag]
    # ISO 8601 time zone adjustments
    tz = params.get('tz')
    if tz and tz != 'Z':
        if tz[0] == '-':
            tm[3] += int(params.get('tzhour', 0))
            tm[4] += int(params.get('tzmin', 0))
        elif tz[0] == '+':
            tm[3] -= int(params.get('tzhour', 0))
            tm[4] -= int(params.get('tzmin', 0))
        else:
            return None
    # Python's time.mktime() is a wrapper around the ANSI C mktime(3c)
    # which is guaranteed to normalize d/m/y/h/m/s.
    # Many implementations have bugs, but we'll pretend they don't.
    return time.localtime(time.mktime(tm))
registerDateHandler(_parse_date_iso8601)
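# Examples of strings the templates above accept, all normalized to a GMT
# 9-tuple (shown for illustration, not exhaustive):
#
#     _parse_date_iso8601('2003-12-31T10:14:55Z')  # full W3C-style date/time
#     _parse_date_iso8601('20040105')              # compact YYYYMMDD
#     _parse_date_iso8601('2003-335')              # ordinal date (day 335 of 2003)
#     _parse_date_iso8601('03-12-31')              # two-digit year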
# 8-bit date handling routines written by ytrewq1.
_korean_year  = u'\ub144' # b3e2 in euc-kr
_korean_month = u'\uc6d4' # bff9 in euc-kr
_korean_day   = u'\uc77c' # c0cf in euc-kr
_korean_am    = u'\uc624\uc804' # bfc0 c0fc in euc-kr
_korean_pm    = u'\uc624\ud6c4' # bfc0 c8c4 in euc-kr

_korean_onblog_date_re = \
    re.compile('(\d{4})%s\s+(\d{2})%s\s+(\d{2})%s\s+(\d{2}):(\d{2}):(\d{2})' % \
               (_korean_year, _korean_month, _korean_day))
_korean_nate_date_re = \
    re.compile(u'(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})' % \
               (_korean_am, _korean_pm))
def _parse_date_onblog(dateString):
    '''Parse a string according to the OnBlog 8-bit date format'''
    m = _korean_onblog_date_re.match(dateString)
    if not m: return
    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
                {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
                 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\
                 'zonediff': '+09:00'}
    if _debug: sys.stderr.write('OnBlog date parsed as: %s\n' % w3dtfdate)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_onblog)

def _parse_date_nate(dateString):
    '''Parse a string according to the Nate 8-bit date format'''
    m = _korean_nate_date_re.match(dateString)
    if not m: return
    hour = int(m.group(5))
    ampm = m.group(4)
    if (ampm == _korean_pm):
        hour += 12
    hour = str(hour)
    if len(hour) == 1:
        hour = '0' + hour
    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
                {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
                 'hour': hour, 'minute': m.group(6), 'second': m.group(7),\
                 'zonediff': '+09:00'}
    if _debug: sys.stderr.write('Nate date parsed as: %s\n' % w3dtfdate)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_nate)
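# Both 8-bit Korean handlers above work the same way (shown for illustration):
# they rewrite the localized string into a W3DTF string with an explicit
# +09:00 (KST) offset and delegate to _parse_date_w3dtf.  For example, the
# OnBlog form
#
#     u'2004\ub144 05\uc6d4 28\uc77c 01:31:15'
#
# is rewritten to '2004-05-28T01:31:15+09:00' before parsing.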
_mssql_date_re = \
    re.compile('(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})(\.\d+)?')
def _parse_date_mssql(dateString):
    '''Parse a string according to the MS SQL date format'''
    m = _mssql_date_re.match(dateString)
    if not m: return
    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
                {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
                 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\
                 'zonediff': '+09:00'}
    if _debug: sys.stderr.write('MS SQL date parsed as: %s\n' % w3dtfdate)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_mssql)
# Unicode strings for Greek date strings
_greek_months = \
  { \
   u'\u0399\u03b1\u03bd': u'Jan', # c9e1ed in iso-8859-7
   u'\u03a6\u03b5\u03b2': u'Feb', # d6e5e2 in iso-8859-7
   u'\u039c\u03ac\u03ce': u'Mar', # ccdcfe in iso-8859-7
   u'\u039c\u03b1\u03ce': u'Mar', # cce1fe in iso-8859-7
   u'\u0391\u03c0\u03c1': u'Apr', # c1f0f1 in iso-8859-7
   u'\u039c\u03ac\u03b9': u'May', # ccdce9 in iso-8859-7
   u'\u039c\u03b1\u03ca': u'May', # cce1fa in iso-8859-7
   u'\u039c\u03b1\u03b9': u'May', # cce1e9 in iso-8859-7
   u'\u0399\u03bf\u03cd\u03bd': u'Jun', # c9effded in iso-8859-7
   u'\u0399\u03bf\u03bd': u'Jun', # c9efed in iso-8859-7
   u'\u0399\u03bf\u03cd\u03bb': u'Jul', # c9effdeb in iso-8859-7
   u'\u0399\u03bf\u03bb': u'Jul', # c9f9eb in iso-8859-7
   u'\u0391\u03cd\u03b3': u'Aug', # c1fde3 in iso-8859-7
   u'\u0391\u03c5\u03b3': u'Aug', # c1f5e3 in iso-8859-7
   u'\u03a3\u03b5\u03c0': u'Sep', # d3e5f0 in iso-8859-7
   u'\u039f\u03ba\u03c4': u'Oct', # cfeaf4 in iso-8859-7
   u'\u039d\u03bf\u03ad': u'Nov', # cdefdd in iso-8859-7
   u'\u039d\u03bf\u03b5': u'Nov', # cdefe5 in iso-8859-7
   u'\u0394\u03b5\u03ba': u'Dec', # c4e5ea in iso-8859-7
  }

_greek_wdays = \
  { \
   u'\u039a\u03c5\u03c1': u'Sun', # caf5f1 in iso-8859-7
   u'\u0394\u03b5\u03c5': u'Mon', # c4e5f5 in iso-8859-7
   u'\u03a4\u03c1\u03b9': u'Tue', # d4f1e9 in iso-8859-7
   u'\u03a4\u03b5\u03c4': u'Wed', # d4e5f4 in iso-8859-7
   u'\u03a0\u03b5\u03bc': u'Thu', # d0e5ec in iso-8859-7
   u'\u03a0\u03b1\u03c1': u'Fri', # d0e1f1 in iso-8859-7
   u'\u03a3\u03b1\u03b2': u'Sat', # d3e1e2 in iso-8859-7
  }

_greek_date_format_re = \
    re.compile(u'([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)')

def _parse_date_greek(dateString):
    '''Parse a string according to a Greek 8-bit date format.'''
    m = _greek_date_format_re.match(dateString)
    if not m: return
    try:
        wday = _greek_wdays[m.group(1)]
        month = _greek_months[m.group(3)]
    except:
        return
    rfc822date = '%(wday)s, %(day)s %(month)s %(year)s %(hour)s:%(minute)s:%(second)s %(zonediff)s' % \
                 {'wday': wday, 'day': m.group(2), 'month': month, 'year': m.group(4),\
                  'hour': m.group(5), 'minute': m.group(6), 'second': m.group(7),\
                  'zonediff': m.group(8)}
    if _debug: sys.stderr.write('Greek date parsed as: %s\n' % rfc822date)
    return _parse_date_rfc822(rfc822date)
registerDateHandler(_parse_date_greek)
# Unicode strings for Hungarian date strings
_hungarian_months = \
  { \
    u'janu\u00e1r':   u'01',  # e1 in iso-8859-2
    u'febru\u00e1ri': u'02',  # e1 in iso-8859-2
    u'm\u00e1rcius':  u'03',  # e1 in iso-8859-2
    u'\u00e1prilis':  u'04',  # e1 in iso-8859-2
    u'm\u00e1ujus':   u'05',  # e1 in iso-8859-2
    u'j\u00fanius':   u'06',  # fa in iso-8859-2
    u'j\u00falius':   u'07',  # fa in iso-8859-2
    u'augusztus':     u'08',
    u'szeptember':    u'09',
    u'okt\u00f3ber':  u'10',  # f3 in iso-8859-2
    u'november':      u'11',
    u'december':      u'12',
  }

_hungarian_date_format_re = \
  re.compile(u'(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})((\+|-)(\d{,2}:\d{2}))')

def _parse_date_hungarian(dateString):
    '''Parse a string according to a Hungarian 8-bit date format.'''
    m = _hungarian_date_format_re.match(dateString)
    if not m: return
    try:
        month = _hungarian_months[m.group(2)]
        day = m.group(3)
        if len(day) == 1:
            day = '0' + day
        hour = m.group(4)
        if len(hour) == 1:
            hour = '0' + hour
    except:
        return
    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s%(zonediff)s' % \
                {'year': m.group(1), 'month': month, 'day': day,\
                 'hour': hour, 'minute': m.group(5),\
                 'zonediff': m.group(6)}
    if _debug: sys.stderr.write('Hungarian date parsed as: %s\n' % w3dtfdate)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_hungarian)
# W3DTF-style date parsing adapted from PyXML xml.utils.iso8601, written by
# Drake and licensed under the Python license.  Removed all range checking
# for month, day, hour, minute, and second, since mktime will normalize
# these later
def _parse_date_w3dtf(dateString):
    def __extract_date(m):
        year = int(m.group('year'))
        if year < 100:
            year = 100 * int(time.gmtime()[0] / 100) + int(year)
        if year < 1000:
            return 0, 0, 0
        julian = m.group('julian')
        if julian:
            julian = int(julian)
            month = julian / 30 + 1
            day = julian % 30 + 1
            jday = None
            while jday != julian:
                t = time.mktime((year, month, day, 0, 0, 0, 0, 0, 0))
                jday = time.gmtime(t)[-2]
                diff = abs(jday - julian)
                if jday > julian:
                    if diff < day:
                        day = day - diff
                    else:
                        month = month - 1
                        day = 31
                elif jday < julian:
                    if day + diff < 28:
                        day = day + diff
                    else:
                        month = month + 1
            return year, month, day
        month = m.group('month')
        day = 1
        if month is None:
            month = 1
        else:
            month = int(month)
            day = m.group('day')
            if day:
                day = int(day)
            else:
                day = 1
        return year, month, day

    def __extract_time(m):
        if not m:
            return 0, 0, 0
        hours = m.group('hours')
        if not hours:
            return 0, 0, 0
        hours = int(hours)
        minutes = int(m.group('minutes'))
        seconds = m.group('seconds')
        if seconds:
            seconds = int(seconds)
        else:
            seconds = 0
        return hours, minutes, seconds

    def __extract_tzd(m):
        '''Return the Time Zone Designator as an offset in seconds from UTC.'''
        if not m:
            return 0
        tzd = m.group('tzd')
        if not tzd:
            return 0
        if tzd == 'Z':
            return 0
        hours = int(m.group('tzdhours'))
        minutes = m.group('tzdminutes')
        if minutes:
            minutes = int(minutes)
        else:
            minutes = 0
        offset = (hours*60 + minutes) * 60
        if tzd[0] == '+':
            return -offset
        return offset

    __date_re = ('(?P<year>\d\d\d\d)'
                 '(?:(?P<dsep>-|)'
                 '(?:(?P<julian>\d\d\d)'
                 '|(?P<month>\d\d)(?:(?P=dsep)(?P<day>\d\d))?))?')
    __tzd_re = '(?P<tzd>[-+](?P<tzdhours>\d\d)(?::?(?P<tzdminutes>\d\d))|Z)'
    __tzd_rx = re.compile(__tzd_re)
    __time_re = ('(?P<hours>\d\d)(?P<tsep>:|)(?P<minutes>\d\d)'
                 '(?:(?P=tsep)(?P<seconds>\d\d(?:[.,]\d+)?))?'
                 + __tzd_re)
    __datetime_re = '%s(?:T%s)?' % (__date_re, __time_re)
    __datetime_rx = re.compile(__datetime_re)
    m = __datetime_rx.match(dateString)
    if (m is None) or (m.group() != dateString): return
    gmt = __extract_date(m) + __extract_time(m) + (0, 0, 0)
    if gmt[0] == 0: return
    return time.gmtime(time.mktime(gmt) + __extract_tzd(m) - time.timezone)
registerDateHandler(_parse_date_w3dtf)
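# Examples of W3DTF/ISO 8601 profiles the function above accepts (shown for
# illustration, not exhaustive):
#
#     _parse_date_w3dtf('2003-12-31T10:14:55Z')       # UTC
#     _parse_date_w3dtf('2003-12-31T10:14:55-08:00')  # explicit timezone designator
#     _parse_date_w3dtf('2003-12-31')                 # date only
#     _parse_date_w3dtf('2003')                       # year only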
def _parse_date_rfc822(dateString):
    '''Parse an RFC822, RFC1123, RFC2822, or asctime-style date'''
    data = dateString.split()
    if data[0][-1] in (',', '.') or data[0].lower() in rfc822._daynames:
        del data[0]
    if len(data) == 4:
        s = data[3]
        i = s.find('+')
        if i > 0:
            data[3:] = [s[:i], s[i+1:]]
        else:
            data.append('')
        dateString = " ".join(data)
    if len(data) < 5:
        dateString += ' 00:00:00 GMT'
    tm = rfc822.parsedate_tz(dateString)
    if tm:
        return time.gmtime(rfc822.mktime_tz(tm))
# rfc822.py defines several time zones, but we define some extra ones.
# 'ET' is equivalent to 'EST', etc.
_additional_timezones = {'AT': -400, 'ET': -500, 'CT': -600, 'MT': -700, 'PT': -800}
rfc822._timezones.update(_additional_timezones)
registerDateHandler(_parse_date_rfc822)
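# Examples (illustrative): the extra zone names and the 00:00:00 fallback
# above make dates like these parseable:
#
#     _parse_date_rfc822('Thu, 01 Jan 2004 19:48:21 ET')  # 'ET' treated as EST
#     _parse_date_rfc822('01 Jan 2004')  # no time; normalized to 00:00:00 GMT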
def _parse_date(dateString):
    '''Parses a variety of date formats into a 9-tuple in GMT'''
    for handler in _date_handlers:
        try:
            date9tuple = handler(dateString)
            if not date9tuple: continue
            if len(date9tuple) != 9:
                if _debug: sys.stderr.write('date handler function must return 9-tuple\n')
                raise ValueError
            map(int, date9tuple)
            return date9tuple
        except Exception, e:
            if _debug: sys.stderr.write('%s raised %s\n' % (handler.__name__, repr(e)))
            pass
    return None
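# Example (illustrative): _parse_date is the single entry point.  It walks
# the registered handlers -- most recently registered first, because
# registerDateHandler inserts at position 0 -- until one returns a valid
# 9-tuple:
#
#     _parse_date('Thu, 01 Jan 2004 19:48:21 GMT')  # RFC 822 handler matches
#     _parse_date('2004-01-01T19:48:21+00:00')      # W3DTF handler matches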
def _getCharacterEncoding(http_headers, xml_data):
    '''Get the character encoding of the XML document

    http_headers is a dictionary
    xml_data is a raw string (not Unicode)

    This is so much trickier than it sounds, it's not even funny.
    According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type
    is application/xml, application/*+xml,
    application/xml-external-parsed-entity, or application/xml-dtd,
    the encoding given in the charset parameter of the HTTP Content-Type
    takes precedence over the encoding given in the XML prefix within the
    document, and defaults to 'utf-8' if neither are specified.  But, if
    the HTTP Content-Type is text/xml, text/*+xml, or
    text/xml-external-parsed-entity, the encoding given in the XML prefix
    within the document is ALWAYS IGNORED and only the encoding given in
    the charset parameter of the HTTP Content-Type header should be
    respected, and it defaults to 'us-ascii' if not specified.

    Furthermore, discussion on the atom-syntax mailing list with the
    author of RFC 3023 leads me to the conclusion that any document
    served with a Content-Type of text/* and no charset parameter
    must be treated as us-ascii.  (We now do this.)  And also that it
    must always be flagged as non-well-formed.  (We now do this too.)

    If Content-Type is unspecified (input was local file or non-HTTP source)
    or unrecognized (server just got it totally wrong), then go by the
    encoding given in the XML prefix of the document and default to
    'iso-8859-1' as per the HTTP specification (RFC 2616).

    Then, assuming we didn't find a character encoding in the HTTP headers
    (and the HTTP Content-type allowed us to look in the body), we need
    to sniff the first few bytes of the XML data and try to determine
    whether the encoding is ASCII-compatible.  Section F of the XML
    specification shows the way here:
    http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info

    If the sniffed encoding is not ASCII-compatible, we need to make it
    ASCII compatible so that we can sniff further into the XML declaration
    to find the encoding attribute, which will tell us the true encoding.

    Of course, none of this guarantees that we will be able to parse the
    feed in the declared character encoding (assuming it was declared
    correctly, which many are not).  CJKCodecs and iconv_codec help a lot;
    you should definitely install them if you can.
    http://cjkpython.i18n.org/
    '''

    def _parseHTTPContentType(content_type):
        '''takes HTTP Content-Type header and returns (content type, charset)

        If no charset is specified, returns (content type, '')
        If no content type is specified, returns ('', '')
        Both return parameters are guaranteed to be lowercase strings
        '''
        content_type = content_type or ''
        content_type, params = cgi.parse_header(content_type)
        return content_type, params.get('charset', '').replace("'", '')

    sniffed_xml_encoding = ''
    xml_encoding = ''
    true_encoding = ''
    http_content_type, http_encoding = _parseHTTPContentType(http_headers.get('content-type'))
    # Must sniff for non-ASCII-compatible character encodings before
    # searching for XML declaration.  This heuristic is defined in
    # section F of the XML specification:
    # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
    try:
        if xml_data[:4] == '\x4c\x6f\xa7\x94':
            # EBCDIC
            xml_data = _ebcdic_to_ascii(xml_data)
        elif xml_data[:4] == '\x00\x3c\x00\x3f':
            # UTF-16BE
            sniffed_xml_encoding = 'utf-16be'
            xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
        elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') and (xml_data[2:4] != '\x00\x00'):
            # UTF-16BE with BOM
            sniffed_xml_encoding = 'utf-16be'
            xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
        elif xml_data[:4] == '\x3c\x00\x3f\x00':
            # UTF-16LE
            sniffed_xml_encoding = 'utf-16le'
            xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
        elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and (xml_data[2:4] != '\x00\x00'):
            # UTF-16LE with BOM
            sniffed_xml_encoding = 'utf-16le'
            xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
        elif xml_data[:4] == '\x00\x00\x00\x3c':
            # UTF-32BE
            sniffed_xml_encoding = 'utf-32be'
            xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
        elif xml_data[:4] == '\x3c\x00\x00\x00':
            # UTF-32LE
            sniffed_xml_encoding = 'utf-32le'
            xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
        elif xml_data[:4] == '\x00\x00\xfe\xff':
            # UTF-32BE with BOM
            sniffed_xml_encoding = 'utf-32be'
            xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
        elif xml_data[:4] == '\xff\xfe\x00\x00':
            # UTF-32LE with BOM
            sniffed_xml_encoding = 'utf-32le'
            xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
        elif xml_data[:3] == '\xef\xbb\xbf':
            # UTF-8 with BOM
            sniffed_xml_encoding = 'utf-8'
            xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
        else:
            # ASCII-compatible
            pass
        xml_encoding_match = re.compile('^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
    except:
        xml_encoding_match = None
    if xml_encoding_match:
        xml_encoding = xml_encoding_match.groups()[0].lower()
        if sniffed_xml_encoding and (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16')):
            xml_encoding = sniffed_xml_encoding
    acceptable_content_type = 0
    application_content_types = ('application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity')
    text_content_types = ('text/xml', 'text/xml-external-parsed-entity')
    if (http_content_type in application_content_types) or \
       (http_content_type.startswith('application/') and http_content_type.endswith('+xml')):
        acceptable_content_type = 1
        true_encoding = http_encoding or xml_encoding or 'utf-8'
    elif (http_content_type in text_content_types) or \
         (http_content_type.startswith('text/') and http_content_type.endswith('+xml')):
        acceptable_content_type = 1
        true_encoding = http_encoding or 'us-ascii'
    elif http_content_type.startswith('text/'):
        true_encoding = http_encoding or 'us-ascii'
    elif http_headers and (not http_headers.has_key('content-type')):
        true_encoding = xml_encoding or 'iso-8859-1'
    else:
        true_encoding = xml_encoding or 'utf-8'
    return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type
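# Worked example of the RFC 3023 precedence rules above (illustrative values):
#
#     _getCharacterEncoding({'content-type': 'application/atom+xml; charset=iso-8859-1'},
#                           '<?xml version="1.0" encoding="utf-8"?>...')
#     # -> true encoding is 'iso-8859-1': for application/*+xml the HTTP
#     #    charset parameter outranks the XML declaration
#
#     _getCharacterEncoding({'content-type': 'text/xml'},
#                           '<?xml version="1.0" encoding="utf-8"?>...')
#     # -> true encoding is 'us-ascii': for text/xml without a charset
#     #    parameter the XML declaration is ignored entirely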
def _toUTF8(data, encoding):
    '''Changes an XML data stream on the fly to specify a new encoding

    data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already
    encoding is a string recognized by encodings.aliases
    '''
    if _debug: sys.stderr.write('entering _toUTF8, trying encoding %s\n' % encoding)
    # strip Byte Order Mark (if present)
    if (len(data) >= 4) and (data[:2] == '\xfe\xff') and (data[2:4] != '\x00\x00'):
        if _debug:
            sys.stderr.write('stripping BOM\n')
            if encoding != 'utf-16be':
                sys.stderr.write('trying utf-16be instead\n')
        encoding = 'utf-16be'
        data = data[2:]
    elif (len(data) >= 4) and (data[:2] == '\xff\xfe') and (data[2:4] != '\x00\x00'):
        if _debug:
            sys.stderr.write('stripping BOM\n')
            if encoding != 'utf-16le':
                sys.stderr.write('trying utf-16le instead\n')
        encoding = 'utf-16le'
        data = data[2:]
    elif data[:3] == '\xef\xbb\xbf':
        if _debug:
            sys.stderr.write('stripping BOM\n')
            if encoding != 'utf-8':
                sys.stderr.write('trying utf-8 instead\n')
        encoding = 'utf-8'
        data = data[3:]
    elif data[:4] == '\x00\x00\xfe\xff':
        if _debug:
            sys.stderr.write('stripping BOM\n')
            if encoding != 'utf-32be':
                sys.stderr.write('trying utf-32be instead\n')
        encoding = 'utf-32be'
        data = data[4:]
    elif data[:4] == '\xff\xfe\x00\x00':
        if _debug:
            sys.stderr.write('stripping BOM\n')
            if encoding != 'utf-32le':
                sys.stderr.write('trying utf-32le instead\n')
        encoding = 'utf-32le'
        data = data[4:]
    newdata = unicode(data, encoding)
    if _debug: sys.stderr.write('successfully converted %s data to unicode\n' % encoding)
    declmatch = re.compile('^<\?xml[^>]*?>')
    newdecl = '''<?xml version='1.0' encoding='utf-8'?>'''
    if declmatch.search(newdata):
        newdata = declmatch.sub(newdecl, newdata)
    else:
        newdata = newdecl + u'\n' + newdata
    return newdata.encode('utf-8')
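# Example (an illustrative sketch): a UTF-16LE document with a BOM comes out
# as UTF-8 with a rewritten declaration -- the BOM overrides the caller's
# encoding hint:
#
#     data = '\xff\xfe' + '<?xml version="1.0" encoding="utf-16"?><feed/>'.encode('utf-16le')
#     _toUTF8(data, 'utf-16')
#     # -> "<?xml version='1.0' encoding='utf-8'?><feed/>" as a UTF-8 byte string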
def _stripDoctype(data):
    '''Strips DOCTYPE from XML document, returns (rss_version, stripped_data)

    rss_version may be 'rss091n' or None
    stripped_data is the same XML document, minus the DOCTYPE
    '''
    entity_pattern = re.compile(r'<!ENTITY([^>]*?)>', re.MULTILINE)
    data = entity_pattern.sub('', data)
    doctype_pattern = re.compile(r'<!DOCTYPE([^>]*?)>', re.MULTILINE)
    doctype_results = doctype_pattern.findall(data)
    doctype = doctype_results and doctype_results[0] or ''
    if doctype.lower().count('netscape'):
        version = 'rss091n'
    else:
        version = None
    data = doctype_pattern.sub('', data)
    return version, data
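# Example (illustrative): a Netscape RSS 0.91 DOCTYPE both identifies the
# version and gets stripped:
#
#     _stripDoctype('<!DOCTYPE rss PUBLIC "-//Netscape Communications//DTD RSS 0.91//EN" '
#                   '"http://my.netscape.com/publish/formats/rss-0.91.dtd"><rss version="0.91"/>')
#     # -> ('rss091n', '<rss version="0.91"/>')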
def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=[]):
    '''Parse a feed from a URL, file, stream, or string'''
    result = FeedParserDict()
    result['feed'] = FeedParserDict()
    result['entries'] = []
    if _XML_AVAILABLE:
        result['bozo'] = 0
    if type(handlers) == types.InstanceType:
        handlers = [handlers]
    try:
        f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers)
        data = f.read()
    except Exception, e:
        result['bozo'] = 1
        result['bozo_exception'] = e
        data = ''
        f = None

    # if feed is gzip-compressed, decompress it
    if f and data and hasattr(f, 'headers'):
        if gzip and f.headers.get('content-encoding', '') == 'gzip':
            try:
                data = gzip.GzipFile(fileobj=_StringIO(data)).read()
            except Exception, e:
                # Some feeds claim to be gzipped but they're not, so
                # we get garbage.  Ideally, we should re-request the
                # feed without the 'Accept-encoding: gzip' header,
                # but we don't.
                result['bozo'] = 1
                result['bozo_exception'] = e
                data = ''
        elif zlib and f.headers.get('content-encoding', '') == 'deflate':
            try:
                data = zlib.decompress(data, -zlib.MAX_WBITS)
            except Exception, e:
                result['bozo'] = 1
                result['bozo_exception'] = e
                data = ''

    # save HTTP headers
    if hasattr(f, 'info'):
        info = f.info()
        result['etag'] = info.getheader('ETag')
        last_modified = info.getheader('Last-Modified')
        if last_modified:
            result['modified'] = _parse_date(last_modified)
    if hasattr(f, 'url'):
        result['href'] = f.url
        result['status'] = 200
    if hasattr(f, 'status'):
        result['status'] = f.status
    if hasattr(f, 'headers'):
        result['headers'] = f.headers.dict
    if hasattr(f, 'close'):
        f.close()

    # there are four encodings to keep track of:
    # - http_encoding is the encoding declared in the Content-Type HTTP header
    # - xml_encoding is the encoding declared in the <?xml declaration
    # - sniffed_encoding is the encoding sniffed from the first 4 bytes of the XML data
    # - result['encoding'] is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
    http_headers = result.get('headers', {})
    result['encoding'], http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type = \
        _getCharacterEncoding(http_headers, data)
    if http_headers and (not acceptable_content_type):
        if http_headers.has_key('content-type'):
            bozo_message = '%s is not an XML media type' % http_headers['content-type']
        else:
            bozo_message = 'no Content-type specified'
        result['bozo'] = 1
        result['bozo_exception'] = NonXMLContentType(bozo_message)

    result['version'], data = _stripDoctype(data)

    baseuri = http_headers.get('content-location', result.get('href'))
    baselang = http_headers.get('content-language', None)

    # if server sent 304, we're done
    if result.get('status', 0) == 304:
        result['version'] = ''
        result['debug_message'] = 'The feed has not changed since you last checked, ' + \
            'so the server sent no data.  This is a feature, not a bug!'
        return result

    # if there was a problem downloading, we're done
    if not data:
        return result

    # determine character encoding
    use_strict_parser = 0
    known_encoding = 0
    tried_encodings = []
    # try: HTTP encoding, declared XML encoding, encoding sniffed from BOM
    for proposed_encoding in (result['encoding'], xml_encoding, sniffed_xml_encoding):
        if not proposed_encoding: continue
        if proposed_encoding in tried_encodings: continue
        tried_encodings.append(proposed_encoding)
        try:
            data = _toUTF8(data, proposed_encoding)
            known_encoding = use_strict_parser = 1
            break
        except:
            pass
    # if no luck and we have auto-detection library, try that
    if (not known_encoding) and chardet:
        try:
            proposed_encoding = chardet.detect(data)['encoding']
            if proposed_encoding and (proposed_encoding not in tried_encodings):
                tried_encodings.append(proposed_encoding)
                data = _toUTF8(data, proposed_encoding)
                known_encoding = use_strict_parser = 1
        except:
            pass
    # if still no luck and we haven't tried utf-8 yet, try that
    if (not known_encoding) and ('utf-8' not in tried_encodings):
        try:
            proposed_encoding = 'utf-8'
            tried_encodings.append(proposed_encoding)
            data = _toUTF8(data, proposed_encoding)
            known_encoding = use_strict_parser = 1
        except:
            pass
    # if still no luck and we haven't tried windows-1252 yet, try that
    if (not known_encoding) and ('windows-1252' not in tried_encodings):
        try:
            proposed_encoding = 'windows-1252'
            tried_encodings.append(proposed_encoding)
            data = _toUTF8(data, proposed_encoding)
            known_encoding = use_strict_parser = 1
        except:
            pass
    # if still no luck, give up
    if not known_encoding:
        result['bozo'] = 1
        result['bozo_exception'] = CharacterEncodingUnknown( \
            'document encoding unknown, I tried ' + \
            '%s, %s, utf-8, and windows-1252 but nothing worked' % \
            (result['encoding'], xml_encoding))
        result['encoding'] = ''
    elif proposed_encoding != result['encoding']:
        result['bozo'] = 1
        result['bozo_exception'] = CharacterEncodingOverride( \
            'document declared as %s, but parsed as %s' % \
            (result['encoding'], proposed_encoding))
        result['encoding'] = proposed_encoding

    if not _XML_AVAILABLE:
        use_strict_parser = 0
    if use_strict_parser:
        # initialize the SAX parser
        feedparser = _StrictFeedParser(baseuri, baselang, 'utf-8')
        saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS)
        saxparser.setFeature(xml.sax.handler.feature_namespaces, 1)
        saxparser.setContentHandler(feedparser)
        saxparser.setErrorHandler(feedparser)
        source = xml.sax.xmlreader.InputSource()
        source.setByteStream(_StringIO(data))
        if hasattr(saxparser, '_ns_stack'):
            # work around bug in built-in SAX parser (doesn't recognize xml: namespace)
            # PyXML doesn't have this problem, and it doesn't have _ns_stack either
            saxparser._ns_stack.append({'http://www.w3.org/XML/1998/namespace':'xml'})
        try:
            saxparser.parse(source)
        except Exception, e:
            if _debug:
                import traceback
                traceback.print_stack()
                traceback.print_exc()
                sys.stderr.write('xml parsing failed\n')
            result['bozo'] = 1
            result['bozo_exception'] = feedparser.exc or e
            use_strict_parser = 0
    if not use_strict_parser:
        feedparser = _LooseFeedParser(baseuri, baselang, known_encoding and 'utf-8' or '')
        feedparser.feed(data)
    result['feed'] = feedparser.feeddata
    result['entries'] = feedparser.entries
    result['version'] = result['version'] or feedparser.version
    result['namespaces'] = feedparser.namespacesInUse
    return result
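# Example (an illustrative sketch) of typical conditional-GET usage:
#
#     d = parse('http://feedparser.org/docs/examples/atom10.xml')
#     print d.feed.title, d.get('status'), d.version
#     # pass etag/modified back on the next poll; a 304 means nothing changed
#     d2 = parse('http://feedparser.org/docs/examples/atom10.xml',
#                etag=d.get('etag'), modified=d.get('modified'))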
if __name__ == '__main__':
    if not sys.argv[1:]:
        print __doc__
        sys.exit(0)
    else:
        urls = sys.argv[1:]
    zopeCompatibilityHack()
    from pprint import pprint
    for url in urls:
        print url
        print
        result = parse(url)
        pprint(result)
        print

#REVISION HISTORY
#1.0 - 9/27/2002 - MAP - fixed namespace processing on prefixed RSS 2.0 elements,
#  added Simon Fell's test suite
#1.1 - 9/29/2002 - MAP - fixed infinite loop on incomplete CDATA sections
#2.0 - 10/19/2002
#  JD - use inchannel to watch out for image and textinput elements which can
#  also contain title, link, and description elements
#  JD - check for isPermaLink='false' attribute on guid elements
#  JD - replaced openAnything with open_resource supporting ETag and
#  If-Modified-Since request headers
#  JD - parse now accepts etag, modified, agent, and referrer optional
#  arguments
#  JD - modified parse to return a dictionary instead of a tuple so that any
#  etag or modified information can be returned and cached by the caller
#2.0.1 - 10/21/2002 - MAP - changed parse() so that if we don't get anything
#  because of etag/modified, return the old etag/modified to the caller to
#  indicate why nothing is being returned
#2.0.2 - 10/21/2002 - JB - added the inchannel to the if statement, otherwise it's
#  useless.  Fixes the problem JD was addressing by adding it.
#2.1 - 11/14/2002 - MAP - added gzip support
#2.2 - 1/27/2003 - MAP - added attribute support, admin:generatorAgent.
#  start_admingeneratoragent is an example of how to handle elements with
#  only attributes, no content.
#2.3 - 6/11/2003 - MAP - added USER_AGENT for default (if caller doesn't specify);
#  also, make sure we send the User-Agent even if urllib2 isn't available.
#  Match any variation of backend.userland.com/rss namespace.
#2.3.1 - 6/12/2003 - MAP - if item has both link and guid, return both as-is.
#2.4 - 7/9/2003 - MAP - added preliminary Pie/Atom/Echo support based on Sam Ruby's
#  snapshot of July 1 <http://www.intertwingly.net/blog/1506.html>; changed
#  project name
#2.5 - 7/25/2003 - MAP - changed to Python license (all contributors agree);
#  removed unnecessary urllib code -- urllib2 should always be available anyway;
#  return actual url, status, and full HTTP headers (as result['url'],
#  result['status'], and result['headers']) if parsing a remote feed over HTTP --
#  this should pass all the HTTP tests at <http://diveintomark.org/tests/client/http/>;
#  added the latest namespace-of-the-week for RSS 2.0
#2.5.1 - 7/26/2003 - RMK - clear opener.addheaders so we only send our custom
#  User-Agent (otherwise urllib2 sends two, which confuses some servers)
#2.5.2 - 7/28/2003 - MAP - entity-decode inline xml properly; added support for
#  inline <xhtml:body> and <xhtml:div> as used in some RSS 2.0 feeds
#2.5.3 - 8/6/2003 - TvdV - patch to track whether we're inside an image or
#  textInput, and also to return the character encoding (if specified)
#2.6 - 1/1/2004 - MAP - dc:author support (MarekK); fixed bug tracking
#  nested divs within content (JohnD); fixed missing sys import (JohanS);
#  fixed regular expression to capture XML character encoding (Andrei);
#  added support for Atom 0.3-style links; fixed bug with textInput tracking;
#  added support for cloud (MartijnP); added support for multiple
#  category/dc:subject (MartijnP); normalize content model: 'description' gets
#  description (which can come from description, summary, or full content if no
#  description), 'content' gets dict of base/language/type/value (which can come
#  from content:encoded, xhtml:body, content, or fullitem);
#  fixed bug matching arbitrary Userland namespaces; added xml:base and xml:lang
#  tracking; fixed bug tracking unknown tags; fixed bug tracking content when
#  <content> element is not in default namespace (like Pocketsoap feed);
#  resolve relative URLs in link, guid, docs, url, comments, wfw:comment,
#  wfw:commentRSS; resolve relative URLs within embedded HTML markup in
#  description, xhtml:body, content, content:encoded, title, subtitle,
#  summary, info, tagline, and copyright; added support for pingback and
#  trackback namespaces
#2.7 - 1/5/2004 - MAP - really added support for trackback and pingback
#  namespaces, as opposed to 2.6 when I said I did but didn't really;
#  sanitize HTML markup within some elements; added mxTidy support (if
#  installed) to tidy HTML markup within some elements; fixed indentation
#  bug in _parse_date (FazalM); use socket.setdefaulttimeout if available
#  (FazalM); universal date parsing and normalization (FazalM): 'created',
#  'modified', 'issued' are parsed into 9-tuple date format and stored in
#  'created_parsed', 'modified_parsed', and 'issued_parsed'; 'date' is
#  duplicated in 'modified' and vice-versa; 'date_parsed' is duplicated in
#  'modified_parsed' and vice-versa
#2.7.1 - 1/9/2004 - MAP - fixed bug handling &quot; and &apos;.  fixed memory
#  leak not closing url opener (JohnD); added dc:publisher support (MarekK);
#  added admin:errorReportsTo support (MarekK); Python 2.1 dict support (MarekK)
#2.7.4 - 1/14/2004 - MAP - added workaround for improperly formed <br/> tags in
#  encoded HTML (skadz); fixed unicode handling in normalize_attrs (ChrisL);
#  fixed relative URI processing for guid (skadz); added ICBM support; added
#  base64 support
#2.7.5 - 1/15/2004 - MAP - added workaround for malformed DOCTYPE (seen on many
#  blogspot.com sites); added _debug variable
#2.7.6 - 1/16/2004 - MAP - fixed bug with StringIO importing
#3.0b3 - 1/23/2004 - MAP - parse entire feed with real XML parser (if available);
#  added several new supported namespaces; fixed bug tracking naked markup in
#  description; added support for enclosure; added support for source; re-added
#  support for cloud which got dropped somehow; added support for expirationDate
#3.0b4 - 1/26/2004 - MAP - fixed xml:lang inheritance; fixed multiple bugs tracking
#  xml:base URI, one for documents that don't define one explicitly and one for
#  documents that define an outer and an inner xml:base that goes out of scope
#  before the end of the document
#3.0b5 - 1/26/2004 - MAP - fixed bug parsing multiple links at feed level
#3.0b6 - 1/27/2004 - MAP - added feed type and version detection, result['version']
#  will be one of SUPPORTED_VERSIONS.keys() or empty string if unrecognized;
#  added support for creativeCommons:license and cc:license; added support for
#  full Atom content model in title, tagline, info, copyright, summary; fixed bug
#  with gzip encoding (not always telling server we support it when we do)
#3.0b7 - 1/28/2004 - MAP - support Atom-style author element in author_detail
#  (dictionary of 'name', 'url', 'email'); map author to author_detail if author
#  contains name + email address
#3.0b8 - 1/28/2004 - MAP - added support for contributor
#3.0b9 - 1/29/2004 - MAP - fixed check for presence of dict function; added
#  support for summary
#3.0b10 - 1/31/2004 - MAP - incorporated ISO-8601 date parsing routines from
#  xml.util.iso8601
#3.0b11 - 2/2/2004 - MAP - added 'rights' to list of elements that can contain
#  dangerous markup; fiddled with decodeEntities (not right); liberalized
#  date parsing even further
#3.0b12 - 2/6/2004 - MAP - fiddled with decodeEntities (still not right);
#  added support to Atom 0.2 subtitle; added support for Atom content model
#  in copyright; better sanitizing of dangerous HTML elements with end tags
#  (script, frameset)
#3.0b13 - 2/8/2004 - MAP - better handling of empty HTML tags (br, hr, img,
#  etc.) in embedded markup, in either HTML or XHTML form (<br>, <br/>, <br />)
#3.0b14 - 2/8/2004 - MAP - fixed CDATA handling in non-wellformed feeds under
#  Python 2.1
#3.0b15 - 2/11/2004 - MAP - fixed bug resolving relative links in wfw:commentRSS;
#  fixed bug capturing author and contributor URL; fixed bug resolving relative
#  links in author and contributor URL; fixed bug resolving relative links in
#  generator URL; added support for recognizing RSS 1.0; passed Simon Fell's
#  namespace tests, and included them permanently in the test suite with his
#  permission; fixed namespace handling under Python 2.1
#3.0b16 - 2/12/2004 - MAP - fixed support for RSS 0.90 (broken in b15)
#3.0b17 - 2/13/2004 - MAP - determine character encoding as per RFC 3023
#3.0b18 - 2/17/2004 - MAP - always map description to summary_detail (Andrei);
#  use libxml2 (if available)
#3.0b19 - 3/15/2004 - MAP - fixed bug exploding author information when author
#  name was in parentheses; removed ultra-problematic mxTidy support; patch to
#  workaround crash in PyXML/expat when encountering invalid entities
#  (MarkMoraes); support for textinput/textInput
#3.0b20 - 4/7/2004 - MAP - added CDF support
#3.0b21 - 4/14/2004 - MAP - added Hot RSS support
#3.0b22 - 4/19/2004 - MAP - changed 'channel' to 'feed', 'item' to 'entries' in
#  results dict; changed results dict to allow getting values with results.key
#  as well as results[key]; work around embedded illformed HTML with half
#  a DOCTYPE; work around malformed Content-Type header; if character encoding
#  is wrong, try several common ones before falling back to regexes (if this
#  works, bozo_exception is set to CharacterEncodingOverride); fixed character
#  encoding issues in BaseHTMLProcessor by tracking encoding and converting
#  from Unicode to raw strings before feeding data to sgmllib.SGMLParser;
#  convert each value in results to Unicode (if possible), even if using
#  regex-based parsing
#3.0b23 - 4/21/2004 - MAP - fixed UnicodeDecodeError for feeds that contain
#  high-bit characters in attributes in embedded HTML in description (thanks
#  Thijs van de Vossen); moved guid, date, and date_parsed to mapped keys in
#  FeedParserDict; tweaked FeedParserDict.has_key to return True if asking
#  about a mapped key
#3.0fc1 - 4/23/2004 - MAP - made results.entries[0].links[0] and
#  results.entries[0].enclosures[0] into FeedParserDict; fixed typo that could
#  cause the same encoding to be tried twice (even if it failed the first time);
#  fixed DOCTYPE stripping when DOCTYPE contained entity declarations;
#  better textinput and image tracking in illformed RSS 1.0 feeds
#3.0fc2 - 5/10/2004 - MAP - added and passed Sam's amp tests; added and passed
#  my blink tag tests
#3.0fc3 - 6/18/2004 - MAP - fixed bug in _changeEncodingDeclaration that
#  failed to parse utf-16 encoded feeds; made source into a FeedParserDict;
#  duplicate admin:generatorAgent/@rdf:resource in generator_detail.url;
#  added support for image; refactored parse() fallback logic to try other
#  encodings if SAX parsing fails (previously it would only try other encodings
#  if re-encoding failed); remove unichr madness in normalize_attrs now that
#  we're properly tracking encoding in and out of BaseHTMLProcessor; set
#  feed.language from root-level xml:lang; set entry.id from rdf:about;
#  send Accept header
#3.0 - 6/21/2004 - MAP - don't try iso-8859-1 (can't distinguish between
#  iso-8859-1 and windows-1252 anyway, and most incorrectly marked feeds are
#  windows-1252); fixed regression that could cause the same encoding to be
#  tried twice (even if it failed the first time)
#3.0.1 - 6/22/2004 - MAP - default to us-ascii for all text/* content types;
#  recover from malformed content-type header parameter with no equals sign
#  ('text/xml; charset:iso-8859-1')
#3.1 - 6/28/2004 - MAP - added and passed tests for converting HTML entities
#  to Unicode equivalents in illformed feeds (aaronsw); added and
#  passed tests for converting character entities to Unicode equivalents
#  in illformed feeds (aaronsw); test for valid parsers when setting
#  XML_AVAILABLE; make version and encoding available when server returns
#  a 304; add handlers parameter to pass arbitrary urllib2 handlers (like
#  digest auth or proxy support); add code to parse username/password
#  out of url and send as basic authentication; expose downloading-related
#  exceptions in bozo_exception (aaronsw); added __contains__ method to
#  FeedParserDict (aaronsw); added publisher_detail (aaronsw)
#3.2 - 7/3/2004 - MAP - use cjkcodecs and iconv_codec if available; always
#  convert feed to UTF-8 before passing to XML parser; completely revamped
#  logic for determining character encoding and attempting XML parsing
#  (much faster); increased default timeout to 20 seconds; test for presence
#  of Location header on redirects; added tests for many alternate character
#  encodings; support various EBCDIC encodings; support UTF-16BE and
#  UTF-16LE with or without a BOM; support UTF-8 with a BOM; support
#  UTF-32BE and UTF-32LE with or without a BOM; fixed crashing bug if no
#  XML parsers are available; added support for 'Content-encoding: deflate';
#  send blank 'Accept-encoding: ' header if neither gzip nor zlib modules
#  are available
#3.3 - 7/15/2004 - MAP - optimize EBCDIC to ASCII conversion; fix obscure
#  problem tracking xml:base and xml:lang if element declares it, child
#  doesn't, first grandchild redeclares it, and second grandchild doesn't;
#  refactored date parsing; defined public registerDateHandler so callers
#  can add support for additional date formats at runtime; added support
#  for OnBlog, Nate, MSSQL, Greek, and Hungarian dates (ytrewq1); added
#  zopeCompatibilityHack() which turns FeedParserDict into a regular
#  dictionary, required for Zope compatibility, and also makes command-
#  line debugging easier because pprint module formats real dictionaries
#  better than dictionary-like objects; added NonXMLContentType exception,
#  which is stored in bozo_exception when a feed is served with a non-XML
#  media type such as 'text/plain'; respect Content-Language as default
#  language if no xml:lang is present; cloud dict is now FeedParserDict;
#  generator dict is now FeedParserDict; better tracking of xml:lang,
#  including support for xml:lang='' to unset the current language;
#  recognize RSS 1.0 feeds even when RSS 1.0 namespace is not the default
#  namespace; don't overwrite final status on redirects (scenarios:
#  redirecting to a URL that returns 304, redirecting to a URL that
#  redirects to another URL with a different type of redirect); add
#  support for HTTP 303 redirects
#4.0 - MAP - support for relative URIs in xml:base attribute; fixed
#  encoding issue with mxTidy (phopkins); preliminary support for RFC 3229;
#  support for Atom 1.0; support for iTunes extensions; new 'tags' for
#  categories/keywords/etc. as array of dict
#  {'term': term, 'scheme': scheme, 'label': label} to match Atom 1.0
#  terminology; parse RFC 822-style dates with no time; lots of other
#  bug fixes
#4.1 - MAP - removed socket timeout; added support for chardet library