~ubuntu-branches/ubuntu/natty/miro/natty

emailmatch = re.search(r'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))''', author)

1078

if not emailmatch: return

1079

email = emailmatch.group(0)

1080

# probably a better way to do the following, but it passes all the tests

1081

author = author.replace(email, '')

1082

author = author.replace('()', '')

1083

author = author.strip()

1084

if author and (author[0] == '('):

1085

author = author[1:]

1086

if author and (author[-1] == ')'):

1087

author = author[:-1]

1088

author = author.strip()

1089

context.setdefault('%s_detail' % key, FeedParserDict())

1090

context['%s_detail' % key]['name'] = author

1091

context['%s_detail' % key]['email'] = email

1092

1093

def _start_subtitle(self, attrsD):

1094

self.pushContent('subtitle', attrsD, 'text/plain', 1)

1095

_start_tagline = _start_subtitle

1096

_start_itunes_subtitle = _start_subtitle

1097

1098

def _end_subtitle(self):

1099

self.popContent('subtitle')

1100

_end_tagline = _end_subtitle

1101

_end_itunes_subtitle = _end_subtitle

1102

1103

def _start_rights(self, attrsD):

1104

self.pushContent('rights', attrsD, 'text/plain', 1)

1105

_start_dc_rights = _start_rights

1106

_start_copyright = _start_rights

1107

1108

def _end_rights(self):

1109

self.popContent('rights')

1110

_end_dc_rights = _end_rights

1111

_end_copyright = _end_rights

1112

1113

def _start_item(self, attrsD):

1114

self.entries.append(FeedParserDict())

1115

self.push('item', 0)

1116

self.inentry = 1

1117

self.guidislink = 0

1118

id = self._getAttribute(attrsD, 'rdf:about')

1119

if id:

1120

context = self._getContext()

1121

context['id'] = id

1122

self._cdf_common(attrsD)

1123

_start_entry = _start_item

1124

_start_product = _start_item

1125

1126

def _end_item(self):

1127

self.pop('item')

1128

self.inentry = 0

1129

_end_entry = _end_item

1130

1131

def _start_dc_language(self, attrsD):

1132

self.push('language', 1)

1133

_start_language = _start_dc_language

1134

1135

def _end_dc_language(self):

1136

self.lang = self.pop('language')

1137

_end_language = _end_dc_language

1138

1139

def _start_dc_publisher(self, attrsD):

1140

self.push('publisher', 1)

1141

_start_webmaster = _start_dc_publisher

1142

1143

def _end_dc_publisher(self):

1144

self.pop('publisher')

1145

self._sync_author_detail('publisher')

1146

_end_webmaster = _end_dc_publisher

1147

1148

def _start_published(self, attrsD):

1149

self.push('published', 1)

1150

_start_dcterms_issued = _start_published

1151

_start_issued = _start_published

1152

1153

def _end_published(self):

1154

value = self.pop('published')

1155

self._save('published_parsed', _parse_date(value))

1156

_end_dcterms_issued = _end_published

1157

_end_issued = _end_published

1158

1159

def _start_updated(self, attrsD):

1160

self.push('updated', 1)

1161

_start_modified = _start_updated

1162

_start_dcterms_modified = _start_updated

1163

_start_pubdate = _start_updated

1164

_start_dc_date = _start_updated

1165

1166

def _end_updated(self):

1167

value = self.pop('updated')

1168

parsed_value = _parse_date(value)

1169

self._save('updated_parsed', parsed_value)

1170

_end_modified = _end_updated

1171

_end_dcterms_modified = _end_updated

1172

_end_pubdate = _end_updated

1173

_end_dc_date = _end_updated

1174

1175

def _start_created(self, attrsD):

1176

self.push('created', 1)

1177

_start_dcterms_created = _start_created

1178

1179

def _end_created(self):

1180

value = self.pop('created')

1181

self._save('created_parsed', _parse_date(value))

1182

_end_dcterms_created = _end_created

1183

1184

def _start_expirationdate(self, attrsD):

1185

self.push('expired', 1)

1186

1187

def _end_expirationdate(self):

1188

self._save('expired_parsed', _parse_date(self.pop('expired')))

1189

1190

def _start_cc_license(self, attrsD):

1191

self.push('license', 1)

1192

value = self._getAttribute(attrsD, 'rdf:resource')

1193

if value:

1194

self.elementstack[-1][2].append(value)

1195

self.pop('license')

1196

1197

def _start_creativecommons_license(self, attrsD):

1198

self.push('license', 1)

1199

1200

def _end_creativecommons_license(self):

1201

self.pop('license')

1202

1203

def _addTag(self, term, scheme, label):

1204

context = self._getContext()

1205

tags = context.setdefault('tags', [])

1206

if (not term) and (not scheme) and (not label): return

1207

value = FeedParserDict({'term': term, 'scheme': scheme, 'label': label})

1208

if value not in tags:

1209

tags.append(FeedParserDict({'term': term, 'scheme': scheme, 'label': label}))

1210

1211

def _start_category(self, attrsD):

1212

if _debug: sys.stderr.write('entering _start_category with %s\n' % repr(attrsD))

1213

term = attrsD.get('term')

1214

scheme = attrsD.get('scheme', attrsD.get('domain'))

1215

label = attrsD.get('label')

1216

self._addTag(term, scheme, label)

1217

self.push('category', 1)

1218

_start_dc_subject = _start_category

1219

_start_keywords = _start_category

1220

_start_media_category = _start_category

1221

1222

def _end_itunes_keywords(self):

1223

for term in self.pop('itunes_keywords').split():

1224

self._addTag(term, 'http://www.itunes.com/', None)

1225

1226

def _start_itunes_category(self, attrsD):

1227

self._addTag(attrsD.get('text'), 'http://www.itunes.com/', None)

1228

self.push('category', 1)

1229

1230

def _end_category(self):

1231

value = self.pop('category')

1232

if not value: return

1233

context = self._getContext()

1234

tags = context['tags']

1235

if value and len(tags) and not tags[-1]['term']:

1236

tags[-1]['term'] = value

1237

else:

1238

self._addTag(value, None, None)

1239

_end_dc_subject = _end_category

1240

_end_keywords = _end_category

1241

_end_itunes_category = _end_category

1242

_end_media_category = _end_category

1243

1244

def _start_cloud(self, attrsD):

1245

self._getContext()['cloud'] = FeedParserDict(attrsD)

1246

1247

def _start_link(self, attrsD):

1248

attrsD.setdefault('rel', 'alternate')

1249

attrsD.setdefault('type', 'text/html')

1250

attrsD = self._itsAnHrefDamnIt(attrsD)

1251

if attrsD.has_key('href'):

1252

attrsD['href'] = self.resolveURI(attrsD['href'])

1253

expectingText = self.infeed or self.inentry or self.insource

1254

context = self._getContext()

1255

context.setdefault('links', [])

1256

context['links'].append(FeedParserDict(attrsD))

1257

if attrsD['rel'] == 'enclosure':

1258

self._start_enclosure(attrsD)

1259

if attrsD.has_key('href'):

1260

expectingText = 0

1261

if (attrsD.get('rel') == 'alternate') and (self.mapContentType(attrsD.get('type')) in self.html_types):

1262

context['link'] = attrsD['href']

1263

else:

1264

self.push('link', expectingText)

1265

_start_producturl = _start_link

1266

1267

def _end_link(self):

1268

value = self.pop('link')

1269

context = self._getContext()

1270

if self.intextinput:

1271

context['textinput']['link'] = value

1272

if self.inimage:

1273

context['image']['link'] = value

1274

_end_producturl = _end_link

1275

1276

def _start_guid(self, attrsD):

1277

self.guidislink = (attrsD.get('ispermalink', 'true') == 'true')

1278

self.push('id', 1)

1279

1280

def _end_guid(self):

1281

value = self.pop('id')

1282

self._save('guidislink', self.guidislink and not self._getContext().has_key('link'))

1283

if self.guidislink:

1284

# guid acts as link, but only if 'ispermalink' is not present or is 'true',

1285

# and only if the item doesn't already have a link element

1286

self._save('link', value)

1287

1288

def _start_title(self, attrsD):

1289

self.pushContent('title', attrsD, 'text/plain', self.infeed or self.inentry or self.insource)

1290

_start_dc_title = _start_title

1291

_start_media_title = _start_title

1292

1293

def _end_title(self):

1294

value = self.popContent('title')

1295

context = self._getContext()

1296

if self.intextinput:

1297

context['textinput']['title'] = value

1298

elif self.inimage:

1299

context['image']['title'] = value

1300

_end_dc_title = _end_title

1301

_end_media_title = _end_title

1302

1303

def _start_description(self, attrsD):

1304

context = self._getContext()

1305

if context.has_key('summary'):

1306

self._summaryKey = 'content'

1307

self._start_content(attrsD)

1308

else:

1309

self.pushContent('description', attrsD, 'text/html', self.infeed or self.inentry or self.insource)

1310

1311

def _start_abstract(self, attrsD):

1312

self.pushContent('description', attrsD, 'text/plain', self.infeed or self.inentry or self.insource)

1313

1314

def _end_description(self):

1315

if self._summaryKey == 'content':

1316

self._end_content()

1317

else:

1318

value = self.popContent('description')

1319

context = self._getContext()

1320

if self.intextinput:

1321

context['textinput']['description'] = value

1322

elif self.inimage:

1323

context['image']['description'] = value

1324

self._summaryKey = None

1325

_end_abstract = _end_description

1326

1327

def _start_info(self, attrsD):

1328

self.pushContent('info', attrsD, 'text/plain', 1)

1329

_start_feedburner_browserfriendly = _start_info

1330

1331

def _end_info(self):

1332

self.popContent('info')

1333

_end_feedburner_browserfriendly = _end_info

1334

1335

def _start_generator(self, attrsD):

1336

if attrsD:

1337

attrsD = self._itsAnHrefDamnIt(attrsD)

1338

if attrsD.has_key('href'):

1339

attrsD['href'] = self.resolveURI(attrsD['href'])

1340

self._getContext()['generator_detail'] = FeedParserDict(attrsD)

1341

self.push('generator', 1)

1342

1343

def _end_generator(self):

1344

value = self.pop('generator')

1345

context = self._getContext()

1346

if context.has_key('generator_detail'):

1347

context['generator_detail']['name'] = value

1348

1349

def _start_admin_generatoragent(self, attrsD):

1350

self.push('generator', 1)

1351

value = self._getAttribute(attrsD, 'rdf:resource')

1352

if value:

1353

self.elementstack[-1][2].append(value)

1354

self.pop('generator')

1355

self._getContext()['generator_detail'] = FeedParserDict({'href': value})

1356

1357

def _start_admin_errorreportsto(self, attrsD):

1358

self.push('errorreportsto', 1)

1359

value = self._getAttribute(attrsD, 'rdf:resource')

1360

if value:

1361

self.elementstack[-1][2].append(value)

1362

self.pop('errorreportsto')

1363

1364

def _start_summary(self, attrsD):

1365

context = self._getContext()

1366

if context.has_key('summary'):

1367

self._summaryKey = 'content'

1368

self._start_content(attrsD)

1369

else:

1370

self._summaryKey = 'summary'

1371

self.pushContent(self._summaryKey, attrsD, 'text/plain', 1)

1372

_start_itunes_summary = _start_summary

1373

1374

def _end_summary(self):

1375

if self._summaryKey == 'content':

1376

self._end_content()

1377

else:

1378

self.popContent(self._summaryKey or 'summary')

1379

self._summaryKey = None

1380

_end_itunes_summary = _end_summary

1381

1382

def _start_enclosure(self, attrsD):

1383

self.inenclosure += 1

1384

attrsD = self._itsAnHrefDamnIt(attrsD)

1385

self._getContext().setdefault('enclosures', []).append(FeedParserDict(attrsD))

1386

_start_media_content = _start_enclosure

1387

1388

def _end_enclosure(self):

1389

self.inenclosure -= 1

1390

_end_media_content = _end_enclosure

1391

1392

def _start_media_thumbnail(self,attrsD):

1393

self.push('media:thumbnail',1)

1394

if self.inentry:

1395

if self.inenclosure:

1396

self.entries[-1]['enclosures'][-1]['thumbnail']=FeedParserDict(attrsD)

1397

else:

1398

self.entries[-1]['thumbnail'] = FeedParserDict(attrsD)

1399

1400

def _end_media_thumbnail(self):

1401

self.pop('media:thumbnail')

1402

1403

def _start_media_text(self,attrsD):

1404

self.push('media:text',1)

1405

1406

def _end_media_text(self):

1407

value = self.pop('media:text')

1408

if self.inentry:

1409

if self.inenclosure:

1410

self.entries[-1]['enclosures'][-1]['text'] = value

1411

else:

1412

self.entries[-1]['text'] = value

1413

1414

def _start_media_people(self,attrsD):

1415

self.push('media:people',1)

1416

try:

1417

self.peoplerole = attrsD['role']

1418

except (SystemExit, KeyboardInterrupt):

1419

raise

1420

except:

1421

self.peoplerole = 'unknown'

1422

1423

def _end_media_people(self):

1424

value = self.pop('media:people').split('|')

1425

if self.inentry:

1426

if self.inenclosure:

1427

self.entries[-1]['enclosures'][-1].setdefault('roles', {})

1428

self.entries[-1]['enclosures'][-1].roles[self.peoplerole]=value

1429

else:

1430

self.entries[-1].setdefault('roles', {})

1431

self.entries[-1].roles[self.peoplerole]=value

1432

1433

def _start_dtv_startnback(self,attrsD):

1434

self.push('dtv:startnback',1)

1435

1436

def _end_dtv_startnback(self):

1437

self.feeddata['startnback'] = self.pop('dtv:startnback')

1438

1439

def _start_dtv_librarylink(self,attrsD):

1440

self.push('dtv:librarylink',1)

1441

1442

def _end_dtv_librarylink(self):

1443

self.feeddata['librarylink'] = self.pop('dtv:librarylink')

1444

1445

def _start_dtv_releasedate(self,attrsD):

1446

self.push('dtv:releasedate',1)

1447

1448

def _end_dtv_releasedate(self):

1449

value = self.pop('dtv:releasedate')

1450

if self.inentry:

1451

if self.inenclosure:

1452

self.entries[-1]['enclosures'][-1]['releasedate'] = value

1453

self.entries[-1]['enclosures'][-1]['releasedate_parsed'] = _parse_date(value)

1454

else:

1455

self.entries[-1]['releasedate'] = value

1456

self.entries[-1]['releasedate_parsed'] = _parse_date(value)

1457

1458

def _start_dtv_paymentlink(self,attrsD):

1459

self.incontent += 1

1460

self.contentparams['mode'] = 'xml'

1461

self.contentparams['type'] = 'application/xhtml+xml'

1462

self.push('dtv:paymentlink',1)

1463

if self.inentry:

1464

if attrsD.has_key('url'):

1465

if self.inenclosure:

1466

self.entries[-1]['enclosures'][-1]['payment_url'] = attrsD['url']

1467

else:

1468

self.entries[-1]['payment_url'] = attrsD['url']

1469

1470

def _end_dtv_paymentlink(self):

1471

value = sanitizeHTML(self.pop('dtv:paymentlink'),self.encoding)

1472

self.incontent -= 1

1473

self.contentparams.clear()

1474

if self.inentry:

1475

if self.inenclosure:

1476

self.entries[-1]['enclosures'][-1]['payment_html'] = value

1477

else:

1478

self.entries[-1]['payment_html'] = value

1479

1480

def _start_source(self, attrsD):

1481

self.insource = 1

1482

1483

def _end_source(self):

1484

self.insource = 0

1485

self._getContext()['source'] = copy.deepcopy(self.sourcedata)

1486

self.sourcedata.clear()

1487

1488

def _start_content(self, attrsD):

1489

self.pushContent('content', attrsD, 'text/plain', 1)

1490

src = attrsD.get('src')

1491

if src:

1492

self.contentparams['src'] = src

1493

self.push('content', 1)

1494

1495

def _start_prodlink(self, attrsD):

1496

self.pushContent('content', attrsD, 'text/html', 1)

1497

1498

def _start_body(self, attrsD):

1499

self.pushContent('content', attrsD, 'application/xhtml+xml', 1)

1500

_start_xhtml_body = _start_body

1501

1502

def _start_content_encoded(self, attrsD):

1503

self.pushContent('content', attrsD, 'text/html', 1)

1504

_start_fullitem = _start_content_encoded

1505

1506

def _end_content(self):

1507

copyToDescription = self.mapContentType(self.contentparams.get('type')) in (['text/plain'] + self.html_types)

1508

value = self.popContent('content')

1509

if copyToDescription:

1510

self._save('description', value)

1511

_end_body = _end_content

1512

_end_xhtml_body = _end_content

1513

_end_content_encoded = _end_content

1514

_end_fullitem = _end_content

1515

_end_prodlink = _end_content

1516

1517

def _start_itunes_image(self, attrsD):

1518

self.push('itunes_image', 0)

1519

self._getContext()['image'] = FeedParserDict({'href': attrsD.get('href')})

1520

1521

def _start_itunes_link(self, attrsD):

1522

self.push('itunes_link', 0)

1523

self._getContext()['link'] = FeedParserDict({'href': attrsD.get('href')})

1524

1525

def _end_itunes_block(self):

1526

value = self.pop('itunes_block', 0)

1527

self._getContext()['itunes_block'] = (value == 'yes') and 1 or 0

1528

1529

def _end_itunes_explicit(self):

1530

value = self.pop('itunes_explicit', 0)

1531

self._getContext()['itunes_explicit'] = (value == 'yes') and 1 or 0

1532

1533

if _XML_AVAILABLE:

1534

class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler):

1535

def __init__(self, baseuri, baselang, encoding):

1536

if _debug: sys.stderr.write('trying StrictFeedParser\n')

1537

xml.sax.handler.ContentHandler.__init__(self)

1538

_FeedParserMixin.__init__(self, baseuri, baselang, encoding)

1539

self.bozo = 0

1540

self.exc = None

1541

1542

def startPrefixMapping(self, prefix, uri):

1543

self.trackNamespace(prefix, uri)

1544

1545

def startElementNS(self, name, qname, attrs):

1546

namespace, localname = name

1547

lowernamespace = str(namespace or '').lower()

1548

if lowernamespace.find('backend.userland.com/rss') <> -1:

1549

# match any backend.userland.com namespace

1550

namespace = 'http://backend.userland.com/rss'

1551

lowernamespace = namespace

1552

if qname and qname.find(':') > 0:

1553

givenprefix = qname.split(':')[0]

1554

else:

1555

givenprefix = None

1556

prefix = self._matchnamespaces.get(lowernamespace, givenprefix)

1557

if givenprefix and (prefix == None or (prefix == '' and lowernamespace == '')) and not self.namespacesInUse.has_key(givenprefix):

1558

raise UndeclaredNamespace, "'%s' is not associated with a namespace" % givenprefix

1559

if prefix:

1560

localname = prefix + ':' + localname

1561

localname = str(localname).lower()

1562

if _debug: sys.stderr.write('startElementNS: qname = %s, namespace = %s, givenprefix = %s, prefix = %s, attrs = %s, localname = %s\n' % (qname, namespace, givenprefix, prefix, attrs.items(), localname))

1563

1564

# qname implementation is horribly broken in Python 2.1 (it

1565

# doesn't report any), and slightly broken in Python 2.2 (it

1566

# doesn't report the xml: namespace). So we match up namespaces

1567

# with a known list first, and then possibly override them with

1568

# the qnames the SAX parser gives us (if indeed it gives us any

1569

# at all). Thanks to MatejC for helping me test this and

1570

# tirelessly telling me that it didn't work yet.

1571

attrsD = {}

1572

for (namespace, attrlocalname), attrvalue in attrs._attrs.items():

1573

lowernamespace = (namespace or '').lower()

1574

prefix = self._matchnamespaces.get(lowernamespace, '')

1575

if prefix:

1576

attrlocalname = prefix + ':' + attrlocalname

1577

attrsD[str(attrlocalname).lower()] = attrvalue

1578

for qname in attrs.getQNames():

1579

attrsD[str(qname).lower()] = attrs.getValueByQName(qname)

1580

self.unknown_starttag(localname, attrsD.items())

1581

1582

def characters(self, text):

1583

self.handle_data(text)

1584

1585

def endElementNS(self, name, qname):

1586

namespace, localname = name

1587

lowernamespace = str(namespace or '').lower()

1588

if qname and qname.find(':') > 0:

1589

givenprefix = qname.split(':')[0]

1590

else:

1591

givenprefix = ''

1592

prefix = self._matchnamespaces.get(lowernamespace, givenprefix)

1593

if prefix:

1594

localname = prefix + ':' + localname

1595

localname = str(localname).lower()

1596

self.unknown_endtag(localname)

1597

1598

def error(self, exc):

1599

self.bozo = 1

1600

self.exc = exc

1601

1602

def fatalError(self, exc):

1603

self.error(exc)

1604

raise exc

1605

1606

class _BaseHTMLProcessor(sgmllib.SGMLParser):

1607

elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',

1608

'img', 'input', 'isindex', 'link', 'meta', 'param']

1609

1610

def __init__(self, encoding):

1611

self.encoding = encoding

1612

if _debug: sys.stderr.write('entering BaseHTMLProcessor, encoding=%s\n' % self.encoding)

1613

sgmllib.SGMLParser.__init__(self)

1614

1615

def reset(self):

1616

self.pieces = []

1617

sgmllib.SGMLParser.reset(self)

1618

1619

def _shorttag_replace(self, match):

1620

tag = match.group(1)

1621

if tag in self.elements_no_end_tag:

1622

return '<' + tag + ' />'

1623

else:

1624

return '<' + tag + '></' + tag + '>'

1625

1626

def feed(self, data):

1627

data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'<!\1', data)

1628

#data = re.sub(r'<(\S+?)\s*?/>', self._shorttag_replace, data) # bug [ 1399464 ] Bad regexp for _shorttag_replace

1629

data = re.sub(r'<([^<\s]+?)\s*/>', self._shorttag_replace, data)

1630

data = data.replace(''', "'")

1631

data = data.replace('"', '"')

1632

if self.encoding and type(data) == type(u''):

1633

data = data.encode(self.encoding)

1634

sgmllib.SGMLParser.feed(self, data)

1635

1636

def normalize_attrs(self, attrs):

1637

# utility method to be called by descendants

1638

attrs = [(k.lower(), v) for k, v in attrs]

1639

attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]

1640

return attrs

1641

1642

def parse_starttag(self, i):

1643

retval = sgmllib.SGMLParser.parse_starttag(self, i)

1644

try:

1645

if self.get_starttag_text()[-2:] == "/>":

1646

self.finish_endtag(self.lasttag)

1647

except (SystemExit, KeyboardInterrupt):

1648

raise

1649

except:

1650

pass

1651

return retval

1652

1653

def unknown_starttag(self, tag, attrs):

1654

# called for each start tag

1655

# attrs is a list of (attr, value) tuples

1656

# e.g. for <pre class='screen'>, tag='pre', attrs=[('class', 'screen')]

1657

if _debug: sys.stderr.write('_BaseHTMLProcessor, unknown_starttag, tag=%s\n' % tag)

1658

uattrs = []

1659

# thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds

1660

for key, value in attrs:

1661

if type(value) != type(u''):

1662

value = unicode(value, self.encoding)

1663

uattrs.append((unicode(key, self.encoding), value))

1664

strattrs = u''.join([u' %s="%s"' % (key, value) for key, value in uattrs]).encode(self.encoding)

1665

if tag in self.elements_no_end_tag:

1666

self.pieces.append('<%(tag)s%(strattrs)s />' % locals())

1667

else:

1668

self.pieces.append('<%(tag)s%(strattrs)s>' % locals())

1669

1670

def unknown_endtag(self, tag):

1671

# called for each end tag, e.g. for </pre>, tag will be 'pre'

1672

# Reconstruct the original end tag.

1673

if tag not in self.elements_no_end_tag:

1674

self.pieces.append("</%(tag)s>" % locals())

1675

1676

def handle_charref(self, ref):

1677

# called for each character reference, e.g. for ' ', ref will be '160'

1678

# Reconstruct the original character reference.

1679

self.pieces.append('&#%(ref)s;' % locals())

1680

1681

def handle_entityref(self, ref):

1682

# called for each entity reference, e.g. for '©', ref will be 'copy'

1683

# Reconstruct the original entity reference.

1684

self.pieces.append('&%(ref)s;' % locals())

1685

1686

def handle_data(self, text):

1687

# called for each block of plain text, i.e. outside of any tag and

1688

# not containing any character or entity references

1689

# Store the original text verbatim.

1690

if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_text, text=%s\n' % text)

1691

self.pieces.append(text)

1692

1693

def handle_comment(self, text):

1694

# called for each HTML comment, e.g.

1695

# Reconstruct the original comment.

1696

self.pieces.append('' % locals())

1697

1698

def handle_pi(self, text):

1699

# called for each processing instruction, e.g. <?instruction>

1700

# Reconstruct original processing instruction.

1701

self.pieces.append('<?%(text)s>' % locals())

1702

1703

def handle_decl(self, text):

1704

# called for the DOCTYPE, if present, e.g.

1705

# <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"

1706

# "http://www.w3.org/TR/html4/loose.dtd">

1707

# Reconstruct original DOCTYPE

1708

self.pieces.append('<!%(text)s>' % locals())

1709

1710

_new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match

1711

def _scan_name(self, i, declstartpos):

1712

rawdata = self.rawdata

1713

n = len(rawdata)

1714

if i == n:

1715

return None, -1

1716

m = self._new_declname_match(rawdata, i)

1717

if m:

1718

s = m.group()

1719

name = s.strip()

1720

if (i + len(s)) == n:

1721

return None, -1 # end of buffer

1722

return name.lower(), m.end()

1723

else:

1724

self.handle_data(rawdata)

1725

# self.updatepos(declstartpos, i)

1726

return None, -1

1727

1728

def output(self):

1729

'''Return processed HTML as a single string'''

1730

return ''.join([str(p) for p in self.pieces])

1731

1732

class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor):

1733

def __init__(self, baseuri, baselang, encoding):

1734

sgmllib.SGMLParser.__init__(self)

1735

_FeedParserMixin.__init__(self, baseuri, baselang, encoding)

1736

1737

def decodeEntities(self, element, data):

1738

data = data.replace('<', '<')

1739

data = data.replace('<', '<')

1740

data = data.replace('>', '>')

1741

data = data.replace('>', '>')

1742

data = data.replace('&', '&')

1743

data = data.replace('&', '&')

1744

data = data.replace('"', '"')

1745

data = data.replace('"', '"')

1746

data = data.replace(''', ''')

1747

data = data.replace(''', ''')

1748

if self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):

1749

data = data.replace('<', '<')

1750

data = data.replace('>', '>')

1751

data = data.replace('&', '&')

1752

data = data.replace('"', '"')

1753

data = data.replace(''', "'")

1754

return data

1755

1756

class _RelativeURIResolver(_BaseHTMLProcessor):

1757

relative_uris = [('a', 'href'),

1758

('applet', 'codebase'),

1759

('area', 'href'),

1760

('blockquote', 'cite'),

1761

('body', 'background'),

1762

('del', 'cite'),

1763

('form', 'action'),

1764

('frame', 'longdesc'),

1765

('frame', 'src'),

1766

('iframe', 'longdesc'),

1767

('iframe', 'src'),

1768

('head', 'profile'),

1769

('img', 'longdesc'),

1770

('img', 'src'),

1771

('img', 'usemap'),

1772

('input', 'src'),

1773

('input', 'usemap'),

1774

('ins', 'cite'),

1775

('link', 'href'),

1776

('object', 'classid'),

1777

('object', 'codebase'),

1778

('object', 'data'),

1779

('object', 'usemap'),

1780

('q', 'cite'),

1781

('script', 'src')]

1782

1783

def __init__(self, baseuri, encoding):

1784

_BaseHTMLProcessor.__init__(self, encoding)

1785

self.baseuri = baseuri

1786

1787

def resolveURI(self, uri):

1788

return _urljoin(self.baseuri, uri)

1789

1790

def unknown_starttag(self, tag, attrs):

1791

attrs = self.normalize_attrs(attrs)

1792

attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs]

1793

_BaseHTMLProcessor.unknown_starttag(self, tag, attrs)

1794

1795

def _resolveRelativeURIs(htmlSource, baseURI, encoding):

1796

if _debug: sys.stderr.write('entering _resolveRelativeURIs\n')

1797

p = _RelativeURIResolver(baseURI, encoding)

1798

p.feed(htmlSource)

1799

return p.output()

1800

1801

class _HTMLSanitizer(_BaseHTMLProcessor):

1802

acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', 'big',

1803

'blockquote', 'br', 'button', 'caption', 'center', 'cite', 'code', 'col',

1804

'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'fieldset',

1805

'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input',

1806

'ins', 'kbd', 'label', 'legend', 'li', 'map', 'menu', 'ol', 'optgroup',

1807

'option', 'p', 'pre', 'q', 's', 'samp', 'select', 'small', 'span', 'strike',

1808

'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'tfoot', 'th',

1809

'thead', 'tr', 'tt', 'u', 'ul', 'var']

1810

1811

acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',

1812

'action', 'align', 'alt', 'axis', 'border', 'cellpadding', 'cellspacing',

1813

'char', 'charoff', 'charset', 'checked', 'cite', 'class', 'clear', 'cols',

1814

'colspan', 'color', 'compact', 'coords', 'datetime', 'dir', 'disabled',

1815

'enctype', 'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace',

1816

'id', 'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method',

1817

'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly',

1818

'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',

1819

'span', 'src', 'start', 'summary', 'tabindex', 'title', 'type',

1820

'usemap', 'valign', 'value', 'vspace', 'width']

1821

1822

unacceptable_elements_with_end_tag = ['script', 'applet']

1823

1824

def reset(self):

1825

_BaseHTMLProcessor.reset(self)

1826

self.unacceptablestack = 0

1827

1828

def unknown_starttag(self, tag, attrs):

1829

if not tag in self.acceptable_elements:

1830

if tag in self.unacceptable_elements_with_end_tag:

1831

self.unacceptablestack += 1

1832

return

1833

attrs = self.normalize_attrs(attrs)

1834

attrs = [(key, value) for key, value in attrs if key in self.acceptable_attributes]

1835

_BaseHTMLProcessor.unknown_starttag(self, tag, attrs)

1836

1837

def unknown_endtag(self, tag):

1838

if not tag in self.acceptable_elements:

1839

if tag in self.unacceptable_elements_with_end_tag:

1840

self.unacceptablestack -= 1

1841

return

1842

_BaseHTMLProcessor.unknown_endtag(self, tag)

1843

1844

def handle_pi(self, text):

1845

pass

1846

1847

def handle_decl(self, text):

1848

pass

1849

1850

def handle_data(self, text):

1851

if not self.unacceptablestack:

1852

_BaseHTMLProcessor.handle_data(self, text)

1853

1854

def sanitizeHTML(htmlSource, encoding):

1855

p = _HTMLSanitizer(encoding)

1856

p.feed(htmlSource)

1857

data = p.output()

1858

if TIDY_MARKUP:

1859

# loop through list of preferred Tidy interfaces looking for one that's installed,

1860

# then set up a common _tidy function to wrap the interface-specific API.

1861

_tidy = None

1862

for tidy_interface in PREFERRED_TIDY_INTERFACES:

1863

try:

1864

if tidy_interface == "uTidy":

1865

from tidy import parseString as _utidy

1866

def _tidy(data, **kwargs):

1867

return str(_utidy(data, **kwargs))

1868

break

1869

elif tidy_interface == "mxTidy":

1870

from mx.Tidy import Tidy as _mxtidy

1871

def _tidy(data, **kwargs):

1872

nerrors, nwarnings, data, errordata = _mxtidy.tidy(data, **kwargs)

1873

return data

1874

break

1875

except (SystemExit, KeyboardInterrupt):

1876

raise

1877

except:

1878

pass

1879

if _tidy:

1880

utf8 = type(data) == type(u'')

1881

if utf8:

1882

data = data.encode('utf-8')

1883

data = _tidy(data, output_xhtml=1, numeric_entities=1, wrap=0, char_encoding="utf8")

1884

if utf8:

1885

data = unicode(data, 'utf-8')

1886

if data.count('<body'):

1887

data = data.split('<body', 1)[1]

1888

if data.count('>'):

1889

data = data.split('>', 1)[1]

1890

if data.count('</body'):

1891

data = data.split('</body', 1)[0]

1892

data = data.strip().replace('\r\n', '\n')

1893

return data

1894

1895

class _FeedURLHandler(urllib2.HTTPDigestAuthHandler, urllib2.HTTPRedirectHandler, urllib2.HTTPDefaultErrorHandler):

1896

def http_error_default(self, req, fp, code, msg, headers):

1897

if ((code / 100) == 3) and (code != 304):

1898

return self.http_error_302(req, fp, code, msg, headers)

1899

infourl = urllib.addinfourl(fp, headers, req.get_full_url())

1900

infourl.status = code

1901

return infourl

1902

1903

def http_error_302(self, req, fp, code, msg, headers):

1904

if headers.dict.has_key('location'):

1905

infourl = urllib2.HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers)

1906

else:

1907

infourl = urllib.addinfourl(fp, headers, req.get_full_url())

1908

if not hasattr(infourl, 'status'):

1909

infourl.status = code

1910

return infourl

1911

1912

def http_error_301(self, req, fp, code, msg, headers):

1913

if headers.dict.has_key('location'):

1914

infourl = urllib2.HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, headers)

1915

else:

1916

infourl = urllib.addinfourl(fp, headers, req.get_full_url())

1917

if not hasattr(infourl, 'status'):

1918

infourl.status = code

1919

return infourl

1920

1921

http_error_300 = http_error_302

1922

http_error_303 = http_error_302

1923

http_error_307 = http_error_302

1924

1925

def http_error_401(self, req, fp, code, msg, headers):

1926

# Check if

1927

# - server requires digest auth, AND

1928

# - we tried (unsuccessfully) with basic auth, AND

1929

# - we're using Python 2.3.3 or later (digest auth is irreparably broken in earlier versions)

1930

# If all conditions hold, parse authentication information

1931

# out of the Authorization header we sent the first time

1932

# (for the username and password) and the WWW-Authenticate

1933

# header the server sent back (for the realm) and retry

1934

# the request with the appropriate digest auth headers instead.

1935

# This evil genius hack has been brought to you by Aaron Swartz.

1936

host = urlparse.urlparse(req.get_full_url())[1]

1937

try:

1938

assert sys.version.split()[0] >= '2.3.3'

1939

assert base64 != None

1940

user, passw = base64.decodestring(req.headers['Authorization'].split(' ')[1]).split(':')

1941

realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0]

1942

self.add_password(realm, host, user, passw)

1943

retry = self.http_error_auth_reqed('www-authenticate', host, req, headers)

1944

self.reset_retry_count()

1945

return retry

1946

except (SystemExit, KeyboardInterrupt):

1947

raise

1948

except:

1949

return self.http_error_default(req, fp, code, msg, headers)

1950

1951

def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers):

1952

"""URL, filename, or string --> stream

1953

1954

This function lets you define parsers that take any input source

1955

(URL, pathname to local or network file, or actual data as a string)

1956

and deal with it in a uniform manner. Returned object is guaranteed

1957

to have all the basic stdio read methods (read, readline, readlines).

1958

Just .close() the object when you're done with it.

1959

1960

If the etag argument is supplied, it will be used as the value of an

1961

If-None-Match request header.

1962

1963

If the modified argument is supplied, it must be a tuple of 9 integers

1964

as returned by gmtime() in the standard Python time module. This MUST

1965

be in GMT (Greenwich Mean Time). The formatted date/time will be used

1966

as the value of an If-Modified-Since request header.

1967

1968

If the agent argument is supplied, it will be used as the value of a

1969

User-Agent request header.

1970

1971

If the referrer argument is supplied, it will be used as the value of a

1972

Referer[sic] request header.

1973

1974

If handlers is supplied, it is a list of handlers used to build a

1975

urllib2 opener.

1976

"""

1977

1978

if hasattr(url_file_stream_or_string, 'read'):

1979

return url_file_stream_or_string

1980

1981

if url_file_stream_or_string == '-':

1982

return sys.stdin

1983

1984

if urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp'):

1985

if not agent:

1986

agent = USER_AGENT

1987

# test for inline user:password for basic auth

1988

auth = None

1989

if base64:

1990

urltype, rest = urllib.splittype(url_file_stream_or_string)

1991

realhost, rest = urllib.splithost(rest)

1992

if realhost:

1993

user_passwd, realhost = urllib.splituser(realhost)

1994

if user_passwd:

1995

url_file_stream_or_string = '%s://%s%s' % (urltype, realhost, rest)

1996

auth = base64.encodestring(user_passwd).strip()

1997

# try to open with urllib2 (to use optional headers)

1998

request = urllib2.Request(url_file_stream_or_string)

1999

request.add_header('User-Agent', agent)

2000

if etag:

2001

request.add_header('If-None-Match', etag)

2002

if modified:

2003

# format into an RFC 1123-compliant timestamp. We can't use

2004

# time.strftime() since the %a and %b directives can be affected

2005

# by the current locale, but RFC 2616 states that dates must be

2006

# in English.

2007

short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

2008

months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

2009

request.add_header('If-Modified-Since', '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5]))

2010

if referrer:

2011

request.add_header('Referer', referrer)

2012

if gzip and zlib:

2013

request.add_header('Accept-encoding', 'gzip, deflate')

2014

elif gzip:

2015

request.add_header('Accept-encoding', 'gzip')

2016

elif zlib:

2017

request.add_header('Accept-encoding', 'deflate')

2018

else:

2019

request.add_header('Accept-encoding', '')

2020

if auth:

2021

request.add_header('Authorization', 'Basic %s' % auth)

2022

if ACCEPT_HEADER:

2023

request.add_header('Accept', ACCEPT_HEADER)

2024

request.add_header('A-IM', 'feed') # RFC 3229 support

2025

opener = apply(urllib2.build_opener, tuple([_FeedURLHandler()] + handlers))

2026

opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent

2027

try:

2028

return opener.open(request)

2029

finally:

2030

opener.close() # JohnD

2031

2032

# try to open with native open function (if url_file_stream_or_string is a filename)

2033

try:

2034

return open(url_file_stream_or_string)

2035

except (SystemExit, KeyboardInterrupt):

2036

raise

2037

except:

2038

pass

2039

2040

# treat url_file_stream_or_string as string

2041

return _StringIO(str(url_file_stream_or_string))

2042

2043

_date_handlers = []

2044

def registerDateHandler(func):

2045

'''Register a date handler function (takes string, returns 9-tuple date in GMT)'''

2046

_date_handlers.insert(0, func)

2047

2048

# ISO-8601 date parsing routines written by Fazal Majid.

2049

# The ISO 8601 standard is very convoluted and irregular - a full ISO 8601

2050

# parser is beyond the scope of feedparser and would be a worthwhile addition

2051

# to the Python library.

2052

# A single regular expression cannot parse ISO 8601 date formats into groups

2053

# as the standard is highly irregular (for instance is 030104 2003-01-04 or

2054

# 0301-04-01), so we use templates instead.

2055

# Please note the order in templates is significant because we need a

2056

# greedy match.

2057

_iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-MM', 'YYYY-?OOO',

2058

'YY-?MM-?DD', 'YY-?OOO', 'YYYY',

2059

'-YY-?MM', '-OOO', '-YY',

2060

'--MM-?DD', '--MM',

2061

'---DD',

2062

'CC', '']

2063

_iso8601_re = [

2064

tmpl.replace(

2065

'YYYY', r'(?P<year>\d{4})').replace(

2066

'YY', r'(?P<year>\d\d)').replace(

2067

'MM', r'(?P<month>[01]\d)').replace(

2068

'DD', r'(?P<day>[0123]\d)').replace(

2069

'OOO', r'(?P<ordinal>[0123]\d\d)').replace(

2070

'CC', r'(?P<century>\d\d$)')

2071

+ r'(T?(?P<hour>\d{2}):(?P<minute>\d{2})'

2072

+ r'(:(?P<second>\d{2}))?'

2073

+ r'(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?'

2074

for tmpl in _iso8601_tmpl]

2075

del tmpl

2076

_iso8601_matches = [re.compile(regex).match for regex in _iso8601_re]

2077

del regex

2078

def _parse_date_iso8601(dateString):

2079

'''Parse a variety of ISO-8601-compatible formats like 20040105'''

2080

m = None

2081

for _iso8601_match in _iso8601_matches:

2082

m = _iso8601_match(dateString)

2083

if m: break

2084

if not m: return

2085

if m.span() == (0, 0): return

2086

params = m.groupdict()

2087

ordinal = params.get('ordinal', 0)

2088

if ordinal:

2089

ordinal = int(ordinal)

2090

else:

2091

ordinal = 0

2092

year = params.get('year', '--')

2093

if not year or year == '--':

2094

year = time.gmtime()[0]

2095

elif len(year) == 2:

2096

# ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993

2097

year = 100 * int(time.gmtime()[0] / 100) + int(year)

2098

else:

2099

year = int(year)

2100

month = params.get('month', '-')

2101

if not month or month == '-':

2102

# ordinals are NOT normalized by mktime, we simulate them

2103

# by setting month=1, day=ordinal

2104

if ordinal:

2105

month = 1

2106

else:

2107

month = time.gmtime()[1]

2108

month = int(month)

2109

day = params.get('day', 0)

2110

if not day:

2111

# see above

2112

if ordinal:

2113

day = ordinal

2114

elif params.get('century', 0) or \

2115

params.get('year', 0) or params.get('month', 0):

2116

day = 1

2117

else:

2118

day = time.gmtime()[2]

2119

else:

2120

day = int(day)

2121

# special case of the century - is the first year of the 21st century

2122

# 2000 or 2001 ? The debate goes on...

2123

if 'century' in params.keys():

2124

year = (int(params['century']) - 1) * 100 + 1

2125

# in ISO 8601 most fields are optional

2126

for field in ['hour', 'minute', 'second', 'tzhour', 'tzmin']:

2127

if not params.get(field, None):

2128

params[field] = 0

2129

hour = int(params.get('hour', 0))

2130

minute = int(params.get('minute', 0))

2131

second = int(params.get('second', 0))

2132

# weekday is normalized by mktime(), we can ignore it

2133

weekday = 0

2134

# daylight savings is complex, but not needed for feedparser's purposes

2135

# as time zones, if specified, include mention of whether it is active

2136

# (e.g. PST vs. PDT, CET). Using -1 is implementation-dependent and

2137

# and most implementations have DST bugs

2138

daylight_savings_flag = 0

2139

tm = [year, month, day, hour, minute, second, weekday,

2140

ordinal, daylight_savings_flag]

2141

# ISO 8601 time zone adjustments

2142

tz = params.get('tz')

2143

if tz and tz != 'Z':

2144

if tz[0] == '-':

2145

tm[3] += int(params.get('tzhour', 0))

2146

tm[4] += int(params.get('tzmin', 0))

2147

elif tz[0] == '+':

2148

tm[3] -= int(params.get('tzhour', 0))

2149

tm[4] -= int(params.get('tzmin', 0))

2150

else:

2151

return None

2152

# Python's time.mktime() is a wrapper around the ANSI C mktime(3c)

2153

# which is guaranteed to normalize d/m/y/h/m/s.

2154

# Many implementations have bugs, but we'll pretend they don't.

2155

return time.localtime(time.mktime(tm))

2156

registerDateHandler(_parse_date_iso8601)

2157

2158

# 8-bit date handling routines written by ytrewq1.

2159

_korean_year = u'\ub144' # b3e2 in euc-kr

2160

_korean_month = u'\uc6d4' # bff9 in euc-kr

2161

_korean_day = u'\uc77c' # c0cf in euc-kr

2162

_korean_am = u'\uc624\uc804' # bfc0 c0fc in euc-kr

2163

_korean_pm = u'\uc624\ud6c4' # bfc0 c8c4 in euc-kr

2164

2165

_korean_onblog_date_re = \

2166

re.compile('(\d{4})%s\s+(\d{2})%s\s+(\d{2})%s\s+(\d{2}):(\d{2}):(\d{2})' % \

2167

(_korean_year, _korean_month, _korean_day))

2168

_korean_nate_date_re = \

2169

re.compile(u'(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})' % \

2170

(_korean_am, _korean_pm))

2171

def _parse_date_onblog(dateString):

2172

'''Parse a string according to the OnBlog 8-bit date format'''

2173

m = _korean_onblog_date_re.match(dateString)

2174

if not m: return

2175

w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \

2176

{'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\

2177

'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\

2178

'zonediff': '+09:00'}

2179

if _debug: sys.stderr.write('OnBlog date parsed as: %s\n' % w3dtfdate)

2180

return _parse_date_w3dtf(w3dtfdate)

2181

registerDateHandler(_parse_date_onblog)

2182

2183

def _parse_date_nate(dateString):

2184

'''Parse a string according to the Nate 8-bit date format'''

2185

m = _korean_nate_date_re.match(dateString)

2186

if not m: return

2187

hour = int(m.group(5))

2188

ampm = m.group(4)

2189

if (ampm == _korean_pm):

2190

hour += 12

2191

hour = str(hour)

2192

if len(hour) == 1:

2193

hour = '0' + hour

2194

w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \

2195

{'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\

2196

'hour': hour, 'minute': m.group(6), 'second': m.group(7),\

2197

'zonediff': '+09:00'}

2198

if _debug: sys.stderr.write('Nate date parsed as: %s\n' % w3dtfdate)

2199

return _parse_date_w3dtf(w3dtfdate)

2200

registerDateHandler(_parse_date_nate)

2201

2202

_mssql_date_re = \

2203

re.compile('(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})(\.\d+)?')

2204

def _parse_date_mssql(dateString):

2205

'''Parse a string according to the MS SQL date format'''

2206

m = _mssql_date_re.match(dateString)

2207

if not m: return

2208

w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \

2209

{'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\

2210

'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\

2211

'zonediff': '+09:00'}

2212

if _debug: sys.stderr.write('MS SQL date parsed as: %s\n' % w3dtfdate)

2213

return _parse_date_w3dtf(w3dtfdate)

2214

registerDateHandler(_parse_date_mssql)

2215

2216

# Unicode strings for Greek date strings

2217

_greek_months = \

2218

{ \

2219

u'\u0399\u03b1\u03bd': u'Jan', # c9e1ed in iso-8859-7

2220

u'\u03a6\u03b5\u03b2': u'Feb', # d6e5e2 in iso-8859-7

2221

u'\u039c\u03ac\u03ce': u'Mar', # ccdcfe in iso-8859-7

2222

u'\u039c\u03b1\u03ce': u'Mar', # cce1fe in iso-8859-7

2223

u'\u0391\u03c0\u03c1': u'Apr', # c1f0f1 in iso-8859-7

2224

u'\u039c\u03ac\u03b9': u'May', # ccdce9 in iso-8859-7

2225

u'\u039c\u03b1\u03ca': u'May', # cce1fa in iso-8859-7

2226

u'\u039c\u03b1\u03b9': u'May', # cce1e9 in iso-8859-7

2227

u'\u0399\u03bf\u03cd\u03bd': u'Jun', # c9effded in iso-8859-7

2228

u'\u0399\u03bf\u03bd': u'Jun', # c9efed in iso-8859-7

2229

u'\u0399\u03bf\u03cd\u03bb': u'Jul', # c9effdeb in iso-8859-7

2230

u'\u0399\u03bf\u03bb': u'Jul', # c9f9eb in iso-8859-7

2231

u'\u0391\u03cd\u03b3': u'Aug', # c1fde3 in iso-8859-7

2232

u'\u0391\u03c5\u03b3': u'Aug', # c1f5e3 in iso-8859-7

2233

u'\u03a3\u03b5\u03c0': u'Sep', # d3e5f0 in iso-8859-7

2234

u'\u039f\u03ba\u03c4': u'Oct', # cfeaf4 in iso-8859-7

2235

u'\u039d\u03bf\u03ad': u'Nov', # cdefdd in iso-8859-7

2236

u'\u039d\u03bf\u03b5': u'Nov', # cdefe5 in iso-8859-7

2237

u'\u0394\u03b5\u03ba': u'Dec', # c4e5ea in iso-8859-7

2238

}

2239

2240

_greek_wdays = \

2241

{ \

2242

u'\u039a\u03c5\u03c1': u'Sun', # caf5f1 in iso-8859-7

2243

u'\u0394\u03b5\u03c5': u'Mon', # c4e5f5 in iso-8859-7

2244

u'\u03a4\u03c1\u03b9': u'Tue', # d4f1e9 in iso-8859-7

2245

u'\u03a4\u03b5\u03c4': u'Wed', # d4e5f4 in iso-8859-7

2246

u'\u03a0\u03b5\u03bc': u'Thu', # d0e5ec in iso-8859-7

2247

u'\u03a0\u03b1\u03c1': u'Fri', # d0e1f1 in iso-8859-7

2248

u'\u03a3\u03b1\u03b2': u'Sat', # d3e1e2 in iso-8859-7

2249

}

2250

2251

_greek_date_format_re = \

2252

re.compile(u'([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)')

2253

2254

def _parse_date_greek(dateString):

2255

'''Parse a string according to a Greek 8-bit date format.'''

2256

m = _greek_date_format_re.match(dateString)

2257

if not m: return

2258

try:

2259

wday = _greek_wdays[m.group(1)]

2260

month = _greek_months[m.group(3)]

2261

except (SystemExit, KeyboardInterrupt):

2262

raise

2263

except:

2264

return

2265

rfc822date = '%(wday)s, %(day)s %(month)s %(year)s %(hour)s:%(minute)s:%(second)s %(zonediff)s' % \

2266

{'wday': wday, 'day': m.group(2), 'month': month, 'year': m.group(4),\

2267

'hour': m.group(5), 'minute': m.group(6), 'second': m.group(7),\

2268

'zonediff': m.group(8)}

2269

if _debug: sys.stderr.write('Greek date parsed as: %s\n' % rfc822date)

2270

return _parse_date_rfc822(rfc822date)

2271

registerDateHandler(_parse_date_greek)

2272

2273

# Unicode strings for Hungarian date strings

2274

_hungarian_months = \

2275

{ \

2276

u'janu\u00e1r': u'01', # e1 in iso-8859-2

2277

u'febru\u00e1ri': u'02', # e1 in iso-8859-2

2278

u'm\u00e1rcius': u'03', # e1 in iso-8859-2

2279

u'\u00e1prilis': u'04', # e1 in iso-8859-2

2280

u'm\u00e1ujus': u'05', # e1 in iso-8859-2

2281

u'j\u00fanius': u'06', # fa in iso-8859-2

2282

u'j\u00falius': u'07', # fa in iso-8859-2

2283

u'augusztus': u'08',

2284

u'szeptember': u'09',

2285

u'okt\u00f3ber': u'10', # f3 in iso-8859-2

2286

u'november': u'11',

2287

u'december': u'12',

2288

}

2289

2290

_hungarian_date_format_re = \

2291

re.compile(u'(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})((\+|-)(\d{,2}:\d{2}))')

2292

2293

def _parse_date_hungarian(dateString):

2294

'''Parse a string according to a Hungarian 8-bit date format.'''

2295

m = _hungarian_date_format_re.match(dateString)

2296

if not m: return

2297

try:

2298

month = _hungarian_months[m.group(2)]

2299

day = m.group(3)

2300

if len(day) == 1:

2301

day = '0' + day

2302

hour = m.group(4)

2303

if len(hour) == 1:

2304

hour = '0' + hour

2305

except (SystemExit, KeyboardInterrupt):

2306

raise

2307

except:

2308

return

2309

w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s%(zonediff)s' % \

2310

{'year': m.group(1), 'month': month, 'day': day,\

2311

'hour': hour, 'minute': m.group(5),\

2312

'zonediff': m.group(6)}

2313

if _debug: sys.stderr.write('Hungarian date parsed as: %s\n' % w3dtfdate)

2314

return _parse_date_w3dtf(w3dtfdate)

2315

registerDateHandler(_parse_date_hungarian)

2316

2317

# W3DTF-style date parsing adapted from PyXML xml.utils.iso8601, written by

2318

# Drake and licensed under the Python license. Removed all range checking

2319

# for month, day, hour, minute, and second, since mktime will normalize

2320

# these later

2321

def _parse_date_w3dtf(dateString):

2322

def __extract_date(m):

2323

year = int(m.group('year'))

2324

if year < 100:

2325

year = 100 * int(time.gmtime()[0] / 100) + int(year)

2326

if year < 1000:

2327

return 0, 0, 0

2328

julian = m.group('julian')

2329

if julian:

2330

julian = int(julian)

2331

month = julian / 30 + 1

2332

day = julian % 30 + 1

2333

jday = None

2334

while jday != julian:

2335

t = time.mktime((year, month, day, 0, 0, 0, 0, 0, 0))

2336

jday = time.gmtime(t)[-2]

2337

diff = abs(jday - julian)

2338

if jday > julian:

2339

if diff < day:

2340

day = day - diff

2341

else:

2342

month = month - 1

2343

day = 31

2344

elif jday < julian:

2345

if day + diff < 28:

2346

day = day + diff

2347

else:

2348

month = month + 1

2349

return year, month, day

2350

month = m.group('month')

2351

day = 1

2352

if month is None:

2353

month = 1

2354

else:

2355

month = int(month)

2356

day = m.group('day')

2357

if day:

2358

day = int(day)

2359

else:

2360

day = 1

2361

return year, month, day

2362

2363

def __extract_time(m):

2364

if not m:

2365

return 0, 0, 0

2366

hours = m.group('hours')

2367

if not hours:

2368

return 0, 0, 0

2369

hours = int(hours)

2370

minutes = int(m.group('minutes'))

2371

seconds = m.group('seconds')

2372

if seconds:

2373

seconds = int(seconds)

2374

else:

2375

seconds = 0

2376

return hours, minutes, seconds

2377

2378

def __extract_tzd(m):

2379

'''Return the Time Zone Designator as an offset in seconds from UTC.'''

2380

if not m:

2381

return 0

2382

tzd = m.group('tzd')

2383

if not tzd:

2384

return 0

2385

if tzd == 'Z':

2386

return 0

2387

hours = int(m.group('tzdhours'))

2388

minutes = m.group('tzdminutes')

2389

if minutes:

2390

minutes = int(minutes)

2391

else:

2392

minutes = 0

2393

offset = (hours*60 + minutes) * 60

2394

if tzd[0] == '+':

2395

return -offset

2396

return offset

2397

2398

__date_re = ('(?P<year>\d\d\d\d)'

2399

'(?:(?P<dsep>-|)'

2400

'(?:(?P<julian>\d\d\d)'

2401

'|(?P<month>\d\d)(?:(?P=dsep)(?P<day>\d\d))?))?')

2402

__tzd_re = '(?P<tzd>[-+](?P<tzdhours>\d\d)(?::?(?P<tzdminutes>\d\d))|Z)'

2403

__tzd_rx = re.compile(__tzd_re)

2404

__time_re = ('(?P<hours>\d\d)(?P<tsep>:|)(?P<minutes>\d\d)'

2405

'(?:(?P=tsep)(?P<seconds>\d\d(?:[.,]\d+)?))?'

2406

+ __tzd_re)

2407

__datetime_re = '%s(?:T%s)?' % (__date_re, __time_re)

2408

__datetime_rx = re.compile(__datetime_re)

2409

m = __datetime_rx.match(dateString)

2410

if (m is None) or (m.group() != dateString): return

2411

gmt = __extract_date(m) + __extract_time(m) + (0, 0, 0)

2412

if gmt[0] == 0: return

2413

return time.gmtime(time.mktime(gmt) + __extract_tzd(m) - time.timezone)

2414

registerDateHandler(_parse_date_w3dtf)

2415

2416

def _parse_date_rfc822(dateString):

2417

'''Parse an RFC822, RFC1123, RFC2822, or asctime-style date'''

2418

data = dateString.split()

2419

if data[0][-1] in (',', '.') or data[0].lower() in rfc822._daynames:

2420

del data[0]

2421

if len(data) == 4:

2422

s = data[3]

2423

i = s.find('+')

2424

if i > 0:

2425

data[3:] = [s[:i], s[i+1:]]

2426

else:

2427

data.append('')

2428

dateString = " ".join(data)

2429

if len(data) < 5:

2430

dateString += ' 00:00:00 GMT'

2431

tm = rfc822.parsedate_tz(dateString)

2432

if tm:

2433

return time.gmtime(rfc822.mktime_tz(tm))

2434

# rfc822.py defines several time zones, but we define some extra ones.

2435

# 'ET' is equivalent to 'EST', etc.

2436

_additional_timezones = {'AT': -400, 'ET': -500, 'CT': -600, 'MT': -700, 'PT': -800}

2437

rfc822._timezones.update(_additional_timezones)

2438

registerDateHandler(_parse_date_rfc822)

2439

2440

def _parse_date(dateString):

2441

'''Parses a variety of date formats into a 9-tuple in GMT'''

2442

for handler in _date_handlers:

2443

try:

2444

date9tuple = handler(dateString)

2445

if not date9tuple: continue

2446

if len(date9tuple) != 9:

2447

if _debug: sys.stderr.write('date handler function must return 9-tuple\n')

2448

raise ValueError

2449

map(int, date9tuple)

2450

return date9tuple

2451

except Exception, e:

2452

if _debug: sys.stderr.write('%s raised %s\n' % (handler.__name__, repr(e)))

2453

pass

2454

return None

2455

2456

def _getCharacterEncoding(http_headers, xml_data):

2457

'''Get the character encoding of the XML document

2458

2459

http_headers is a dictionary

2460

xml_data is a raw string (not Unicode)

2461

2462

This is so much trickier than it sounds, it's not even funny.

2463

According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type

2464

is application/xml, application/*+xml,

2465

application/xml-external-parsed-entity, or application/xml-dtd,

2466

the encoding given in the charset parameter of the HTTP Content-Type

2467

takes precedence over the encoding given in the XML prefix within the

2468

document, and defaults to 'utf-8' if neither are specified. But, if

2469

the HTTP Content-Type is text/xml, text/*+xml, or

2470

text/xml-external-parsed-entity, the encoding given in the XML prefix

2471

within the document is ALWAYS IGNORED and only the encoding given in

2472

the charset parameter of the HTTP Content-Type header should be

2473

respected, and it defaults to 'us-ascii' if not specified.

2474

2475

Furthermore, discussion on the atom-syntax mailing list with the

2476

author of RFC 3023 leads me to the conclusion that any document

2477

served with a Content-Type of text/* and no charset parameter

2478

must be treated as us-ascii. (We now do this.) And also that it

2479

must always be flagged as non-well-formed. (We now do this too.)

2480

2481

If Content-Type is unspecified (input was local file or non-HTTP source)

2482

or unrecognized (server just got it totally wrong), then go by the

2483

encoding given in the XML prefix of the document and default to

2484

'iso-8859-1' as per the HTTP specification (RFC 2616).

2485

2486

Then, assuming we didn't find a character encoding in the HTTP headers

2487

(and the HTTP Content-type allowed us to look in the body), we need

2488

to sniff the first few bytes of the XML data and try to determine

2489

whether the encoding is ASCII-compatible. Section F of the XML

2490

specification shows the way here:

2491

http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info

2492

2493

If the sniffed encoding is not ASCII-compatible, we need to make it

2494

ASCII compatible so that we can sniff further into the XML declaration

2495

to find the encoding attribute, which will tell us the true encoding.

2496

2497

Of course, none of this guarantees that we will be able to parse the

2498

feed in the declared character encoding (assuming it was declared

2499

correctly, which many are not). CJKCodecs and iconv_codec help a lot;

2500

you should definitely install them if you can.

2501

http://cjkpython.i18n.org/

2502

'''

2503

2504

def _parseHTTPContentType(content_type):

2505

'''takes HTTP Content-Type header and returns (content type, charset)

2506

2507

If no charset is specified, returns (content type, '')

2508

If no content type is specified, returns ('', '')

2509

Both return parameters are guaranteed to be lowercase strings

2510

'''

2511

content_type = content_type or ''

2512

content_type, params = cgi.parse_header(content_type)

2513

return content_type, params.get('charset', '').replace("'", '')

2514

2515

sniffed_xml_encoding = ''

2516

xml_encoding = ''

2517

true_encoding = ''

2518

http_content_type, http_encoding = _parseHTTPContentType(http_headers.get('content-type'))

2519

# Must sniff for non-ASCII-compatible character encodings before

2520

# searching for XML declaration. This heuristic is defined in

2521

# section F of the XML specification:

2522

# http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info

2523

try:

2524

if xml_data[:4] == '\x4c\x6f\xa7\x94':

2525

# EBCDIC

2526

xml_data = _ebcdic_to_ascii(xml_data)

2527

elif xml_data[:4] == '\x00\x3c\x00\x3f':

2528

# UTF-16BE

2529

sniffed_xml_encoding = 'utf-16be'

2530

xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')

2531

elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') and (xml_data[2:4] != '\x00\x00'):

2532

# UTF-16BE with BOM

2533

sniffed_xml_encoding = 'utf-16be'

2534

xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')

2535

elif xml_data[:4] == '\x3c\x00\x3f\x00':

2536

# UTF-16LE

2537

sniffed_xml_encoding = 'utf-16le'

2538

xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')

2539

elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and (xml_data[2:4] != '\x00\x00'):

2540

# UTF-16LE with BOM

2541

sniffed_xml_encoding = 'utf-16le'

2542

xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')

2543

elif xml_data[:4] == '\x00\x00\x00\x3c':

2544

# UTF-32BE

2545

sniffed_xml_encoding = 'utf-32be'

2546

xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')

2547

elif xml_data[:4] == '\x3c\x00\x00\x00':

2548

# UTF-32LE

2549

sniffed_xml_encoding = 'utf-32le'

2550

xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')

2551

elif xml_data[:4] == '\x00\x00\xfe\xff':

2552

# UTF-32BE with BOM

2553

sniffed_xml_encoding = 'utf-32be'

2554

xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')

2555

elif xml_data[:4] == '\xff\xfe\x00\x00':

2556

# UTF-32LE with BOM

2557

sniffed_xml_encoding = 'utf-32le'

2558

xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')

2559

elif xml_data[:3] == '\xef\xbb\xbf':

2560

# UTF-8 with BOM

2561

sniffed_xml_encoding = 'utf-8'

2562

xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')

2563

else:

2564

# ASCII-compatible

2565

pass

2566

xml_encoding_match = re.compile('^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)

2567

except (SystemExit, KeyboardInterrupt):

2568

raise

2569

except:

2570

xml_encoding_match = None

2571

if xml_encoding_match:

2572

xml_encoding = xml_encoding_match.groups()[0].lower()

2573

if sniffed_xml_encoding and (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16')):

2574

xml_encoding = sniffed_xml_encoding

2575

acceptable_content_type = 0

2576

application_content_types = ('application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity')

2577

text_content_types = ('text/xml', 'text/xml-external-parsed-entity')

2578

if (http_content_type in application_content_types) or \

2579

(http_content_type.startswith('application/') and http_content_type.endswith('+xml')):

2580

acceptable_content_type = 1

2581

true_encoding = http_encoding or xml_encoding or 'utf-8'

2582

elif (http_content_type in text_content_types) or \

2583

(http_content_type.startswith('text/')) and http_content_type.endswith('+xml'):

2584

acceptable_content_type = 1

2585

true_encoding = http_encoding or 'us-ascii'

2586

elif http_content_type.startswith('text/'):

2587

true_encoding = http_encoding or 'us-ascii'

2588

elif http_headers and (not http_headers.has_key('content-type')):

2589

true_encoding = xml_encoding or 'iso-8859-1'

2590

else:

2591

true_encoding = xml_encoding or 'utf-8'

2592

return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type

2593

2594

def _toUTF8(data, encoding):

2595

'''Changes an XML data stream on the fly to specify a new encoding

2596

2597

data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already

2598

encoding is a string recognized by encodings.aliases

2599

'''

2600

if _debug: sys.stderr.write('entering _toUTF8, trying encoding %s\n' % encoding)

2601

# strip Byte Order Mark (if present)

2602

if (len(data) >= 4) and (data[:2] == '\xfe\xff') and (data[2:4] != '\x00\x00'):

2603

if _debug:

2604

sys.stderr.write('stripping BOM\n')

2605

if encoding != 'utf-16be':

2606

sys.stderr.write('trying utf-16be instead\n')

2607

encoding = 'utf-16be'

2608

data = data[2:]

2609

elif (len(data) >= 4) and (data[:2] == '\xff\xfe') and (data[2:4] != '\x00\x00'):

2610

if _debug:

2611

sys.stderr.write('stripping BOM\n')

2612

if encoding != 'utf-16le':

2613

sys.stderr.write('trying utf-16le instead\n')

2614

encoding = 'utf-16le'

2615

data = data[2:]

2616

elif data[:3] == '\xef\xbb\xbf':

2617

if _debug:

2618

sys.stderr.write('stripping BOM\n')

2619

if encoding != 'utf-8':

2620

sys.stderr.write('trying utf-8 instead\n')

2621

encoding = 'utf-8'

2622

data = data[3:]

2623

elif data[:4] == '\x00\x00\xfe\xff':

2624

if _debug:

2625

sys.stderr.write('stripping BOM\n')

2626

if encoding != 'utf-32be':

2627

sys.stderr.write('trying utf-32be instead\n')

2628

encoding = 'utf-32be'

2629

data = data[4:]

2630

elif data[:4] == '\xff\xfe\x00\x00':

2631

if _debug:

2632

sys.stderr.write('stripping BOM\n')

2633

if encoding != 'utf-32le':

2634

sys.stderr.write('trying utf-32le instead\n')

2635

encoding = 'utf-32le'

2636

data = data[4:]

2637

newdata = unicode(data, encoding)

2638

if _debug: sys.stderr.write('successfully converted %s data to unicode\n' % encoding)

2639

declmatch = re.compile('^<\?xml[^>]*?>')

2640

newdecl = '''<?xml version='1.0' encoding='utf-8'?>'''

2641

if declmatch.search(newdata):

2642

newdata = declmatch.sub(newdecl, newdata)

2643

else:

2644

newdata = newdecl + u'\n' + newdata

2645

return newdata.encode('utf-8')

2646

2647

def _stripDoctype(data):

2648

'''Strips DOCTYPE from XML document, returns (rss_version, stripped_data)

2649

2650

rss_version may be 'rss091n' or None

2651

stripped_data is the same XML document, minus the DOCTYPE

2652

'''

2653

entity_pattern = re.compile(r'<!ENTITY([^>]*?)>', re.MULTILINE)

2654

data = entity_pattern.sub('', data)

2655

doctype_pattern = re.compile(r'<!DOCTYPE([^>]*?)>', re.MULTILINE)

2656

doctype_results = doctype_pattern.findall(data)

2657

doctype = doctype_results and doctype_results[0] or ''

2658

if doctype.lower().count('netscape'):

2659

version = 'rss091n'

2660

else:

2661

version = None

2662

data = doctype_pattern.sub('', data)

2663

return version, data

2664

2665

def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=[]):

2666

'''Parse a feed from a URL, file, stream, or string'''

2667

result = FeedParserDict()

2668

result['feed'] = FeedParserDict()

2669

result['entries'] = []

2670

if _XML_AVAILABLE:

2671

result['bozo'] = 0

2672

if type(handlers) == types.InstanceType:

2673

handlers = [handlers]

2674

try:

2675

f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers)

2676

data = f.read()

2677

except Exception, e:

2678

result['bozo'] = 1

2679

result['bozo_exception'] = e

2680

data = ''

2681

f = None

2682

2683

# if feed is gzip-compressed, decompress it

2684

if f and data and hasattr(f, 'headers'):

2685

if gzip and f.headers.get('content-encoding', '') == 'gzip':

2686

try:

2687

data = gzip.GzipFile(fileobj=_StringIO(data)).read()

2688

except Exception, e:

2689

# Some feeds claim to be gzipped but they're not, so

2690

# we get garbage. Ideally, we should re-request the

2691

# feed without the 'Accept-encoding: gzip' header,

2692

# but we don't.

2693

result['bozo'] = 1

2694

result['bozo_exception'] = e

2695

data = ''

2696

elif zlib and f.headers.get('content-encoding', '') == 'deflate':

2697

try:

2698

data = zlib.decompress(data, -zlib.MAX_WBITS)

2699

except Exception, e:

2700

result['bozo'] = 1

2701

result['bozo_exception'] = e

2702

data = ''

2703

2704

# save HTTP headers

2705

if hasattr(f, 'info'):

2706

info = f.info()

2707

result['etag'] = info.getheader('ETag')

2708

last_modified = info.getheader('Last-Modified')

2709

if last_modified:

2710

result['modified'] = _parse_date(last_modified)

2711

if hasattr(f, 'url'):

2712

result['href'] = f.url

2713

result['status'] = 200

2714

if hasattr(f, 'status'):

2715

result['status'] = f.status

2716

if hasattr(f, 'headers'):

2717

result['headers'] = f.headers.dict

2718

if hasattr(f, 'close'):

2719

f.close()

2720

2721

# there are four encodings to keep track of:

2722

# - http_encoding is the encoding declared in the Content-Type HTTP header

2723

# - xml_encoding is the encoding declared in the <?xml declaration

2724

# - sniffed_encoding is the encoding sniffed from the first 4 bytes of the XML data

2725

# - result['encoding'] is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications

2726

http_headers = result.get('headers', {})

2727

result['encoding'], http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type = \

2728

_getCharacterEncoding(http_headers, data)

2729

if http_headers and (not acceptable_content_type):

2730

if http_headers.has_key('content-type'):

2731

bozo_message = '%s is not an XML media type' % http_headers['content-type']

2732

else:

2733

bozo_message = 'no Content-type specified'

2734

result['bozo'] = 1

2735

result['bozo_exception'] = NonXMLContentType(bozo_message)

2736

2737

result['version'], data = _stripDoctype(data)

2738

2739

baseuri = http_headers.get('content-location', result.get('href'))

2740

baselang = http_headers.get('content-language', None)

2741

2742

# if server sent 304, we're done

2743

if result.get('status', 0) == 304:

2744

result['version'] = ''

2745

result['debug_message'] = 'The feed has not changed since you last checked, ' + \

2746

'so the server sent no data. This is a feature, not a bug!'

2747

return result

2748

2749

# if there was a problem downloading, we're done

2750

if not data:

2751

return result

2752

2753

# determine character encoding

2754

use_strict_parser = 0

2755

known_encoding = 0

2756

tried_encodings = []

2757

# try: HTTP encoding, declared XML encoding, encoding sniffed from BOM

2758

for proposed_encoding in (result['encoding'], xml_encoding, sniffed_xml_encoding):

2759

if not proposed_encoding: continue

2760

if proposed_encoding in tried_encodings: continue

2761

tried_encodings.append(proposed_encoding)

2762

try:

2763

data = _toUTF8(data, proposed_encoding)

2764

known_encoding = use_strict_parser = 1

2765

break

2766

except (SystemExit, KeyboardInterrupt):

2767

raise

2768

except:

2769

pass

2770

# if no luck and we have auto-detection library, try that

2771

if (not known_encoding) and chardet:

2772

try:

2773

proposed_encoding = chardet.detect(data)['encoding']

2774

if proposed_encoding and (proposed_encoding not in tried_encodings):

2775

tried_encodings.append(proposed_encoding)

2776

data = _toUTF8(data, proposed_encoding)

2777

known_encoding = use_strict_parser = 1

2778

except (SystemExit, KeyboardInterrupt):

2779

raise

2780

except:

2781

pass

2782

# if still no luck and we haven't tried utf-8 yet, try that

2783

if (not known_encoding) and ('utf-8' not in tried_encodings):

2784

try:

2785

proposed_encoding = 'utf-8'

2786

tried_encodings.append(proposed_encoding)

2787

data = _toUTF8(data, proposed_encoding)

2788

known_encoding = use_strict_parser = 1

2789

except (SystemExit, KeyboardInterrupt):

2790

raise

2791

except:

2792

pass

2793

# if still no luck and we haven't tried windows-1252 yet, try that

2794

if (not known_encoding) and ('windows-1252' not in tried_encodings):

2795

try:

2796

proposed_encoding = 'windows-1252'

2797

tried_encodings.append(proposed_encoding)

2798

data = _toUTF8(data, proposed_encoding)

2799

known_encoding = use_strict_parser = 1

2800

except (SystemExit, KeyboardInterrupt):

2801

raise

2802

except:

2803

pass

2804

# if still no luck, give up

2805

if not known_encoding:

2806

result['bozo'] = 1

2807

result['bozo_exception'] = CharacterEncodingUnknown( \

2808

'document encoding unknown, I tried ' + \

2809

'%s, %s, utf-8, and windows-1252 but nothing worked' % \

2810

(result['encoding'], xml_encoding))

2811

result['encoding'] = ''

2812

elif proposed_encoding != result['encoding']:

2813

result['bozo'] = 1

2814

result['bozo_exception'] = CharacterEncodingOverride( \

2815

'documented declared as %s, but parsed as %s' % \

2816

(result['encoding'], proposed_encoding))

2817

result['encoding'] = proposed_encoding

2818

2819

if not _XML_AVAILABLE:

2820

use_strict_parser = 0

2821

if use_strict_parser:

2822

# initialize the SAX parser

2823

feedparser = _StrictFeedParser(baseuri, baselang, 'utf-8')

2824

saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS)

2825

saxparser.setFeature(xml.sax.handler.feature_namespaces, 1)

2826

saxparser.setContentHandler(feedparser)

2827

saxparser.setErrorHandler(feedparser)

2828

source = xml.sax.xmlreader.InputSource()

2829

source.setByteStream(_StringIO(data))

2830

if hasattr(saxparser, '_ns_stack'):

2831

# work around bug in built-in SAX parser (doesn't recognize xml: namespace)

2832

# PyXML doesn't have this problem, and it doesn't have _ns_stack either

2833

saxparser._ns_stack.append({'http://www.w3.org/XML/1998/namespace':'xml'})

2834

try:

2835

saxparser.parse(source)

2836

except Exception, e:

2837

if _debug:

2838

import traceback

2839

traceback.print_stack()

2840

traceback.print_exc()

2841

sys.stderr.write('xml parsing failed\n')

2842

result['bozo'] = 1

2843

result['bozo_exception'] = feedparser.exc or e

2844

use_strict_parser = 0

2845

if not use_strict_parser:

2846

feedparser = _LooseFeedParser(baseuri, baselang, known_encoding and 'utf-8' or '')

2847

feedparser.feed(data)

2848

result['feed'] = feedparser.feeddata

2849

result['entries'] = feedparser.entries

2850

result['version'] = result['version'] or feedparser.version

2851

result['namespaces'] = feedparser.namespacesInUse

2852

return result

2853

2854

if __name__ == '__main__':

2855

if not sys.argv[1:]:

2856

print __doc__

2857

sys.exit(0)

2858

else:

2859

urls = sys.argv[1:]

2860

zopeCompatibilityHack()

2861

from pprint import pprint

2862

for url in urls:

2863

print url

2864

2865

result = parse(url)

2866

pprint(result)

2867

2868

2869

#REVISION HISTORY

2870

#1.0 - 9/27/2002 - MAP - fixed namespace processing on prefixed RSS 2.0 elements,

2871

# added Simon Fell's test suite

2872

#1.1 - 9/29/2002 - MAP - fixed infinite loop on incomplete CDATA sections

2873

#2.0 - 10/19/2002

2874

# JD - use inchannel to watch out for image and textinput elements which can

2875

# also contain title, link, and description elements

2876

# JD - check for isPermaLink='false' attribute on guid elements

2877

# JD - replaced openAnything with open_resource supporting ETag and

2878

# If-Modified-Since request headers

2879

# JD - parse now accepts etag, modified, agent, and referrer optional

2880

# arguments

2881

# JD - modified parse to return a dictionary instead of a tuple so that any

2882

# etag or modified information can be returned and cached by the caller

2883

#2.0.1 - 10/21/2002 - MAP - changed parse() so that if we don't get anything

2884

# because of etag/modified, return the old etag/modified to the caller to

2885

# indicate why nothing is being returned

2886

#2.0.2 - 10/21/2002 - JB - added the inchannel to the if statement, otherwise its

2887

# useless. Fixes the problem JD was addressing by adding it.

2888

#2.1 - 11/14/2002 - MAP - added gzip support

2889

#2.2 - 1/27/2003 - MAP - added attribute support, admin:generatorAgent.

2890

# start_admingeneratoragent is an example of how to handle elements with

2891

# only attributes, no content.

2892

#2.3 - 6/11/2003 - MAP - added USER_AGENT for default (if caller doesn't specify);

2893

# also, make sure we send the User-Agent even if urllib2 isn't available.

2894

# Match any variation of backend.userland.com/rss namespace.

2895

#2.3.1 - 6/12/2003 - MAP - if item has both link and guid, return both as-is.

2896

#2.4 - 7/9/2003 - MAP - added preliminary Pie/Atom/Echo support based on Sam Ruby's

2897

# snapshot of July 1 <http://www.intertwingly.net/blog/1506.html>; changed

2898

# project name

2899

#2.5 - 7/25/2003 - MAP - changed to Python license (all contributors agree);

2900

# removed unnecessary urllib code -- urllib2 should always be available anyway;

2901

# return actual url, status, and full HTTP headers (as result['url'],

2902

# result['status'], and result['headers']) if parsing a remote feed over HTTP --

2903

# this should pass all the HTTP tests at <http://diveintomark.org/tests/client/http/>;

2904

# added the latest namespace-of-the-week for RSS 2.0

2905

#2.5.1 - 7/26/2003 - RMK - clear opener.addheaders so we only send our custom

2906

# User-Agent (otherwise urllib2 sends two, which confuses some servers)

2907

#2.5.2 - 7/28/2003 - MAP - entity-decode inline xml properly; added support for

2908

# inline <xhtml:body> and <xhtml:div> as used in some RSS 2.0 feeds

2909

#2.5.3 - 8/6/2003 - TvdV - patch to track whether we're inside an image or

2910

# textInput, and also to return the character encoding (if specified)

2911

#2.6 - 1/1/2004 - MAP - dc:author support (MarekK); fixed bug tracking

2912

# nested divs within content (JohnD); fixed missing sys import (JohanS);

2913

# fixed regular expression to capture XML character encoding (Andrei);

2914

# added support for Atom 0.3-style links; fixed bug with textInput tracking;

2915

# added support for cloud (MartijnP); added support for multiple

2916

# category/dc:subject (MartijnP); normalize content model: 'description' gets

2917

# description (which can come from description, summary, or full content if no

2918

# description), 'content' gets dict of base/language/type/value (which can come

2919

# from content:encoded, xhtml:body, content, or fullitem);

2920

# fixed bug matching arbitrary Userland namespaces; added xml:base and xml:lang

2921

# tracking; fixed bug tracking unknown tags; fixed bug tracking content when

2922

# <content> element is not in default namespace (like Pocketsoap feed);

2923

# resolve relative URLs in link, guid, docs, url, comments, wfw:comment,

2924

# wfw:commentRSS; resolve relative URLs within embedded HTML markup in

2925

# description, xhtml:body, content, content:encoded, title, subtitle,

2926

# summary, info, tagline, and copyright; added support for pingback and

2927

# trackback namespaces

2928

#2.7 - 1/5/2004 - MAP - really added support for trackback and pingback

2929

# namespaces, as opposed to 2.6 when I said I did but didn't really;

2930

# sanitize HTML markup within some elements; added mxTidy support (if

2931

# installed) to tidy HTML markup within some elements; fixed indentation

2932

# bug in _parse_date (FazalM); use socket.setdefaulttimeout if available

2933

# (FazalM); universal date parsing and normalization (FazalM): 'created', modified',

2934

# 'issued' are parsed into 9-tuple date format and stored in 'created_parsed',

2935

# 'modified_parsed', and 'issued_parsed'; 'date' is duplicated in 'modified'

2936

# and vice-versa; 'date_parsed' is duplicated in 'modified_parsed' and vice-versa

2937

#2.7.1 - 1/9/2004 - MAP - fixed bug handling " and '. fixed memory

2938

# leak not closing url opener (JohnD); added dc:publisher support (MarekK);

2939

# added admin:errorReportsTo support (MarekK); Python 2.1 dict support (MarekK)

2940

#2.7.4 - 1/14/2004 - MAP - added workaround for improperly formed <br/> tags in

2941

# encoded HTML (skadz); fixed unicode handling in normalize_attrs (ChrisL);

2942

# fixed relative URI processing for guid (skadz); added ICBM support; added

2943

# base64 support

2944

#2.7.5 - 1/15/2004 - MAP - added workaround for malformed DOCTYPE (seen on many

2945

# blogspot.com sites); added _debug variable

2946

#2.7.6 - 1/16/2004 - MAP - fixed bug with StringIO importing

2947

#3.0b3 - 1/23/2004 - MAP - parse entire feed with real XML parser (if available);

2948

# added several new supported namespaces; fixed bug tracking naked markup in

2949

# description; added support for enclosure; added support for source; re-added

2950

# support for cloud which got dropped somehow; added support for expirationDate

2951

#3.0b4 - 1/26/2004 - MAP - fixed xml:lang inheritance; fixed multiple bugs tracking

2952

# xml:base URI, one for documents that don't define one explicitly and one for

2953

# documents that define an outer and an inner xml:base that goes out of scope

2954

# before the end of the document

2955

#3.0b5 - 1/26/2004 - MAP - fixed bug parsing multiple links at feed level

2956

#3.0b6 - 1/27/2004 - MAP - added feed type and version detection, result['version']

2957

# will be one of SUPPORTED_VERSIONS.keys() or empty string if unrecognized;

2958

# added support for creativeCommons:license and cc:license; added support for

2959

# full Atom content model in title, tagline, info, copyright, summary; fixed bug

2960

# with gzip encoding (not always telling server we support it when we do)

2961

#3.0b7 - 1/28/2004 - MAP - support Atom-style author element in author_detail

2962

# (dictionary of 'name', 'url', 'email'); map author to author_detail if author

2963

# contains name + email address

2964

#3.0b8 - 1/28/2004 - MAP - added support for contributor

2965

#3.0b9 - 1/29/2004 - MAP - fixed check for presence of dict function; added

2966

# support for summary

2967

#3.0b10 - 1/31/2004 - MAP - incorporated ISO-8601 date parsing routines from

2968

# xml.util.iso8601

2969

#3.0b11 - 2/2/2004 - MAP - added 'rights' to list of elements that can contain

2970

# dangerous markup; fiddled with decodeEntities (not right); liberalized

2971

# date parsing even further

2972

#3.0b12 - 2/6/2004 - MAP - fiddled with decodeEntities (still not right);

2973

# added support to Atom 0.2 subtitle; added support for Atom content model

2974

# in copyright; better sanitizing of dangerous HTML elements with end tags

2975

# (script, frameset)

2976

#3.0b13 - 2/8/2004 - MAP - better handling of empty HTML tags (br, hr, img,

2977

# etc.) in embedded markup, in either HTML or XHTML form (<br>, <br/>, <br />)

2978

#3.0b14 - 2/8/2004 - MAP - fixed CDATA handling in non-wellformed feeds under

2979

# Python 2.1

2980

#3.0b15 - 2/11/2004 - MAP - fixed bug resolving relative links in wfw:commentRSS;

2981

# fixed bug capturing author and contributor URL; fixed bug resolving relative

2982

# links in author and contributor URL; fixed bug resolvin relative links in

2983

# generator URL; added support for recognizing RSS 1.0; passed Simon Fell's

2984

# namespace tests, and included them permanently in the test suite with his

2985

# permission; fixed namespace handling under Python 2.1

2986

#3.0b16 - 2/12/2004 - MAP - fixed support for RSS 0.90 (broken in b15)

2987

#3.0b17 - 2/13/2004 - MAP - determine character encoding as per RFC 3023

2988

#3.0b18 - 2/17/2004 - MAP - always map description to summary_detail (Andrei);

2989

# use libxml2 (if available)

2990

#3.0b19 - 3/15/2004 - MAP - fixed bug exploding author information when author

2991

# name was in parentheses; removed ultra-problematic mxTidy support; patch to

2992

# workaround crash in PyXML/expat when encountering invalid entities

2993

# (MarkMoraes); support for textinput/textInput

2994

#3.0b20 - 4/7/2004 - MAP - added CDF support

2995

#3.0b21 - 4/14/2004 - MAP - added Hot RSS support

2996

#3.0b22 - 4/19/2004 - MAP - changed 'channel' to 'feed', 'item' to 'entries' in

2997

# results dict; changed results dict to allow getting values with results.key

2998

# as well as results[key]; work around embedded illformed HTML with half

2999

# a DOCTYPE; work around malformed Content-Type header; if character encoding

3000

# is wrong, try several common ones before falling back to regexes (if this

3001

# works, bozo_exception is set to CharacterEncodingOverride); fixed character

3002

# encoding issues in BaseHTMLProcessor by tracking encoding and converting

3003

# from Unicode to raw strings before feeding data to sgmllib.SGMLParser;

3004

# convert each value in results to Unicode (if possible), even if using

3005

# regex-based parsing

3006

#3.0b23 - 4/21/2004 - MAP - fixed UnicodeDecodeError for feeds that contain

3007

# high-bit characters in attributes in embedded HTML in description (thanks

3008

# Thijs van de Vossen); moved guid, date, and date_parsed to mapped keys in

3009

# FeedParserDict; tweaked FeedParserDict.has_key to return True if asking

3010

# about a mapped key

3011

#3.0fc1 - 4/23/2004 - MAP - made results.entries[0].links[0] and

3012

# results.entries[0].enclosures[0] into FeedParserDict; fixed typo that could

3013

# cause the same encoding to be tried twice (even if it failed the first time);

3014

# fixed DOCTYPE stripping when DOCTYPE contained entity declarations;

3015

# better textinput and image tracking in illformed RSS 1.0 feeds

3016

#3.0fc2 - 5/10/2004 - MAP - added and passed Sam's amp tests; added and passed

3017

# my blink tag tests

3018

#3.0fc3 - 6/18/2004 - MAP - fixed bug in _changeEncodingDeclaration that

3019

# failed to parse utf-16 encoded feeds; made source into a FeedParserDict;

3020

# duplicate admin:generatorAgent/@rdf:resource in generator_detail.url;

3021

# added support for image; refactored parse() fallback logic to try other

3022

# encodings if SAX parsing fails (previously it would only try other encodings

3023

# if re-encoding failed); remove unichr madness in normalize_attrs now that

3024

# we're properly tracking encoding in and out of BaseHTMLProcessor; set

3025

# feed.language from root-level xml:lang; set entry.id from rdf:about;

3026

# send Accept header

3027

#3.0 - 6/21/2004 - MAP - don't try iso-8859-1 (can't distinguish between

3028

# iso-8859-1 and windows-1252 anyway, and most incorrectly marked feeds are

3029

# windows-1252); fixed regression that could cause the same encoding to be

3030

# tried twice (even if it failed the first time)

3031

#3.0.1 - 6/22/2004 - MAP - default to us-ascii for all text/* content types;

3032

# recover from malformed content-type header parameter with no equals sign

3033

# ('text/xml; charset:iso-8859-1')

3034

#3.1 - 6/28/2004 - MAP - added and passed tests for converting HTML entities

3035

# to Unicode equivalents in illformed feeds (aaronsw); added and

3036

# passed tests for converting character entities to Unicode equivalents

3037

# in illformed feeds (aaronsw); test for valid parsers when setting

3038

# XML_AVAILABLE; make version and encoding available when server returns

3039

# a 304; add handlers parameter to pass arbitrary urllib2 handlers (like

3040

# digest auth or proxy support); add code to parse username/password

3041

# out of url and send as basic authentication; expose downloading-related

3042

# exceptions in bozo_exception (aaronsw); added __contains__ method to

3043

# FeedParserDict (aaronsw); added publisher_detail (aaronsw)

3044

#3.2 - 7/3/2004 - MAP - use cjkcodecs and iconv_codec if available; always

3045

# convert feed to UTF-8 before passing to XML parser; completely revamped

3046

# logic for determining character encoding and attempting XML parsing

3047

# (much faster); increased default timeout to 20 seconds; test for presence

3048

# of Location header on redirects; added tests for many alternate character

3049

# encodings; support various EBCDIC encodings; support UTF-16BE and

3050

# UTF16-LE with or without a BOM; support UTF-8 with a BOM; support

3051

# UTF-32BE and UTF-32LE with or without a BOM; fixed crashing bug if no

3052

# XML parsers are available; added support for 'Content-encoding: deflate';

3053

# send blank 'Accept-encoding: ' header if neither gzip nor zlib modules

3054

# are available

3055

#3.3 - 7/15/2004 - MAP - optimize EBCDIC to ASCII conversion; fix obscure

3056

# problem tracking xml:base and xml:lang if element declares it, child

3057

# doesn't, first grandchild redeclares it, and second grandchild doesn't;

3058

# refactored date parsing; defined public registerDateHandler so callers

3059

# can add support for additional date formats at runtime; added support

3060

# for OnBlog, Nate, MSSQL, Greek, and Hungarian dates (ytrewq1); added

3061

# zopeCompatibilityHack() which turns FeedParserDict into a regular

3062

# dictionary, required for Zope compatibility, and also makes command-

3063

# line debugging easier because pprint module formats real dictionaries

3064

# better than dictionary-like objects; added NonXMLContentType exception,

3065

# which is stored in bozo_exception when a feed is served with a non-XML

3066

# media type such as 'text/plain'; respect Content-Language as default

3067

# language if not xml:lang is present; cloud dict is now FeedParserDict;

3068

# generator dict is now FeedParserDict; better tracking of xml:lang,

3069

# including support for xml:lang='' to unset the current language;

3070

# recognize RSS 1.0 feeds even when RSS 1.0 namespace is not the default

3071

# namespace; don't overwrite final status on redirects (scenarios:

3072

# redirecting to a URL that returns 304, redirecting to a URL that

3073

# redirects to another URL with a different type of redirect); add

3074

# support for HTTP 303 redirects

3075

#4.0 - MAP - support for relative URIs in xml:base attribute; fixed

3076

# encoding issue with mxTidy (phopkins); preliminary support for RFC 3229;

3077

# support for Atom 1.0; support for iTunes extensions; new 'tags' for

3078

# categories/keywords/etc. as array of dict

3079

# {'term': term, 'scheme': scheme, 'label': label} to match Atom 1.0

3080

# terminology; parse RFC 822-style dates with no time; lots of other

3081

# bug fixes

3082

#4.1 - MAP - removed socket timeout; added support for chardet library

Older »