1
"""Universal feed parser
3
Handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds
5
Visit http://feedparser.org/ for the latest version
6
Visit http://feedparser.org/docs/ for the latest documentation
8
Required: Python 2.1 or later
9
Recommended: Python 2.3 or later
10
Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
13
__version__ = "4.1"# + "$Revision$"[11:15] + "-cvs"
14
__license__ = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.
16
Redistribution and use in source and binary forms, with or without modification,
17
are permitted provided that the following conditions are met:
19
* Redistributions of source code must retain the above copyright notice,
20
this list of conditions and the following disclaimer.
21
* Redistributions in binary form must reproduce the above copyright notice,
22
this list of conditions and the following disclaimer in the documentation
23
and/or other materials provided with the distribution.
25
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
26
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
29
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
30
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
31
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
32
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
33
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
34
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
35
POSSIBILITY OF SUCH DAMAGE.
37
__author__ = "Mark Pilgrim <http://diveintomark.org/>"
38
__contributors__ = ["Jason Diamond <http://injektilo.org/>",
39
"John Beimler <http://john.beimler.org/>",
40
"Fazal Majid <http://www.majid.info/mylos/weblog/>",
41
"Aaron Swartz <http://aaronsw.com/>",
42
"Kevin Marks <http://epeus.blogspot.com/>"]
45
# HTTP "User-Agent" header to send to servers when downloading feeds.
46
# If you are embedding feedparser in a larger application, you should
47
# change this to your application name and URL.
48
USER_AGENT = "UniversalFeedParser/%s +http://feedparser.org/" % __version__
49
from miro import config
50
from miro import prefs
51
USER_AGENT += " %s/%s (%s)" % \
52
(config.get(prefs.SHORT_APP_NAME),
53
config.get(prefs.APP_VERSION),
54
config.get(prefs.PROJECT_URL))
56
# HTTP "Accept" header to send to servers when downloading feeds. If you don't
57
# want to send an Accept header, set this to None.
58
ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"
60
# List of preferred XML parsers, by SAX driver name. These will be tried first,
# but if they're not installed, Python will keep searching through its own list
# of pre-installed parsers until it finds one that supports everything we need.
PREFERRED_XML_PARSERS = ["drv_libxml2"]
65
# If you want feedparser to automatically run HTML markup through HTML Tidy, set
# this to 1. Requires mxTidy <http://www.egenix.com/files/python/mxTidy.html>
# or utidylib <http://utidylib.berlios.de/>.
TIDY_MARKUP = 0

# List of Python interfaces for HTML Tidy, in order of preference. Only useful
# if TIDY_MARKUP = 1
PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"]
74
# ---------- required modules (should come with any Python distribution) ----------
75
import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi, urllib, urllib2
77
from cStringIO import StringIO as _StringIO
79
from StringIO import StringIO as _StringIO
81
# ---------- optional modules (feedparser will work without these, but with reduced functionality) ----------
83
# gzip is included with most Python distributions, but may not be available if you compiled your own
93
# If a real XML parser is available, feedparser will attempt to use it. feedparser has
94
# been tested with the built-in SAX parser, PyXML, and libxml2. On platforms where the
95
# Python distribution does not come with an XML parser (such as Mac OS X 10.2 and some
96
# versions of FreeBSD), feedparser will quietly fall back on regex-based parsing.
99
xml.sax.make_parser(PREFERRED_XML_PARSERS) # test for valid parsers
100
from xml.sax.saxutils import escape as _xmlescape
104
def _xmlescape(data):
105
data = data.replace('&', '&')
106
data = data.replace('>', '>')
107
data = data.replace('<', '<')
110
# base64 support for Atom feeds that contain embedded binary data
112
import base64, binascii
114
base64 = binascii = None
116
# cjkcodecs and iconv_codec provide support for more character encodings.
117
# Both are available from http://cjkpython.i18n.org/
119
import cjkcodecs.aliases
127
# chardet library auto-detects character encodings
128
# Download from http://chardet.feedparser.org/
132
import chardet.constants
133
chardet.constants._debug = 1
137
# ---------- don't touch these ----------
# Internal warning/exception hierarchy.  The encoding-related classes share a
# common base so callers can catch all of them at once.
class ThingsNobodyCaresAboutButMe(Exception): pass
class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe): pass
class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe): pass
class NonXMLContentType(ThingsNobodyCaresAboutButMe): pass
class UndeclaredNamespace(Exception): pass
144
# Loosen sgmllib's lexer tables to cope with real-world feed markup:
# allow '.' and ':' in tag names, treat only '<!' (not '<!'-like runs) as a
# declaration opener, and accept hexadecimal character references.
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
sgmllib.special = re.compile('<!')
sgmllib.charref = re.compile('&#(x?[0-9A-Fa-f]+)[^0-9A-Fa-f]')
148
SUPPORTED_VERSIONS = {'': 'unknown',
149
'rss090': 'RSS 0.90',
150
'rss091n': 'RSS 0.91 (Netscape)',
151
'rss091u': 'RSS 0.91 (Userland)',
152
'rss092': 'RSS 0.92',
153
'rss093': 'RSS 0.93',
154
'rss094': 'RSS 0.94',
157
'rss': 'RSS (unknown version)',
158
'atom01': 'Atom 0.1',
159
'atom02': 'Atom 0.2',
160
'atom03': 'Atom 0.3',
161
'atom10': 'Atom 1.0',
162
'atom': 'Atom (unknown version)',
170
# Python 2.1 does not have dict
171
from UserDict import UserDict
178
def _entry_equal(a, b):
179
if type(a) == list and type(b) == list:
182
for i in xrange (len(a)):
183
if not _entry_equal(a[i], b[i]):
188
except (SystemExit, KeyboardInterrupt):
193
except (SystemExit, KeyboardInterrupt):
198
class FeedParserDict(UserDict):
199
# This is a complete hack to prevent problems if data is saved with a
200
# newer version of Miro and an older version of Miro tries to open it.
201
# See storedatabase.py for more info.
202
__module__ = 'feedparser'
204
# values of keymap are in order of preference. for example,
205
# in description, summary is preferred to subtitle.
206
keymap = {'channel': 'feed',
209
'length': 'filesize',
210
'image': 'thumbnail',
212
'date_parsed': 'updated_parsed',
213
'description': ('summary', 'subtitle'),
215
'modified': 'updated',
216
'modified_parsed': 'updated_parsed',
217
'issued': 'published',
218
'issued_parsed': 'published_parsed',
219
'copyright': 'rights',
220
'copyright_detail': 'rights_detail',
221
'tagline': 'subtitle',
222
'tagline_detail': 'subtitle_detail'}
226
if isinstance(keymap[key], tuple):
227
for k in keymap[key]:
228
reverse_keymap[k] = key
230
reverse_keymap[keymap[key]] = key
232
def __init__(self, initialData=None):
233
if isinstance(initialData, dict):
234
UserDict.__init__(self)
235
for key in initialData:
236
self[key] = initialData[key]
237
elif initialData is not None:
238
UserDict.__init__(self, initialData)
240
UserDict.__init__(self)
242
def reverse_key (self, key):
243
if self.reverse_keymap.has_key(key):
244
return self.reverse_keymap[key]
250
def __init__ (self, container):
251
self.container = container
252
self.subiter = UserDict.__iter__(container)
256
return self.container.reverse_key(self.subiter.next())
257
return ExtendedIter (self)
259
def equal(self, other):
261
iter = other.get_iter()
262
except StandardError:
263
iter = other.__iter__()
267
if not _entry_equal(self[key], other[key]):
270
for key in self.get_iter():
271
if not checked.has_key(key):
274
except StandardError:
277
def __getitem__(self, key):
278
if key == 'category':
279
return UserDict.__getitem__(self, 'tags')[0]['term']
280
if key == 'categories':
281
return [(tag['scheme'], tag['term']) for tag in UserDict.__getitem__(self, 'tags')]
282
realkey = self.keymap.get(key, key)
283
if isinstance(realkey, tuple):
285
if UserDict.has_key(self, k):
286
return UserDict.__getitem__(self, k)
287
if UserDict.has_key(self, key):
288
return UserDict.__getitem__(self, key)
289
return UserDict.__getitem__(self, realkey)
291
def __setitem__(self, key, value):
292
for k in self.keymap.keys():
295
if isinstance(key, tuple):
297
return UserDict.__setitem__(self, key, value)
299
def get(self, key, default=None):
300
if self.has_key(key):
305
def setdefault(self, key, value):
306
if not self.has_key(key):
310
def has_key(self, key):
312
return hasattr(self, key) or UserDict.has_key(self, key)
313
except AttributeError:
316
def __getattr__(self, key):
318
assert not key.startswith('_')
319
return self.__getitem__(key)
320
except (SystemExit, KeyboardInterrupt):
323
raise AttributeError, "object has no attribute '%s'" % key
325
def __setattr__(self, key, value):
326
if key.startswith('_') or key == 'data':
327
self.__dict__[key] = value
329
return self.__setitem__(key, value)
331
def __contains__(self, key):
332
return self.has_key(key)
334
def zopeCompatibilityHack():
335
global FeedParserDict
337
def FeedParserDict(aDict=None):
343
_ebcdic_to_ascii_map = None
344
def _ebcdic_to_ascii(s):
345
global _ebcdic_to_ascii_map
346
if not _ebcdic_to_ascii_map:
348
0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
349
16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
350
128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
351
144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
352
32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
353
38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
354
45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
355
186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
356
195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,201,
357
202,106,107,108,109,110,111,112,113,114,203,204,205,206,207,208,
358
209,126,115,116,117,118,119,120,121,122,210,211,212,213,214,215,
359
216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,
360
123,65,66,67,68,69,70,71,72,73,232,233,234,235,236,237,
361
125,74,75,76,77,78,79,80,81,82,238,239,240,241,242,243,
362
92,159,83,84,85,86,87,88,89,90,244,245,246,247,248,249,
363
48,49,50,51,52,53,54,55,56,57,250,251,252,253,254,255
366
_ebcdic_to_ascii_map = string.maketrans( \
367
''.join(map(chr, range(256))), ''.join(map(chr, emap)))
368
return s.translate(_ebcdic_to_ascii_map)
370
_urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)')
371
def _urljoin(base, uri):
372
uri = _urifixer.sub(r'\1\3', uri)
373
return urlparse.urljoin(base, uri)
375
class _FeedParserMixin:
376
namespaces = {'': '',
377
'http://backend.userland.com/rss': '',
378
'http://blogs.law.harvard.edu/tech/rss': '',
379
'http://purl.org/rss/1.0/': '',
380
'http://my.netscape.com/rdf/simple/0.9/': '',
381
'http://example.com/newformat#': '',
382
'http://example.com/necho': '',
383
'http://purl.org/echo/': '',
384
'uri/of/echo/namespace#': '',
385
'http://purl.org/pie/': '',
386
'http://purl.org/atom/ns#': '',
387
'http://www.w3.org/2005/Atom': '',
388
'http://purl.org/rss/1.0/modules/rss091#': '',
390
'http://webns.net/mvcb/': 'admin',
391
'http://purl.org/rss/1.0/modules/aggregation/': 'ag',
392
'http://purl.org/rss/1.0/modules/annotate/': 'annotate',
393
'http://media.tangent.org/rss/1.0/': 'audio',
394
'http://backend.userland.com/blogChannelModule': 'blogChannel',
395
'http://web.resource.org/cc/': 'cc',
396
'http://backend.userland.com/creativeCommonsRssModule': 'creativeCommons',
397
'http://purl.org/rss/1.0/modules/company': 'co',
398
'http://purl.org/rss/1.0/modules/content/': 'content',
399
'http://my.theinfo.org/changed/1.0/rss/': 'cp',
400
'http://purl.org/dc/elements/1.1/': 'dc',
401
'http://purl.org/dc/terms/': 'dcterms',
402
'http://purl.org/rss/1.0/modules/email/': 'email',
403
'http://purl.org/rss/1.0/modules/event/': 'ev',
404
'http://rssnamespace.org/feedburner/ext/1.0': 'feedburner',
405
'http://freshmeat.net/rss/fm/': 'fm',
406
'http://xmlns.com/foaf/0.1/': 'foaf',
407
'http://www.w3.org/2003/01/geo/wgs84_pos#': 'geo',
408
'http://postneo.com/icbm/': 'icbm',
409
'http://purl.org/rss/1.0/modules/image/': 'image',
410
'http://www.itunes.com/DTDs/PodCast-1.0.dtd': 'itunes',
411
'http://example.com/DTDs/PodCast-1.0.dtd': 'itunes',
412
'http://purl.org/rss/1.0/modules/link/': 'l',
413
'http://search.yahoo.com/mrss': 'media',
414
'http://search.yahoo.com/mrss/': 'media',
415
'http://docs.yahoo.com/mediaModule': 'media',
416
'http://tools.search.yahoo.com/mrss/': 'media',
417
'http://madskills.com/public/xml/rss/module/pingback/': 'pingback',
418
'http://prismstandard.org/namespaces/1.2/basic/': 'prism',
419
'http://www.w3.org/1999/02/22-rdf-syntax-ns#': 'rdf',
420
'http://www.w3.org/2000/01/rdf-schema#': 'rdfs',
421
'http://purl.org/rss/1.0/modules/reference/': 'ref',
422
'http://purl.org/rss/1.0/modules/richequiv/': 'reqv',
423
'http://purl.org/rss/1.0/modules/search/': 'search',
424
'http://purl.org/rss/1.0/modules/slash/': 'slash',
425
'http://schemas.xmlsoap.org/soap/envelope/': 'soap',
426
'http://purl.org/rss/1.0/modules/servicestatus/': 'ss',
427
'http://hacks.benhammersley.com/rss/streaming/': 'str',
428
'http://purl.org/rss/1.0/modules/subscription/': 'sub',
429
'http://purl.org/rss/1.0/modules/syndication/': 'sy',
430
'http://purl.org/rss/1.0/modules/taxonomy/': 'taxo',
431
'http://purl.org/rss/1.0/modules/threading/': 'thr',
432
'http://purl.org/rss/1.0/modules/textinput/': 'ti',
433
'http://madskills.com/public/xml/rss/module/trackback/':'trackback',
434
'http://wellformedweb.org/commentAPI/': 'wfw',
435
'http://purl.org/rss/1.0/modules/wiki/': 'wiki',
436
'http://www.w3.org/1999/xhtml': 'xhtml',
437
'http://www.w3.org/XML/1998/namespace': 'xml',
438
'http://schemas.pocketsoap.com/rss/myDescModule/': 'szf',
439
"http://participatoryculture.org/RSSModules/dtv/1.0": 'dtv'
441
_matchnamespaces = {}
443
can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'license', 'icon', 'logo']
444
can_contain_relative_uris = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
445
can_contain_dangerous_markup = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description']
446
html_types = ['text/html', 'application/xhtml+xml']
448
def __init__(self, baseuri=None, baselang=None, encoding='utf-8'):
449
if _debug: sys.stderr.write('initializing FeedParser\n')
450
if not self._matchnamespaces:
451
for k, v in self.namespaces.items():
452
self._matchnamespaces[k.lower()] = v
453
self.feeddata = FeedParserDict() # feed-level data
454
self.encoding = encoding # character encoding
455
self.entries = [] # list of entry-level data
456
self.version = '' # feed type/version, see SUPPORTED_VERSIONS
457
self.namespacesInUse = {} # dictionary of namespaces defined by the feed
459
# the following are used internally to track state;
460
# this is really out of control and should be refactored
467
self.incontributor = 0
471
self.sourcedata = FeedParserDict()
472
self.contentparams = FeedParserDict()
473
self._summaryKey = None
474
self.namespacemap = {}
475
self.elementstack = []
478
self.baseuri = baseuri or ''
479
self.lang = baselang or None
481
self.feeddata['language'] = baselang
483
def unknown_starttag(self, tag, attrs):
484
if _debug: sys.stderr.write('start %s with %s\n' % (tag, attrs))
486
attrs = [(k.lower(), v) for k, v in attrs]
487
attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
489
# track xml:base and xml:lang
490
attrsD = FeedParserDict(attrs)
491
baseuri = attrsD.get('xml:base', attrsD.get('base')) or self.baseuri
492
self.baseuri = _urljoin(self.baseuri, baseuri)
493
lang = attrsD.get('xml:lang', attrsD.get('lang'))
495
# xml:lang could be explicitly set to '', we need to capture that
498
# if no xml:lang is specified, use parent lang
501
if tag in ('feed', 'rss', 'rdf:RDF'):
502
self.feeddata['language'] = lang
504
self.basestack.append(self.baseuri)
505
self.langstack.append(lang)
508
for prefix, uri in attrs:
509
if prefix.startswith('xmlns:'):
510
self.trackNamespace(prefix[6:], uri)
511
elif prefix == 'xmlns':
512
self.trackNamespace(None, uri)
514
# track inline content
515
if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
516
# element declared itself as escaped markup, but it isn't really
517
self.contentparams['type'] = 'application/xhtml+xml'
518
if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
519
# Note: probably shouldn't simply recreate localname here, but
520
# our namespace handling isn't actually 100% correct in cases where
521
# the feed redefines the default namespace (which is actually
522
# the usual case for inline content, thanks Sam), so here we
523
# cheat and just reconstruct the element based on localname
524
# because that compensates for the bugs in our namespace handling.
525
# This will horribly munge inline content with non-empty qnames,
526
# but nobody actually does that, so I'm not fixing it.
527
tag = tag.split(':')[-1]
528
return self.handle_data('<%s%s>' % (tag, ''.join([' %s="%s"' % t for t in attrs])), escape=0)
531
if tag.find(':') <> -1:
532
prefix, suffix = tag.split(':', 1)
534
prefix, suffix = '', tag
535
prefix = self.namespacemap.get(prefix, prefix)
537
prefix = prefix + '_'
539
# special hack for better tracking of empty textinput/image elements in illformed feeds
540
if (not prefix) and tag not in ('title', 'link', 'description', 'name'):
542
if (not prefix) and tag not in ('title', 'link', 'description', 'url', 'href', 'width', 'height'):
545
# call special handler (if defined) or default handler
546
methodname = '_start_' + prefix + suffix
548
method = getattr(self, methodname)
549
return method(attrsD)
550
except AttributeError:
551
return self.push(prefix + suffix, 1)
553
def unknown_endtag(self, tag):
554
if _debug: sys.stderr.write('end %s\n' % tag)
556
if tag.find(':') <> -1:
557
prefix, suffix = tag.split(':', 1)
559
prefix, suffix = '', tag
560
prefix = self.namespacemap.get(prefix, prefix)
562
prefix = prefix + '_'
564
# call special handler (if defined) or default handler
565
methodname = '_end_' + prefix + suffix
567
method = getattr(self, methodname)
569
except AttributeError:
570
self.pop(prefix + suffix)
572
# track inline content
573
if self.incontent and self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
574
# element declared itself as escaped markup, but it isn't really
575
self.contentparams['type'] = 'application/xhtml+xml'
576
if self.incontent and self.contentparams.get('type') == 'application/xhtml+xml':
577
tag = tag.split(':')[-1]
578
self.handle_data('</%s>' % tag, escape=0)
580
# track xml:base and xml:lang going out of scope
583
if self.basestack and self.basestack[-1]:
584
self.baseuri = self.basestack[-1]
587
if self.langstack: # and (self.langstack[-1] is not None):
588
self.lang = self.langstack[-1]
590
def handle_charref(self, ref):
591
# called for each character reference, e.g. for ' ', ref will be '160'
592
if not self.elementstack: return
594
if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'):
601
text = unichr(c).encode('utf-8')
602
self.elementstack[-1][2].append(text)
604
def handle_entityref(self, ref):
605
# called for each entity reference, e.g. for '©', ref will be 'copy'
606
if not self.elementstack: return
607
if _debug: sys.stderr.write('entering handle_entityref with %s\n' % ref)
608
if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
611
# entity resolution graciously donated by Aaron Swartz
613
import htmlentitydefs
614
if hasattr(htmlentitydefs, 'name2codepoint'): # requires Python 2.3
615
return htmlentitydefs.name2codepoint[k]
616
k = htmlentitydefs.entitydefs[k]
617
if k.startswith('&#') and k.endswith(';'):
618
return int(k[2:-1]) # not in latin-1
621
except KeyError: text = '&%s;' % ref
622
else: text = unichr(name2cp(ref)).encode('utf-8')
623
self.elementstack[-1][2].append(text)
625
def handle_data(self, text, escape=1):
    # called for each block of plain text, i.e. outside of any tag and
    # not containing any character or entity references
    if not self.elementstack: return
    if escape and self.contentparams.get('type') == 'application/xhtml+xml':
        # inline XHTML is re-serialized later, so metacharacters must be escaped
        text = _xmlescape(text)
    # append to the text pieces of the innermost open element
    self.elementstack[-1][2].append(text)
633
def handle_comment(self, text):
    # called for each comment, e.g. <!-- insert message here -->
    pass

def handle_pi(self, text):
    # called for each processing instruction, e.g. <?instruction>
    pass

def handle_decl(self, text):
    # called for each declaration, e.g. <!DOCTYPE ...>; ignored
    pass
644
def parse_declaration(self, i):
645
# override internal declaration handler to handle CDATA blocks
646
if _debug: sys.stderr.write('entering parse_declaration\n')
647
if self.rawdata[i:i+9] == '<![CDATA[':
648
k = self.rawdata.find(']]>', i)
649
if k == -1: k = len(self.rawdata)
650
self.handle_data(_xmlescape(self.rawdata[i+9:k]), 0)
653
k = self.rawdata.find('>', i)
656
def mapContentType(self, contentType):
    """Normalize Atom shorthand type names ('text', 'html', 'xhtml')
    to their full MIME types; any other value passes through lowercased."""
    contentType = contentType.lower()
    if contentType == 'text':
        contentType = 'text/plain'
    elif contentType == 'html':
        contentType = 'text/html'
    elif contentType == 'xhtml':
        contentType = 'application/xhtml+xml'
    return contentType
666
def trackNamespace(self, prefix, uri):
    """Record a namespace declaration and sniff the feed version from it."""
    loweruri = uri.lower()
    # certain namespace URIs identify the feed format outright
    if (prefix, loweruri) == (None, 'http://my.netscape.com/rdf/simple/0.9/') and not self.version:
        self.version = 'rss090'
    if loweruri == 'http://purl.org/rss/1.0/' and not self.version:
        self.version = 'rss10'
    if loweruri == 'http://www.w3.org/2005/atom' and not self.version:
        self.version = 'atom10'
    if loweruri.find('backend.userland.com/rss') != -1:
        # match any backend.userland.com namespace
        uri = 'http://backend.userland.com/rss'
    if self._matchnamespaces.has_key(loweruri):
        # known namespace: map the feed's prefix to our canonical prefix
        self.namespacemap[prefix] = self._matchnamespaces[loweruri]
        self.namespacesInUse[self._matchnamespaces[loweruri]] = uri
    else:
        self.namespacesInUse[prefix or ''] = uri
684
def resolveURI(self, uri):
    # resolve a possibly-relative URI against the current xml:base
    return _urljoin(self.baseuri or '', uri)

def decodeEntities(self, element, data):
    # hook for subclasses (the loose SGML parser) to decode entities;
    # the strict parser receives already-decoded data, so pass through
    return data

def push(self, element, expectingText):
    # open a new element: [name, whether text is expected, text pieces]
    self.elementstack.append([element, expectingText, []])
693
def pop(self, element, stripWhitespace=1):
694
if not self.elementstack: return
695
if self.elementstack[-1][0] != element: return
697
element, expectingText, pieces = self.elementstack.pop()
698
output = ''.join(pieces)
700
output = output.strip()
701
if not expectingText: return output
703
# decode base64 content
704
if base64 and self.contentparams.get('base64', 0):
706
output = base64.decodestring(output)
707
except binascii.Error:
709
except binascii.Incomplete:
712
# resolve relative URIs
713
if (element in self.can_be_relative_uri) and output:
714
output = self.resolveURI(output)
716
# decode entities within embedded markup
717
if not self.contentparams.get('base64', 0):
718
output = self.decodeEntities(element, output)
720
# remove temporary cruft from contentparams
722
del self.contentparams['mode']
726
del self.contentparams['base64']
730
# resolve relative URIs within embedded markup
731
if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types:
732
if element in self.can_contain_relative_uris:
733
output = _resolveRelativeURIs(output, self.baseuri, self.encoding)
735
# sanitize embedded markup
736
if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types:
737
if element in self.can_contain_dangerous_markup:
738
output = sanitizeHTML(output, self.encoding)
740
if self.encoding and type(output) != type(u''):
742
output = unicode(output, self.encoding)
743
except (SystemExit, KeyboardInterrupt):
748
# categories/tags/keywords/whatever are handled in _end_category
749
if element == 'category':
752
# store output in appropriate place(s)
753
if self.inentry and not self.insource:
754
if element == 'content':
755
self.entries[-1].setdefault(element, [])
756
contentparams = copy.deepcopy(self.contentparams)
757
contentparams['value'] = output
758
self.entries[-1][element].append(contentparams)
759
elif element == 'link':
760
self.entries[-1][element] = output
762
self.entries[-1]['links'][-1]['href'] = output
764
if element == 'description':
766
self.entries[-1][element] = output
768
contentparams = copy.deepcopy(self.contentparams)
769
contentparams['value'] = output
770
self.entries[-1][element + '_detail'] = contentparams
771
elif (self.infeed or self.insource) and (not self.intextinput) and (not self.inimage):
772
context = self._getContext()
773
if element == 'description':
775
context[element] = output
776
if element == 'link':
777
context['links'][-1]['href'] = output
779
contentparams = copy.deepcopy(self.contentparams)
780
contentparams['value'] = output
781
context[element + '_detail'] = contentparams
784
def pushContent(self, tag, attrsD, defaultContentType, expectingText):
786
self.contentparams = FeedParserDict({
787
'type': self.mapContentType(attrsD.get('type', defaultContentType)),
788
'language': self.lang,
789
'base': self.baseuri})
790
self.contentparams['base64'] = self._isBase64(attrsD, self.contentparams)
791
self.push(tag, expectingText)
793
def popContent(self, tag):
794
value = self.pop(tag)
796
self.contentparams.clear()
799
def _mapToStandardPrefix(self, name):
800
colonpos = name.find(':')
802
prefix = name[:colonpos]
803
suffix = name[colonpos+1:]
804
prefix = self.namespacemap.get(prefix, prefix)
805
name = prefix + ':' + suffix
808
def _getAttribute(self, attrsD, name):
809
return attrsD.get(self._mapToStandardPrefix(name))
811
def _isBase64(self, attrsD, contentparams):
812
if attrsD.get('mode', '') == 'base64':
814
# We should never assume text is base64 --NN
817
if self.contentparams['type'].startswith('text/'):
819
if self.contentparams['type'].endswith('+xml'):
821
if self.contentparams['type'].endswith('/xml'):
825
def _itsAnHrefDamnIt(self, attrsD):
826
href = attrsD.get('url', attrsD.get('uri', attrsD.get('href', None)))
836
attrsD['href'] = href
839
def _save(self, key, value):
840
context = self._getContext()
841
context.setdefault(key, value)
843
def _start_rss(self, attrsD):
844
versionmap = {'0.91': 'rss091u',
849
attr_version = attrsD.get('version', '')
850
version = versionmap.get(attr_version)
852
self.version = version
853
elif attr_version.startswith('2.'):
854
self.version = 'rss20'
858
def _start_dlhottitles(self, attrsD):
859
self.version = 'hotrss'
861
def _start_channel(self, attrsD):
863
self._cdf_common(attrsD)
864
_start_feedinfo = _start_channel
866
def _cdf_common(self, attrsD):
867
if attrsD.has_key('lastmod'):
868
self._start_modified({})
869
self.elementstack[-1][-1] = attrsD['lastmod']
871
if attrsD.has_key('href'):
873
self.elementstack[-1][-1] = attrsD['href']
876
def _start_feed(self, attrsD):
878
versionmap = {'0.1': 'atom01',
882
attr_version = attrsD.get('version')
883
version = versionmap.get(attr_version)
885
self.version = version
887
self.version = 'atom'
889
def _end_channel(self):
891
_end_feed = _end_channel
893
def _start_image(self, attrsD):
895
self.push('image', 0)
896
context = self._getContext()
897
context.setdefault('image', FeedParserDict())
899
def _end_image(self):
903
def _start_textinput(self, attrsD):
905
self.push('textinput', 0)
906
context = self._getContext()
907
context.setdefault('textinput', FeedParserDict())
908
_start_textInput = _start_textinput
910
def _end_textinput(self):
911
self.pop('textinput')
913
_end_textInput = _end_textinput
915
def _start_author(self, attrsD):
917
self.push('author', 1)
918
_start_managingeditor = _start_author
919
_start_dc_author = _start_author
920
_start_dc_creator = _start_author
921
_start_itunes_author = _start_author
923
def _end_author(self):
926
self._sync_author_detail()
927
_end_managingeditor = _end_author
928
_end_dc_author = _end_author
929
_end_dc_creator = _end_author
930
_end_itunes_author = _end_author
932
def _start_itunes_owner(self, attrsD):
934
self.push('publisher', 0)
936
def _end_itunes_owner(self):
937
self.pop('publisher')
939
self._sync_author_detail('publisher')
941
def _start_contributor(self, attrsD):
942
self.incontributor = 1
943
context = self._getContext()
944
context.setdefault('contributors', [])
945
context['contributors'].append(FeedParserDict())
946
self.push('contributor', 0)
948
def _end_contributor(self):
949
self.pop('contributor')
950
self.incontributor = 0
952
def _start_dc_contributor(self, attrsD):
953
self.incontributor = 1
954
context = self._getContext()
955
context.setdefault('contributors', [])
956
context['contributors'].append(FeedParserDict())
959
def _end_dc_contributor(self):
961
self.incontributor = 0
963
def _start_name(self, attrsD):
965
_start_itunes_name = _start_name
968
value = self.pop('name')
970
self._save_author('name', value, 'publisher')
972
self._save_author('name', value)
973
elif self.incontributor:
974
self._save_contributor('name', value)
975
elif self.intextinput:
976
context = self._getContext()
977
context['textinput']['name'] = value
978
_end_itunes_name = _end_name
980
def _start_width(self, attrsD):
981
self.push('width', 0)
983
def _end_width(self):
984
value = self.pop('width')
987
except (SystemExit, KeyboardInterrupt):
992
context = self._getContext()
993
context['image']['width'] = value
995
def _start_height(self, attrsD):
996
self.push('height', 0)
998
def _end_height(self):
999
value = self.pop('height')
1002
except (SystemExit, KeyboardInterrupt):
1007
context = self._getContext()
1008
context['image']['height'] = value
1010
def _start_url(self, attrsD):
1011
self.push('href', 1)
1012
_start_homepage = _start_url
1013
_start_uri = _start_url
1016
value = self.pop('href')
1018
self._save_author('href', value)
1019
elif self.incontributor:
1020
self._save_contributor('href', value)
1022
context = self._getContext()
1023
context['image']['href'] = value
1024
elif self.intextinput:
1025
context = self._getContext()
1026
context['textinput']['link'] = value
1027
_end_homepage = _end_url
1030
def _start_email(self, attrsD):
1031
self.push('email', 0)
1032
_start_itunes_email = _start_email
1034
def _end_email(self):
1035
value = self.pop('email')
1036
if self.inpublisher:
1037
self._save_author('email', value, 'publisher')
1039
self._save_author('email', value)
1040
elif self.incontributor:
1041
self._save_contributor('email', value)
1042
_end_itunes_email = _end_email
1044
def _getContext(self):
1046
context = self.sourcedata
1048
context = self.entries[-1]
1050
context = self.feeddata
1053
def _save_author(self, key, value, prefix='author'):
    # store one field of the author/publisher detail dict, then rebuild
    # the flat '<prefix>' string from the structured detail
    context = self._getContext()
    context.setdefault(prefix + '_detail', FeedParserDict())
    context[prefix + '_detail'][key] = value
    self._sync_author_detail()

def _save_contributor(self, key, value):
    # store one field of the most recently opened contributor record
    context = self._getContext()
    context.setdefault('contributors', [FeedParserDict()])
    context['contributors'][-1][key] = value
1064
def _sync_author_detail(self, key='author'):
1065
context = self._getContext()
1066
detail = context.get('%s_detail' % key)
1068
name = detail.get('name')
1069
email = detail.get('email')
1071
context[key] = '%s (%s)' % (name, email)
1075
context[key] = email
1077
author = context.get(key)
1078
if not author: return
1079
emailmatch = re.search(r'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))''', author)
1080
if not emailmatch: return
1081
email = emailmatch.group(0)
1082
# probably a better way to do the following, but it passes all the tests
1083
author = author.replace(email, '')
1084
author = author.replace('()', '')
1085
author = author.strip()
1086
if author and (author[0] == '('):
1088
if author and (author[-1] == ')'):
1089
author = author[:-1]
1090
author = author.strip()
1091
context.setdefault('%s_detail' % key, FeedParserDict())
1092
context['%s_detail' % key]['name'] = author
1093
context['%s_detail' % key]['email'] = email
1095
def _start_subtitle(self, attrsD):
1096
self.pushContent('subtitle', attrsD, 'text/plain', 1)
1097
_start_tagline = _start_subtitle
1098
_start_itunes_subtitle = _start_subtitle
1100
def _end_subtitle(self):
1101
self.popContent('subtitle')
1102
_end_tagline = _end_subtitle
1103
_end_itunes_subtitle = _end_subtitle
1105
def _start_rights(self, attrsD):
1106
self.pushContent('rights', attrsD, 'text/plain', 1)
1107
_start_dc_rights = _start_rights
1108
_start_copyright = _start_rights
1110
def _end_rights(self):
1111
self.popContent('rights')
1112
_end_dc_rights = _end_rights
1113
_end_copyright = _end_rights
1115
def _start_item(self, attrsD):
    '''Open an RSS <item> / Atom <entry> / CDF item: append a fresh
    entry dict and reset per-entry state.'''
    self.entries.append(FeedParserDict())
    self.push('item', 0)
    self.inentry = 1
    self.guidislink = 0
    id = self._getAttribute(attrsD, 'rdf:about')
    if id:
        context = self._getContext()
        context['id'] = id
    self._cdf_common(attrsD)
_start_entry = _start_item
_start_product = _start_item
def _end_item(self):
1131
_end_entry = _end_item
1133
def _start_dc_language(self, attrsD):
1134
self.push('language', 1)
1135
_start_language = _start_dc_language
1137
def _end_dc_language(self):
1138
self.lang = self.pop('language')
1139
_end_language = _end_dc_language
1141
def _start_dc_publisher(self, attrsD):
1142
self.push('publisher', 1)
1143
_start_webmaster = _start_dc_publisher
1145
def _end_dc_publisher(self):
1146
self.pop('publisher')
1147
self._sync_author_detail('publisher')
1148
_end_webmaster = _end_dc_publisher
1150
def _start_published(self, attrsD):
1151
self.push('published', 1)
1152
_start_dcterms_issued = _start_published
1153
_start_issued = _start_published
1155
def _end_published(self):
    '''Close a published/issued date and store its parsed form as well.'''
    raw = self.pop('published')
    self._save('published_parsed', _parse_date(raw))
_end_dcterms_issued = _end_published
_end_issued = _end_published
def _start_updated(self, attrsD):
1162
self.push('updated', 1)
1163
_start_modified = _start_updated
1164
_start_dcterms_modified = _start_updated
1165
_start_pubdate = _start_updated
1166
_start_dc_date = _start_updated
1168
def _end_updated(self):
    '''Close an updated/modified date; store both raw and parsed forms.'''
    raw = self.pop('updated')
    self._save('updated_parsed', _parse_date(raw))
_end_modified = _end_updated
_end_dcterms_modified = _end_updated
_end_pubdate = _end_updated
_end_dc_date = _end_updated
def _start_created(self, attrsD):
1178
self.push('created', 1)
1179
_start_dcterms_created = _start_created
1181
def _end_created(self):
    '''Close a created date and store its parsed form as well.'''
    raw = self.pop('created')
    self._save('created_parsed', _parse_date(raw))
_end_dcterms_created = _end_created
def _start_expirationdate(self, attrsD):
1187
self.push('expired', 1)
1189
def _end_expirationdate(self):
    '''Close an expirationDate element and store the parsed date.'''
    self._save('expired_parsed', _parse_date(self.pop('expired')))
def _start_cc_license(self, attrsD):
1193
self.push('license', 1)
1194
value = self._getAttribute(attrsD, 'rdf:resource')
1196
self.elementstack[-1][2].append(value)
1199
def _start_creativecommons_license(self, attrsD):
1200
self.push('license', 1)
1202
def _end_creativecommons_license(self):
1205
def _addTag(self, term, scheme, label):
    '''Append a category tag to the current context, skipping entirely
    empty tags and exact duplicates.'''
    context = self._getContext()
    tags = context.setdefault('tags', [])
    if (not term) and (not scheme) and (not label):
        return
    value = FeedParserDict({'term': term, 'scheme': scheme, 'label': label})
    if value not in tags:
        tags.append(value)
def _start_category(self, attrsD):
    '''Open a category element; term/scheme/label may come from
    attributes (Atom) or the domain attribute (RSS).'''
    if _debug: sys.stderr.write('entering _start_category with %s\n' % repr(attrsD))
    term = attrsD.get('term')
    scheme = attrsD.get('scheme', attrsD.get('domain'))
    label = attrsD.get('label')
    self._addTag(term, scheme, label)
    self.push('category', 1)
_start_dc_subject = _start_category
_start_keywords = _start_category
_start_media_category = _start_category
def _end_itunes_keywords(self):
1225
for term in self.pop('itunes_keywords').split():
1226
self._addTag(term, 'http://www.itunes.com/', None)
1228
def _start_itunes_category(self, attrsD):
1229
self._addTag(attrsD.get('text'), 'http://www.itunes.com/', None)
1230
self.push('category', 1)
1232
def _end_category(self):
1233
value = self.pop('category')
1234
if not value: return
1235
context = self._getContext()
1236
tags = context['tags']
1237
if value and len(tags) and not tags[-1]['term']:
1238
tags[-1]['term'] = value
1240
self._addTag(value, None, None)
1241
_end_dc_subject = _end_category
1242
_end_keywords = _end_category
1243
_end_itunes_category = _end_category
1244
_end_media_category = _end_category
1246
def _start_cloud(self, attrsD):
    '''Store the RSS <cloud> element's attributes on the context.'''
    self._getContext()['cloud'] = FeedParserDict(attrsD)
def _start_link(self, attrsD):
    '''Open a link element: record it in context['links'], treat
    rel=enclosure links as enclosures, and promote the alternate
    HTML link to context['link'].'''
    attrsD.setdefault('rel', 'alternate')
    attrsD.setdefault('type', 'text/html')
    attrsD = self._itsAnHrefDamnIt(attrsD)
    if attrsD.has_key('href'):
        attrsD['href'] = self.resolveURI(attrsD['href'])
    expectingText = self.infeed or self.inentry or self.insource
    context = self._getContext()
    context.setdefault('links', [])
    context['links'].append(FeedParserDict(attrsD))
    if attrsD['rel'] == 'enclosure':
        self._start_enclosure(attrsD)
    if attrsD.has_key('href'):
        # a link with an href carries no element text
        expectingText = 0
        if (attrsD.get('rel') == 'alternate') and (self.mapContentType(attrsD.get('type')) in self.html_types):
            context['link'] = attrsD['href']
    else:
        self.push('link', expectingText)
_start_producturl = _start_link
def _end_link(self):
1270
value = self.pop('link')
1271
context = self._getContext()
1272
if self.intextinput:
1273
context['textinput']['link'] = value
1275
context['image']['link'] = value
1276
_end_producturl = _end_link
1278
def _start_guid(self, attrsD):
1279
self.guidislink = (attrsD.get('ispermalink', 'true') == 'true')
1282
def _end_guid(self):
1283
value = self.pop('id')
1284
self._save('guidislink', self.guidislink and not self._getContext().has_key('link'))
1286
# guid acts as link, but only if 'ispermalink' is not present or is 'true',
1287
# and only if the item doesn't already have a link element
1288
self._save('link', value)
1290
def _start_title(self, attrsD):
1291
self.pushContent('title', attrsD, 'text/plain', self.infeed or self.inentry or self.insource)
1292
_start_dc_title = _start_title
1293
_start_media_title = _start_title
1295
def _end_title(self):
1296
value = self.popContent('title')
1297
context = self._getContext()
1298
if self.intextinput:
1299
context['textinput']['title'] = value
1301
context['image']['title'] = value
1302
_end_dc_title = _end_title
1303
_end_media_title = _end_title
1305
def _start_description(self, attrsD):
1306
context = self._getContext()
1307
if context.has_key('summary'):
1308
self._summaryKey = 'content'
1309
self._start_content(attrsD)
1311
self.pushContent('description', attrsD, 'text/html', self.infeed or self.inentry or self.insource)
1313
def _start_abstract(self, attrsD):
1314
self.pushContent('description', attrsD, 'text/plain', self.infeed or self.inentry or self.insource)
1316
def _end_description(self):
1317
if self._summaryKey == 'content':
1320
value = self.popContent('description')
1321
context = self._getContext()
1322
if self.intextinput:
1323
context['textinput']['description'] = value
1325
context['image']['description'] = value
1326
self._summaryKey = None
1327
_end_abstract = _end_description
1329
def _start_info(self, attrsD):
1330
self.pushContent('info', attrsD, 'text/plain', 1)
1331
_start_feedburner_browserfriendly = _start_info
1333
def _end_info(self):
1334
self.popContent('info')
1335
_end_feedburner_browserfriendly = _end_info
1337
def _start_generator(self, attrsD):
    '''Open a generator element; attributes (if any) become the
    generator_detail dict, with the href resolved.'''
    if attrsD:
        attrsD = self._itsAnHrefDamnIt(attrsD)
        if attrsD.has_key('href'):
            attrsD['href'] = self.resolveURI(attrsD['href'])
    self._getContext()['generator_detail'] = FeedParserDict(attrsD)
    self.push('generator', 1)
def _end_generator(self):
1346
value = self.pop('generator')
1347
context = self._getContext()
1348
if context.has_key('generator_detail'):
1349
context['generator_detail']['name'] = value
1351
def _start_admin_generatoragent(self, attrsD):
    '''Handle admin:generatorAgent; the value is on rdf:resource, so the
    element is pushed and immediately popped.'''
    self.push('generator', 1)
    value = self._getAttribute(attrsD, 'rdf:resource')
    if value:
        self.elementstack[-1][2].append(value)
    self.pop('generator')
    self._getContext()['generator_detail'] = FeedParserDict({'href': value})
def _start_admin_errorreportsto(self, attrsD):
1360
self.push('errorreportsto', 1)
1361
value = self._getAttribute(attrsD, 'rdf:resource')
1363
self.elementstack[-1][2].append(value)
1364
self.pop('errorreportsto')
1366
def _start_summary(self, attrsD):
1367
context = self._getContext()
1368
if context.has_key('summary'):
1369
self._summaryKey = 'content'
1370
self._start_content(attrsD)
1372
self._summaryKey = 'summary'
1373
self.pushContent(self._summaryKey, attrsD, 'text/plain', 1)
1374
_start_itunes_summary = _start_summary
1376
def _end_summary(self):
1377
if self._summaryKey == 'content':
1380
self.popContent(self._summaryKey or 'summary')
1381
self._summaryKey = None
1382
_end_itunes_summary = _end_summary
1384
def _start_enclosure(self, attrsD):
    '''Enter an enclosure/media:content element and record it in the
    context's enclosures list.'''
    self.inenclosure += 1
    attrsD = self._itsAnHrefDamnIt(attrsD)
    self._getContext().setdefault('enclosures', []).append(FeedParserDict(attrsD))
_start_media_content = _start_enclosure
def _end_enclosure(self):
1391
self.inenclosure -= 1
1392
_end_media_content = _end_enclosure
1394
def _start_media_thumbnail(self, attrsD):
    '''Attach a media:thumbnail to the current enclosure or entry.'''
    self.push('media:thumbnail', 1)
    if self.inenclosure:
        self.entries[-1]['enclosures'][-1]['thumbnail'] = FeedParserDict(attrsD)
    else:
        self.entries[-1]['thumbnail'] = FeedParserDict(attrsD)
def _end_media_thumbnail(self):
1403
self.pop('media:thumbnail')
1405
def _start_media_text(self,attrsD):
1406
self.push('media:text',1)
1408
def _end_media_text(self):
1409
value = self.pop('media:text')
1411
if self.inenclosure:
1412
self.entries[-1]['enclosures'][-1]['text'] = value
1414
self.entries[-1]['text'] = value
1416
def _start_media_people(self,attrsD):
1417
self.push('media:people',1)
1419
self.peoplerole = attrsD['role']
1420
except (SystemExit, KeyboardInterrupt):
1423
self.peoplerole = 'unknown'
1425
def _end_media_people(self):
1426
value = self.pop('media:people').split('|')
1428
if self.inenclosure:
1429
self.entries[-1]['enclosures'][-1].setdefault('roles', {})
1430
self.entries[-1]['enclosures'][-1].roles[self.peoplerole]=value
1432
self.entries[-1].setdefault('roles', {})
1433
self.entries[-1].roles[self.peoplerole]=value
1435
def _start_dtv_startnback(self,attrsD):
1436
self.push('dtv:startnback',1)
1438
def _end_dtv_startnback(self):
1439
self.feeddata['startnback'] = self.pop('dtv:startnback')
1441
def _start_dtv_librarylink(self,attrsD):
1442
self.push('dtv:librarylink',1)
1444
def _end_dtv_librarylink(self):
1445
self.feeddata['librarylink'] = self.pop('dtv:librarylink')
1447
def _start_dtv_releasedate(self,attrsD):
1448
self.push('dtv:releasedate',1)
1450
def _end_dtv_releasedate(self):
    '''Store dtv:releasedate (raw and parsed) on the current enclosure
    or entry.'''
    value = self.pop('dtv:releasedate')
    if self.inenclosure:
        self.entries[-1]['enclosures'][-1]['releasedate'] = value
        self.entries[-1]['enclosures'][-1]['releasedate_parsed'] = _parse_date(value)
    else:
        self.entries[-1]['releasedate'] = value
        self.entries[-1]['releasedate_parsed'] = _parse_date(value)
def _start_dtv_paymentlink(self,attrsD):
1462
self.contentparams['mode'] = 'xml'
1463
self.contentparams['type'] = 'application/xhtml+xml'
1464
self.push('dtv:paymentlink',1)
1466
if attrsD.has_key('url'):
1467
if self.inenclosure:
1468
self.entries[-1]['enclosures'][-1]['payment_url'] = attrsD['url']
1470
self.entries[-1]['payment_url'] = attrsD['url']
1472
def _end_dtv_paymentlink(self):
    '''Sanitize and store the dtv:paymentlink HTML on the current
    enclosure or entry.'''
    value = sanitizeHTML(self.pop('dtv:paymentlink'), self.encoding)
    self.contentparams.clear()
    if self.inenclosure:
        self.entries[-1]['enclosures'][-1]['payment_html'] = value
    else:
        self.entries[-1]['payment_html'] = value
def _start_source(self, attrsD):
1485
def _end_source(self):
1487
self._getContext()['source'] = copy.deepcopy(self.sourcedata)
1488
self.sourcedata.clear()
1490
def _start_content(self, attrsD):
1491
self.pushContent('content', attrsD, 'text/plain', 1)
1492
src = attrsD.get('src')
1494
self.contentparams['src'] = src
1495
self.push('content', 1)
1497
def _start_prodlink(self, attrsD):
1498
self.pushContent('content', attrsD, 'text/html', 1)
1500
def _start_body(self, attrsD):
1501
self.pushContent('content', attrsD, 'application/xhtml+xml', 1)
1502
_start_xhtml_body = _start_body
1504
def _start_content_encoded(self, attrsD):
1505
self.pushContent('content', attrsD, 'text/html', 1)
1506
_start_fullitem = _start_content_encoded
1508
def _end_content(self):
1509
copyToDescription = self.mapContentType(self.contentparams.get('type')) in (['text/plain'] + self.html_types)
1510
value = self.popContent('content')
1511
if copyToDescription:
1512
self._save('description', value)
1513
_end_body = _end_content
1514
_end_xhtml_body = _end_content
1515
_end_content_encoded = _end_content
1516
_end_fullitem = _end_content
1517
_end_prodlink = _end_content
1519
def _start_itunes_image(self, attrsD):
    '''Store an iTunes image href on the current context.'''
    self.push('itunes_image', 0)
    self._getContext()['image'] = FeedParserDict({'href': attrsD.get('href')})
def _start_itunes_link(self, attrsD):
    '''Store an iTunes link href on the current context.'''
    self.push('itunes_link', 0)
    self._getContext()['link'] = FeedParserDict({'href': attrsD.get('href')})
def _end_itunes_block(self):
1528
value = self.pop('itunes_block', 0)
1529
self._getContext()['itunes_block'] = (value == 'yes') and 1 or 0
1531
def _end_itunes_explicit(self):
1532
value = self.pop('itunes_explicit', 0)
1533
self._getContext()['itunes_explicit'] = (value == 'yes') and 1 or 0
1536
class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler):
    '''XML-parser-backed (strict) feed parser; falls back to the loose
    parser elsewhere when the document is not well-formed.'''

    def __init__(self, baseuri, baselang, encoding):
        if _debug: sys.stderr.write('trying StrictFeedParser\n')
        xml.sax.handler.ContentHandler.__init__(self)
        _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
        self.bozo = 0
        self.exc = None

    def startPrefixMapping(self, prefix, uri):
        self.trackNamespace(prefix, uri)

    def startElementNS(self, name, qname, attrs):
        namespace, localname = name
        lowernamespace = str(namespace or '').lower()
        if lowernamespace.find('backend.userland.com/rss') != -1:
            # match any backend.userland.com namespace
            namespace = 'http://backend.userland.com/rss'
            lowernamespace = namespace
        if qname and qname.find(':') > 0:
            givenprefix = qname.split(':')[0]
        else:
            givenprefix = None
        prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
        if givenprefix and (prefix == None or (prefix == '' and lowernamespace == '')) and not self.namespacesInUse.has_key(givenprefix):
            raise UndeclaredNamespace("'%s' is not associated with a namespace" % givenprefix)
        if prefix:
            localname = prefix + ':' + localname
        localname = str(localname).lower()
        if _debug: sys.stderr.write('startElementNS: qname = %s, namespace = %s, givenprefix = %s, prefix = %s, attrs = %s, localname = %s\n' % (qname, namespace, givenprefix, prefix, attrs.items(), localname))

        # qname implementation is horribly broken in Python 2.1 (it
        # doesn't report any), and slightly broken in Python 2.2 (it
        # doesn't report the xml: namespace). So we match up namespaces
        # with a known list first, and then possibly override them with
        # the qnames the SAX parser gives us (if indeed it gives us any
        # at all). Thanks to MatejC for helping me test this and
        # tirelessly telling me that it didn't work yet.
        attrsD = {}
        for (namespace, attrlocalname), attrvalue in attrs._attrs.items():
            lowernamespace = (namespace or '').lower()
            prefix = self._matchnamespaces.get(lowernamespace, '')
            if prefix:
                attrlocalname = prefix + ':' + attrlocalname
            attrsD[str(attrlocalname).lower()] = attrvalue
        for qname in attrs.getQNames():
            attrsD[str(qname).lower()] = attrs.getValueByQName(qname)
        self.unknown_starttag(localname, attrsD.items())

    def characters(self, text):
        self.handle_data(text)

    def endElementNS(self, name, qname):
        namespace, localname = name
        lowernamespace = str(namespace or '').lower()
        if qname and qname.find(':') > 0:
            givenprefix = qname.split(':')[0]
        else:
            givenprefix = ''
        prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
        if prefix:
            localname = prefix + ':' + localname
        localname = str(localname).lower()
        self.unknown_endtag(localname)

    def error(self, exc):
        # non-fatal parse error: mark the feed as bozo but keep going
        self.bozo = 1
        self.exc = exc

    def fatalError(self, exc):
        self.error(exc)
        raise exc
class _BaseHTMLProcessor(sgmllib.SGMLParser):
    '''SGML-based HTML walker that reconstructs the markup it is fed.

    Subclasses override the handle_*/unknown_* callbacks to filter or
    rewrite markup; output() returns the accumulated result.'''

    # tags that may not have an end tag in HTML
    elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
      'img', 'input', 'isindex', 'link', 'meta', 'param']

    def __init__(self, encoding):
        self.encoding = encoding
        if _debug: sys.stderr.write('entering BaseHTMLProcessor, encoding=%s\n' % self.encoding)
        sgmllib.SGMLParser.__init__(self)

    def reset(self):
        self.pieces = []
        sgmllib.SGMLParser.reset(self)

    def _shorttag_replace(self, match):
        tag = match.group(1)
        if tag in self.elements_no_end_tag:
            return '<' + tag + ' />'
        else:
            return '<' + tag + '></' + tag + '>'

    def feed(self, data):
        data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'&lt;!\1', data)
        #data = re.sub(r'<(\S+?)\s*?/>', self._shorttag_replace, data) # bug [ 1399464 ] Bad regexp for _shorttag_replace
        data = re.sub(r'<([^<\s]+?)\s*/>', self._shorttag_replace, data)
        data = data.replace('&#39;', "'")
        data = data.replace('&#34;', '"')
        if self.encoding and type(data) == type(u''):
            data = data.encode(self.encoding)
        sgmllib.SGMLParser.feed(self, data)

    def normalize_attrs(self, attrs):
        # utility method to be called by descendants
        attrs = [(k.lower(), v) for k, v in attrs]
        attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
        return attrs

    def parse_starttag(self, i):
        retval = sgmllib.SGMLParser.parse_starttag(self, i)
        try:
            # treat xhtml-style empty tags ('<br/>') as start + end tag
            if self.get_starttag_text()[-2:] == "/>":
                self.finish_endtag(self.lasttag)
        except (SystemExit, KeyboardInterrupt):
            raise
        except:
            pass
        return retval

    def unknown_starttag(self, tag, attrs):
        # called for each start tag
        # attrs is a list of (attr, value) tuples
        # e.g. for <pre class='screen'>, tag='pre', attrs=[('class', 'screen')]
        if _debug: sys.stderr.write('_BaseHTMLProcessor, unknown_starttag, tag=%s\n' % tag)
        uattrs = []
        # thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds
        for key, value in attrs:
            if type(value) != type(u''):
                value = unicode(value, self.encoding)
            uattrs.append((unicode(key, self.encoding), value))
        strattrs = u''.join([u' %s="%s"' % (key, value) for key, value in uattrs]).encode(self.encoding)
        if tag in self.elements_no_end_tag:
            self.pieces.append('<%(tag)s%(strattrs)s />' % locals())
        else:
            self.pieces.append('<%(tag)s%(strattrs)s>' % locals())

    def unknown_endtag(self, tag):
        # called for each end tag, e.g. for </pre>, tag will be 'pre'
        # Reconstruct the original end tag.
        if tag not in self.elements_no_end_tag:
            self.pieces.append("</%(tag)s>" % locals())

    def handle_charref(self, ref):
        # called for each character reference, e.g. for '&#160;', ref will be '160'
        # Reconstruct the original character reference.
        self.pieces.append('&#%(ref)s;' % locals())

    def handle_entityref(self, ref):
        # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
        # Reconstruct the original entity reference.
        self.pieces.append('&%(ref)s;' % locals())

    def handle_data(self, text):
        # called for each block of plain text, i.e. outside of any tag and
        # not containing any character or entity references
        # Store the original text verbatim.
        if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_text, text=%s\n' % text)
        self.pieces.append(text)

    def handle_comment(self, text):
        # called for each HTML comment, e.g. <!-- insert Javascript code here -->
        # Reconstruct the original comment.
        self.pieces.append('<!--%(text)s-->' % locals())

    def handle_pi(self, text):
        # called for each processing instruction, e.g. <?instruction>
        # Reconstruct original processing instruction.
        self.pieces.append('<?%(text)s>' % locals())

    def handle_decl(self, text):
        # called for the DOCTYPE, if present, e.g.
        # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
        #     "http://www.w3.org/TR/html4/loose.dtd">
        # Reconstruct original DOCTYPE
        self.pieces.append('<!%(text)s>' % locals())

    _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
    def _scan_name(self, i, declstartpos):
        rawdata = self.rawdata
        n = len(rawdata)
        if i == n:
            return None, -1
        m = self._new_declname_match(rawdata, i)
        if m:
            s = m.group()
            name = s.strip()
            if (i + len(s)) == n:
                return None, -1 # end of buffer
            return name.lower(), m.end()
        else:
            self.handle_data(rawdata)
#            self.updatepos(declstartpos, i)
            return None, -1

    def output(self):
        '''Return processed HTML as a single string'''
        return ''.join([str(p) for p in self.pieces])
class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor):
    '''sgmllib-based (loose) feed parser, used when strict XML parsing
    fails.'''

    def __init__(self, baseuri, baselang, encoding):
        sgmllib.SGMLParser.__init__(self)
        _FeedParserMixin.__init__(self, baseuri, baselang, encoding)

    def decodeEntities(self, element, data):
        # normalize numeric character references to their named forms
        data = data.replace('&#60;', '&lt;')
        data = data.replace('&#x3C;', '&lt;')
        data = data.replace('&#62;', '&gt;')
        data = data.replace('&#x3E;', '&gt;')
        data = data.replace('&#38;', '&amp;')
        data = data.replace('&#x26;', '&amp;')
        data = data.replace('&#34;', '&quot;')
        data = data.replace('&#x22;', '&quot;')
        data = data.replace('&#39;', '&apos;')
        data = data.replace('&#x27;', '&apos;')
        # for non-XML content types, fully decode the named entities too
        if self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
            data = data.replace('&lt;', '<')
            data = data.replace('&gt;', '>')
            data = data.replace('&amp;', '&')
            data = data.replace('&quot;', '"')
            data = data.replace('&apos;', "'")
        return data
class _RelativeURIResolver(_BaseHTMLProcessor):
    '''HTML processor that resolves relative URIs in (tag, attribute)
    pairs known to contain URIs.'''

    relative_uris = [('a', 'href'),
                     ('applet', 'codebase'),
                     ('area', 'href'),
                     ('blockquote', 'cite'),
                     ('body', 'background'),
                     ('del', 'cite'),
                     ('form', 'action'),
                     ('frame', 'longdesc'),
                     ('frame', 'src'),
                     ('iframe', 'longdesc'),
                     ('iframe', 'src'),
                     ('head', 'profile'),
                     ('img', 'longdesc'),
                     ('img', 'src'),
                     ('img', 'usemap'),
                     ('input', 'src'),
                     ('input', 'usemap'),
                     ('ins', 'cite'),
                     ('link', 'href'),
                     ('object', 'classid'),
                     ('object', 'codebase'),
                     ('object', 'data'),
                     ('object', 'usemap'),
                     ('q', 'cite'),
                     ('script', 'src')]

    def __init__(self, baseuri, encoding):
        _BaseHTMLProcessor.__init__(self, encoding)
        self.baseuri = baseuri

    def resolveURI(self, uri):
        return _urljoin(self.baseuri, uri)

    def unknown_starttag(self, tag, attrs):
        attrs = self.normalize_attrs(attrs)
        attrs = [(key, ((tag, key) in self.relative_uris) and self.resolveURI(value) or value) for key, value in attrs]
        _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)
def _resolveRelativeURIs(htmlSource, baseURI, encoding):
    '''Resolve all relative URIs in an HTML fragment against baseURI.'''
    if _debug: sys.stderr.write('entering _resolveRelativeURIs\n')
    p = _RelativeURIResolver(baseURI, encoding)
    p.feed(htmlSource)
    return p.output()
class _HTMLSanitizer(_BaseHTMLProcessor):
    '''HTML processor that drops all elements/attributes not on the
    whitelist, and all data inside script/applet elements.'''

    acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', 'big',
      'blockquote', 'br', 'button', 'caption', 'center', 'cite', 'code', 'col',
      'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'fieldset',
      'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input',
      'ins', 'kbd', 'label', 'legend', 'li', 'map', 'menu', 'ol', 'optgroup',
      'option', 'p', 'pre', 'q', 's', 'samp', 'select', 'small', 'span', 'strike',
      'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'tfoot', 'th',
      'thead', 'tr', 'tt', 'u', 'ul', 'var']

    acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
      'action', 'align', 'alt', 'axis', 'border', 'cellpadding', 'cellspacing',
      'char', 'charoff', 'charset', 'checked', 'cite', 'class', 'clear', 'cols',
      'colspan', 'color', 'compact', 'coords', 'datetime', 'dir', 'disabled',
      'enctype', 'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace',
      'id', 'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method',
      'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly',
      'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
      'span', 'src', 'start', 'summary', 'tabindex', 'title', 'type',
      'usemap', 'valign', 'value', 'vspace', 'width']

    unacceptable_elements_with_end_tag = ['script', 'applet']

    def reset(self):
        _BaseHTMLProcessor.reset(self)
        self.unacceptablestack = 0

    def unknown_starttag(self, tag, attrs):
        if not tag in self.acceptable_elements:
            if tag in self.unacceptable_elements_with_end_tag:
                self.unacceptablestack += 1
            return
        attrs = self.normalize_attrs(attrs)
        attrs = [(key, value) for key, value in attrs if key in self.acceptable_attributes]
        _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)

    def unknown_endtag(self, tag):
        if not tag in self.acceptable_elements:
            if tag in self.unacceptable_elements_with_end_tag:
                self.unacceptablestack -= 1
            return
        _BaseHTMLProcessor.unknown_endtag(self, tag)

    def handle_pi(self, text):
        pass

    def handle_decl(self, text):
        pass

    def handle_data(self, text):
        if not self.unacceptablestack:
            _BaseHTMLProcessor.handle_data(self, text)
def sanitizeHTML(htmlSource, encoding):
    '''Sanitize an HTML fragment through the whitelist sanitizer, then
    optionally clean it up with an installed Tidy interface.'''
    p = _HTMLSanitizer(encoding)
    p.feed(htmlSource)
    data = p.output()
    if TIDY_MARKUP:
        # loop through list of preferred Tidy interfaces looking for one that's installed,
        # then set up a common _tidy function to wrap the interface-specific API.
        _tidy = None
        for tidy_interface in PREFERRED_TIDY_INTERFACES:
            try:
                if tidy_interface == "uTidy":
                    from tidy import parseString as _utidy
                    def _tidy(data, **kwargs):
                        return str(_utidy(data, **kwargs))
                    break
                elif tidy_interface == "mxTidy":
                    from mx.Tidy import Tidy as _mxtidy
                    def _tidy(data, **kwargs):
                        nerrors, nwarnings, data, errordata = _mxtidy.tidy(data, **kwargs)
                        return data
                    break
            except (SystemExit, KeyboardInterrupt):
                raise
            except:
                pass
        if _tidy:
            utf8 = type(data) == type(u'')
            if utf8:
                data = data.encode('utf-8')
            data = _tidy(data, output_xhtml=1, numeric_entities=1, wrap=0, char_encoding="utf8")
            if utf8:
                data = unicode(data, 'utf-8')
            # strip everything outside the body element
            if data.count('<body'):
                data = data.split('<body', 1)[1]
                if data.count('>'):
                    data = data.split('>', 1)[1]
            if data.count('</body'):
                data = data.split('</body', 1)[0]
    data = data.strip().replace('\r\n', '\n')
    return data
class _FeedURLHandler(urllib2.HTTPDigestAuthHandler, urllib2.HTTPRedirectHandler, urllib2.HTTPDefaultErrorHandler):
    '''urllib2 handler that preserves the HTTP status code on the
    returned stream and retries basic-auth failures with digest auth.'''

    def http_error_default(self, req, fp, code, msg, headers):
        if ((code / 100) == 3) and (code != 304):
            return self.http_error_302(req, fp, code, msg, headers)
        infourl = urllib.addinfourl(fp, headers, req.get_full_url())
        infourl.status = code
        return infourl

    def http_error_302(self, req, fp, code, msg, headers):
        if headers.dict.has_key('location'):
            infourl = urllib2.HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers)
        else:
            infourl = urllib.addinfourl(fp, headers, req.get_full_url())
        if not hasattr(infourl, 'status'):
            infourl.status = code
        return infourl

    def http_error_301(self, req, fp, code, msg, headers):
        if headers.dict.has_key('location'):
            infourl = urllib2.HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, headers)
        else:
            infourl = urllib.addinfourl(fp, headers, req.get_full_url())
        if not hasattr(infourl, 'status'):
            infourl.status = code
        return infourl

    http_error_300 = http_error_302
    http_error_303 = http_error_302
    http_error_307 = http_error_302

    def http_error_401(self, req, fp, code, msg, headers):
        # Check if
        # - server requires digest auth, AND
        # - we tried (unsuccessfully) with basic auth, AND
        # - we're using Python 2.3.3 or later (digest auth is irreparably broken in earlier versions)
        # If all conditions hold, parse authentication information
        # out of the Authorization header we sent the first time
        # (for the username and password) and the WWW-Authenticate
        # header the server sent back (for the realm) and retry
        # the request with the appropriate digest auth headers instead.
        # This evil genius hack has been brought to you by Aaron Swartz.
        host = urlparse.urlparse(req.get_full_url())[1]
        try:
            assert sys.version.split()[0] >= '2.3.3'
            assert base64 != None
            user, passw = base64.decodestring(req.headers['Authorization'].split(' ')[1]).split(':')
            realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0]
            self.add_password(realm, host, user, passw)
            retry = self.http_error_auth_reqed('www-authenticate', host, req, headers)
            self.reset_retry_count()
            return retry
        except (SystemExit, KeyboardInterrupt):
            raise
        except:
            return self.http_error_default(req, fp, code, msg, headers)
def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers):
    """URL, filename, or string --> stream

    This function lets you define parsers that take any input source
    (URL, pathname to local or network file, or actual data as a string)
    and deal with it in a uniform manner.  Returned object is guaranteed
    to have all the basic stdio read methods (read, readline, readlines).
    Just .close() the object when you're done with it.

    If the etag argument is supplied, it will be used as the value of an
    If-None-Match request header.

    If the modified argument is supplied, it must be a tuple of 9 integers
    as returned by gmtime() in the standard Python time module. This MUST
    be in GMT (Greenwich Mean Time). The formatted date/time will be used
    as the value of an If-Modified-Since request header.

    If the agent argument is supplied, it will be used as the value of a
    User-Agent request header.

    If the referrer argument is supplied, it will be used as the value of a
    Referer[sic] request header.

    If handlers is supplied, it is a list of handlers used to build a
    urllib2 opener.
    """

    if hasattr(url_file_stream_or_string, 'read'):
        return url_file_stream_or_string

    if url_file_stream_or_string == '-':
        return sys.stdin

    if urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp'):
        if not agent:
            agent = USER_AGENT
        # test for inline user:password for basic auth
        auth = None
        if base64:
            urltype, rest = urllib.splittype(url_file_stream_or_string)
            realhost, rest = urllib.splithost(rest)
            if realhost:
                user_passwd, realhost = urllib.splituser(realhost)
                if user_passwd:
                    url_file_stream_or_string = '%s://%s%s' % (urltype, realhost, rest)
                    auth = base64.encodestring(user_passwd).strip()
        # try to open with urllib2 (to use optional headers)
        request = urllib2.Request(url_file_stream_or_string)
        request.add_header('User-Agent', agent)
        if etag:
            request.add_header('If-None-Match', etag)
        if modified:
            # format into an RFC 1123-compliant timestamp. We can't use
            # time.strftime() since the %a and %b directives can be affected
            # by the current locale, but RFC 2616 states that dates must be
            # in English.
            short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
            months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
            request.add_header('If-Modified-Since', '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5]))
        if referrer:
            request.add_header('Referer', referrer)
        if gzip and zlib:
            request.add_header('Accept-encoding', 'gzip, deflate')
        elif gzip:
            request.add_header('Accept-encoding', 'gzip')
        elif zlib:
            request.add_header('Accept-encoding', 'deflate')
        else:
            request.add_header('Accept-encoding', '')
        if auth:
            request.add_header('Authorization', 'Basic %s' % auth)
        if ACCEPT_HEADER:
            request.add_header('Accept', ACCEPT_HEADER)
        request.add_header('A-IM', 'feed') # RFC 3229 support
        opener = apply(urllib2.build_opener, tuple([_FeedURLHandler()] + handlers))
        opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent
        try:
            return opener.open(request)
        finally:
            opener.close() # JohnD

    # try to open with native open function (if url_file_stream_or_string is a filename)
    try:
        return open(url_file_stream_or_string)
    except (SystemExit, KeyboardInterrupt):
        raise
    except:
        pass

    # treat url_file_stream_or_string as string
    return _StringIO(str(url_file_stream_or_string))
def registerDateHandler(func):
    '''Register a date handler function (takes string, returns 9-tuple date in GMT)

    Handlers are tried in reverse registration order (most recently
    registered first), which is why new handlers go to the front.
    '''
    _date_handlers.insert(0, func)
2050
# ISO-8601 date parsing routines written by Fazal Majid.
2051
# The ISO 8601 standard is very convoluted and irregular - a full ISO 8601
2052
# parser is beyond the scope of feedparser and would be a worthwhile addition
2053
# to the Python library.
2054
# A single regular expression cannot parse ISO 8601 date formats into groups
2055
# as the standard is highly irregular (for instance is 030104 2003-01-04 or
2056
# 0301-04-01), so we use templates instead.
2057
# Please note the order in templates is significant because we need a
2059
_iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-MM', 'YYYY-?OOO',
2060
'YY-?MM-?DD', 'YY-?OOO', 'YYYY',
2061
'-YY-?MM', '-OOO', '-YY',
2067
'YYYY', r'(?P<year>\d{4})').replace(
2068
'YY', r'(?P<year>\d\d)').replace(
2069
'MM', r'(?P<month>[01]\d)').replace(
2070
'DD', r'(?P<day>[0123]\d)').replace(
2071
'OOO', r'(?P<ordinal>[0123]\d\d)').replace(
2072
'CC', r'(?P<century>\d\d$)')
2073
+ r'(T?(?P<hour>\d{2}):(?P<minute>\d{2})'
2074
+ r'(:(?P<second>\d{2}))?'
2075
+ r'(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?'
2076
for tmpl in _iso8601_tmpl]
2078
_iso8601_matches = [re.compile(regex).match for regex in _iso8601_re]
2080
def _parse_date_iso8601(dateString):
    '''Parse a variety of ISO-8601-compatible formats like 20040105'''
    m = None
    for _iso8601_match in _iso8601_matches:
        m = _iso8601_match(dateString)
        if m: break
    if not m: return
    if m.span() == (0, 0): return
    params = m.groupdict()
    ordinal = params.get('ordinal', 0)
    if ordinal:
        ordinal = int(ordinal)
    else:
        ordinal = 0
    year = params.get('year', '--')
    if not year or year == '--':
        year = time.gmtime()[0]
    elif len(year) == 2:
        # ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993
        year = 100 * int(time.gmtime()[0] / 100) + int(year)
    else:
        year = int(year)
    month = params.get('month', '-')
    if not month or month == '-':
        # ordinals are month-less dates
        if ordinal:
            month = 1
        else:
            month = time.gmtime()[1]
    month = int(month)
    day = params.get('day', 0)
    if not day:
        # ordinals are NOT normalized by mktime, we simulate them
        # by setting month=1, day=ordinal
        if ordinal:
            day = ordinal
        elif params.get('century', 0) or \
                 params.get('year', 0) or params.get('month', 0):
            day = 1
        else:
            day = time.gmtime()[2]
    else:
        day = int(day)
    # special case of the century - is the first year of the 21st century
    # 2000 or 2001 ? The debate goes on...
    if 'century' in params.keys():
        year = (int(params['century']) - 1) * 100 + 1
    # in ISO 8601 most fields are optional
    for field in ['hour', 'minute', 'second', 'tzhour', 'tzmin']:
        if not params.get(field, None):
            params[field] = 0
    hour = int(params.get('hour', 0))
    minute = int(params.get('minute', 0))
    second = int(params.get('second', 0))
    # weekday is normalized by mktime(), we can ignore it
    weekday = 0
    # daylight savings is complex, but not needed for feedparser's purposes
    # as time zones, if specified, include mention of whether it is active
    # (e.g. PST vs. PDT, CET). Using -1 is implementation-dependent and
    # and most implementations have DST bugs
    daylight_savings_flag = 0
    tm = [year, month, day, hour, minute, second, weekday,
          ordinal, daylight_savings_flag]
    # ISO 8601 time zone adjustments
    tz = params.get('tz')
    if tz and tz != 'Z':
        if tz[0] == '-':
            tm[3] += int(params.get('tzhour', 0))
            tm[4] += int(params.get('tzmin', 0))
        elif tz[0] == '+':
            tm[3] -= int(params.get('tzhour', 0))
            tm[4] -= int(params.get('tzmin', 0))
        else:
            return None
    # Python's time.mktime() is a wrapper around the ANSI C mktime(3c)
    # which is guaranteed to normalize d/m/y/h/m/s.
    # Many implementations have bugs, but we'll pretend they don't.
    return time.localtime(time.mktime(tm))
registerDateHandler(_parse_date_iso8601)
2160
# 8-bit date handling routines written by ytrewq1.
2161
_korean_year = u'\ub144' # b3e2 in euc-kr
2162
_korean_month = u'\uc6d4' # bff9 in euc-kr
2163
_korean_day = u'\uc77c' # c0cf in euc-kr
2164
_korean_am = u'\uc624\uc804' # bfc0 c0fc in euc-kr
2165
_korean_pm = u'\uc624\ud6c4' # bfc0 c8c4 in euc-kr
2167
_korean_onblog_date_re = \
2168
re.compile('(\d{4})%s\s+(\d{2})%s\s+(\d{2})%s\s+(\d{2}):(\d{2}):(\d{2})' % \
2169
(_korean_year, _korean_month, _korean_day))
2170
_korean_nate_date_re = \
2171
re.compile(u'(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})' % \
2172
(_korean_am, _korean_pm))
2173
def _parse_date_onblog(dateString):
    '''Parse a string according to the OnBlog 8-bit date format'''
    m = _korean_onblog_date_re.match(dateString)
    if not m: return
    # OnBlog timestamps are Korean local time; rewrite as W3DTF with +09:00
    # and delegate to the W3DTF handler.
    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
                {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
                 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\
                 'zonediff': '+09:00'}
    if _debug: sys.stderr.write('OnBlog date parsed as: %s\n' % w3dtfdate)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_onblog)
2185
def _parse_date_nate(dateString):
    '''Parse a string according to the Nate 8-bit date format'''
    m = _korean_nate_date_re.match(dateString)
    if not m: return
    hour = int(m.group(5))
    ampm = m.group(4)
    # convert 12-hour clock with Korean am/pm marker to 24-hour clock
    if (ampm == _korean_pm):
        hour += 12
    hour = str(hour)
    if len(hour) == 1:
        hour = '0' + hour
    # Nate timestamps are Korean local time; rewrite as W3DTF with +09:00
    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
                {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
                 'hour': hour, 'minute': m.group(6), 'second': m.group(7),\
                 'zonediff': '+09:00'}
    if _debug: sys.stderr.write('Nate date parsed as: %s\n' % w3dtfdate)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_nate)
2205
# 'YYYY-MM-DD HH:MM:SS[.fff]' as produced by MS SQL Server
_mssql_date_re = \
    re.compile('(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})(\.\d+)?')
def _parse_date_mssql(dateString):
    '''Parse a string according to the MS SQL date format'''
    m = _mssql_date_re.match(dateString)
    if not m: return
    # no zone info in the source string; assume +09:00 like the other
    # 8-bit handlers in this family, then delegate to the W3DTF handler
    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
                {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
                 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\
                 'zonediff': '+09:00'}
    if _debug: sys.stderr.write('MS SQL date parsed as: %s\n' % w3dtfdate)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_mssql)
2218
# Unicode strings for Greek date strings
2221
u'\u0399\u03b1\u03bd': u'Jan', # c9e1ed in iso-8859-7
2222
u'\u03a6\u03b5\u03b2': u'Feb', # d6e5e2 in iso-8859-7
2223
u'\u039c\u03ac\u03ce': u'Mar', # ccdcfe in iso-8859-7
2224
u'\u039c\u03b1\u03ce': u'Mar', # cce1fe in iso-8859-7
2225
u'\u0391\u03c0\u03c1': u'Apr', # c1f0f1 in iso-8859-7
2226
u'\u039c\u03ac\u03b9': u'May', # ccdce9 in iso-8859-7
2227
u'\u039c\u03b1\u03ca': u'May', # cce1fa in iso-8859-7
2228
u'\u039c\u03b1\u03b9': u'May', # cce1e9 in iso-8859-7
2229
u'\u0399\u03bf\u03cd\u03bd': u'Jun', # c9effded in iso-8859-7
2230
u'\u0399\u03bf\u03bd': u'Jun', # c9efed in iso-8859-7
2231
u'\u0399\u03bf\u03cd\u03bb': u'Jul', # c9effdeb in iso-8859-7
2232
u'\u0399\u03bf\u03bb': u'Jul', # c9f9eb in iso-8859-7
2233
u'\u0391\u03cd\u03b3': u'Aug', # c1fde3 in iso-8859-7
2234
u'\u0391\u03c5\u03b3': u'Aug', # c1f5e3 in iso-8859-7
2235
u'\u03a3\u03b5\u03c0': u'Sep', # d3e5f0 in iso-8859-7
2236
u'\u039f\u03ba\u03c4': u'Oct', # cfeaf4 in iso-8859-7
2237
u'\u039d\u03bf\u03ad': u'Nov', # cdefdd in iso-8859-7
2238
u'\u039d\u03bf\u03b5': u'Nov', # cdefe5 in iso-8859-7
2239
u'\u0394\u03b5\u03ba': u'Dec', # c4e5ea in iso-8859-7
2244
u'\u039a\u03c5\u03c1': u'Sun', # caf5f1 in iso-8859-7
2245
u'\u0394\u03b5\u03c5': u'Mon', # c4e5f5 in iso-8859-7
2246
u'\u03a4\u03c1\u03b9': u'Tue', # d4f1e9 in iso-8859-7
2247
u'\u03a4\u03b5\u03c4': u'Wed', # d4e5f4 in iso-8859-7
2248
u'\u03a0\u03b5\u03bc': u'Thu', # d0e5ec in iso-8859-7
2249
u'\u03a0\u03b1\u03c1': u'Fri', # d0e1f1 in iso-8859-7
2250
u'\u03a3\u03b1\u03b2': u'Sat', # d3e1e2 in iso-8859-7
2253
_greek_date_format_re = \
2254
re.compile(u'([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)')
2256
def _parse_date_greek(dateString):
    '''Parse a string according to a Greek 8-bit date format.'''
    m = _greek_date_format_re.match(dateString)
    if not m: return
    try:
        # translate Greek weekday/month names to English
        wday = _greek_wdays[m.group(1)]
        month = _greek_months[m.group(3)]
    except (SystemExit, KeyboardInterrupt):
        raise
    except:
        # unknown weekday/month spelling; let another handler try
        return
    # rebuild as an RFC 822 date and delegate to the RFC 822 handler
    rfc822date = '%(wday)s, %(day)s %(month)s %(year)s %(hour)s:%(minute)s:%(second)s %(zonediff)s' % \
                 {'wday': wday, 'day': m.group(2), 'month': month, 'year': m.group(4),\
                  'hour': m.group(5), 'minute': m.group(6), 'second': m.group(7),\
                  'zonediff': m.group(8)}
    if _debug: sys.stderr.write('Greek date parsed as: %s\n' % rfc822date)
    return _parse_date_rfc822(rfc822date)
registerDateHandler(_parse_date_greek)
2275
# Unicode strings for Hungarian date strings
2276
_hungarian_months = \
2278
u'janu\u00e1r': u'01', # e1 in iso-8859-2
2279
u'febru\u00e1ri': u'02', # e1 in iso-8859-2
2280
u'm\u00e1rcius': u'03', # e1 in iso-8859-2
2281
u'\u00e1prilis': u'04', # e1 in iso-8859-2
2282
u'm\u00e1ujus': u'05', # e1 in iso-8859-2
2283
u'j\u00fanius': u'06', # fa in iso-8859-2
2284
u'j\u00falius': u'07', # fa in iso-8859-2
2285
u'augusztus': u'08',
2286
u'szeptember': u'09',
2287
u'okt\u00f3ber': u'10', # f3 in iso-8859-2
2292
_hungarian_date_format_re = \
2293
re.compile(u'(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})((\+|-)(\d{,2}:\d{2}))')
2295
def _parse_date_hungarian(dateString):
    '''Parse a string according to a Hungarian 8-bit date format.'''
    m = _hungarian_date_format_re.match(dateString)
    if not m: return
    try:
        # translate the month name and zero-pad single-digit day/hour
        month = _hungarian_months[m.group(2)]
        day = m.group(3)
        if len(day) == 1:
            day = '0' + day
        hour = m.group(4)
        if len(hour) == 1:
            hour = '0' + hour
    except (SystemExit, KeyboardInterrupt):
        raise
    except:
        # unknown month name; let another handler try
        return
    # rebuild as W3DTF (no seconds in the source format) and delegate
    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s%(zonediff)s' % \
                {'year': m.group(1), 'month': month, 'day': day,\
                 'hour': hour, 'minute': m.group(5),\
                 'zonediff': m.group(6)}
    if _debug: sys.stderr.write('Hungarian date parsed as: %s\n' % w3dtfdate)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_hungarian)
2319
# W3DTF-style date parsing adapted from PyXML xml.utils.iso8601, written by
2320
# Drake and licensed under the Python license. Removed all range checking
2321
# for month, day, hour, minute, and second, since mktime will normalize
2323
def _parse_date_w3dtf(dateString):
2324
def __extract_date(m):
2325
year = int(m.group('year'))
2327
year = 100 * int(time.gmtime()[0] / 100) + int(year)
2330
julian = m.group('julian')
2332
julian = int(julian)
2333
month = julian / 30 + 1
2334
day = julian % 30 + 1
2336
while jday != julian:
2337
t = time.mktime((year, month, day, 0, 0, 0, 0, 0, 0))
2338
jday = time.gmtime(t)[-2]
2339
diff = abs(jday - julian)
2351
return year, month, day
2352
month = m.group('month')
2358
day = m.group('day')
2363
return year, month, day
2365
def __extract_time(m):
2368
hours = m.group('hours')
2372
minutes = int(m.group('minutes'))
2373
seconds = m.group('seconds')
2375
seconds = int(seconds)
2378
return hours, minutes, seconds
2380
def __extract_tzd(m):
2381
'''Return the Time Zone Designator as an offset in seconds from UTC.'''
2384
tzd = m.group('tzd')
2389
hours = int(m.group('tzdhours'))
2390
minutes = m.group('tzdminutes')
2392
minutes = int(minutes)
2395
offset = (hours*60 + minutes) * 60
2400
__date_re = ('(?P<year>\d\d\d\d)'
2402
'(?:(?P<julian>\d\d\d)'
2403
'|(?P<month>\d\d)(?:(?P=dsep)(?P<day>\d\d))?))?')
2404
__tzd_re = '(?P<tzd>[-+](?P<tzdhours>\d\d)(?::?(?P<tzdminutes>\d\d))|Z)'
2405
__tzd_rx = re.compile(__tzd_re)
2406
__time_re = ('(?P<hours>\d\d)(?P<tsep>:|)(?P<minutes>\d\d)'
2407
'(?:(?P=tsep)(?P<seconds>\d\d(?:[.,]\d+)?))?'
2409
__datetime_re = '%s(?:T%s)?' % (__date_re, __time_re)
2410
__datetime_rx = re.compile(__datetime_re)
2411
m = __datetime_rx.match(dateString)
2412
if (m is None) or (m.group() != dateString): return
2413
gmt = __extract_date(m) + __extract_time(m) + (0, 0, 0)
2414
if gmt[0] == 0: return
2415
return time.gmtime(time.mktime(gmt) + __extract_tzd(m) - time.timezone)
2416
registerDateHandler(_parse_date_w3dtf)
2418
def _parse_date_rfc822(dateString):
    '''Parse an RFC822, RFC1123, RFC2822, or asctime-style date'''
    data = dateString.split()
    # drop a leading weekday name ('Sun,', 'Sun.', or bare 'sun')
    if data[0][-1] in (',', '.') or data[0].lower() in rfc822._daynames:
        del data[0]
    if len(data) == 4:
        s = data[3]
        i = s.find('+')
        if i > 0:
            # time and zone offset are fused ('10:14:55+0000'); split them
            data[3:] = [s[:i], s[i+1:]]
        else:
            data.append('')
        dateString = " ".join(data)
    if len(data) < 5:
        # date-only string; assume midnight GMT
        dateString += ' 00:00:00 GMT'
    tm = rfc822.parsedate_tz(dateString)
    if tm:
        return time.gmtime(rfc822.mktime_tz(tm))
# rfc822.py defines several time zones, but we define some extra ones.
# 'ET' is equivalent to 'EST', etc.
_additional_timezones = {'AT': -400, 'ET': -500, 'CT': -600, 'MT': -700, 'PT': -800}
rfc822._timezones.update(_additional_timezones)
registerDateHandler(_parse_date_rfc822)
2442
def _parse_date(dateString):
2443
'''Parses a variety of date formats into a 9-tuple in GMT'''
2444
for handler in _date_handlers:
2446
date9tuple = handler(dateString)
2447
if not date9tuple: continue
2448
if len(date9tuple) != 9:
2449
if _debug: sys.stderr.write('date handler function must return 9-tuple\n')
2451
map(int, date9tuple)
2453
except Exception, e:
2454
if _debug: sys.stderr.write('%s raised %s\n' % (handler.__name__, repr(e)))
2458
def _getCharacterEncoding(http_headers, xml_data):
    '''Get the character encoding of the XML document

    http_headers is a dictionary
    xml_data is a raw string (not Unicode)

    This is so much trickier than it sounds, it's not even funny.
    According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type
    is application/xml, application/*+xml,
    application/xml-external-parsed-entity, or application/xml-dtd,
    the encoding given in the charset parameter of the HTTP Content-Type
    takes precedence over the encoding given in the XML prefix within the
    document, and defaults to 'utf-8' if neither are specified.  But, if
    the HTTP Content-Type is text/xml, text/*+xml, or
    text/xml-external-parsed-entity, the encoding given in the XML prefix
    within the document is ALWAYS IGNORED and only the encoding given in
    the charset parameter of the HTTP Content-Type header should be
    respected, and it defaults to 'us-ascii' if not specified.

    Furthermore, discussion on the atom-syntax mailing list with the
    author of RFC 3023 leads me to the conclusion that any document
    served with a Content-Type of text/* and no charset parameter
    must be treated as us-ascii.  (We now do this.)  And also that it
    must always be flagged as non-well-formed.  (We now do this too.)

    If Content-Type is unspecified (input was local file or non-HTTP source)
    or unrecognized (server just got it totally wrong), then go by the
    encoding given in the XML prefix of the document and default to
    'iso-8859-1' as per the HTTP specification (RFC 2616).

    Then, assuming we didn't find a character encoding in the HTTP headers
    (and the HTTP Content-type allowed us to look in the body), we need
    to sniff the first few bytes of the XML data and try to determine
    whether the encoding is ASCII-compatible.  Section F of the XML
    specification shows the way here:
    http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info

    If the sniffed encoding is not ASCII-compatible, we need to make it
    ASCII compatible so that we can sniff further into the XML declaration
    to find the encoding attribute, which will tell us the true encoding.

    Of course, none of this guarantees that we will be able to parse the
    feed in the declared character encoding (assuming it was declared
    correctly, which many are not).  CJKCodecs and iconv_codec help a lot;
    you should definitely install them if you can.
    http://cjkpython.i18n.org/
    '''

    def _parseHTTPContentType(content_type):
        '''takes HTTP Content-Type header and returns (content type, charset)

        If no charset is specified, returns (content type, '')
        If no content type is specified, returns ('', '')
        Both return parameters are guaranteed to be lowercase strings
        '''
        content_type = content_type or ''
        content_type, params = cgi.parse_header(content_type)
        return content_type, params.get('charset', '').replace("'", '')

    sniffed_xml_encoding = ''
    xml_encoding = ''
    true_encoding = ''
    http_content_type, http_encoding = _parseHTTPContentType(http_headers.get('content-type'))
    # Must sniff for non-ASCII-compatible character encodings before
    # searching for XML declaration.  This heuristic is defined in
    # section F of the XML specification:
    # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
    try:
        if xml_data[:4] == '\x4c\x6f\xa7\x94':
            # EBCDIC
            xml_data = _ebcdic_to_ascii(xml_data)
        elif xml_data[:4] == '\x00\x3c\x00\x3f':
            # UTF-16BE
            sniffed_xml_encoding = 'utf-16be'
            xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
        elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') and (xml_data[2:4] != '\x00\x00'):
            # UTF-16BE with BOM
            sniffed_xml_encoding = 'utf-16be'
            xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
        elif xml_data[:4] == '\x3c\x00\x3f\x00':
            # UTF-16LE
            sniffed_xml_encoding = 'utf-16le'
            xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
        elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and (xml_data[2:4] != '\x00\x00'):
            # UTF-16LE with BOM
            sniffed_xml_encoding = 'utf-16le'
            xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
        elif xml_data[:4] == '\x00\x00\x00\x3c':
            # UTF-32BE
            sniffed_xml_encoding = 'utf-32be'
            xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
        elif xml_data[:4] == '\x3c\x00\x00\x00':
            # UTF-32LE
            sniffed_xml_encoding = 'utf-32le'
            xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
        elif xml_data[:4] == '\x00\x00\xfe\xff':
            # UTF-32BE with BOM
            sniffed_xml_encoding = 'utf-32be'
            xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
        elif xml_data[:4] == '\xff\xfe\x00\x00':
            # UTF-32LE with BOM
            sniffed_xml_encoding = 'utf-32le'
            xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
        elif xml_data[:3] == '\xef\xbb\xbf':
            # UTF-8 with BOM
            sniffed_xml_encoding = 'utf-8'
            xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
        else:
            # ASCII-compatible
            pass
        xml_encoding_match = re.compile('^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
    except (SystemExit, KeyboardInterrupt):
        raise
    except:
        xml_encoding_match = None
    if xml_encoding_match:
        xml_encoding = xml_encoding_match.groups()[0].lower()
        # a multi-byte family name in the declaration is less precise than
        # what we sniffed from the BOM; prefer the sniffed encoding
        if sniffed_xml_encoding and (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16')):
            xml_encoding = sniffed_xml_encoding
    acceptable_content_type = 0
    application_content_types = ('application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity')
    text_content_types = ('text/xml', 'text/xml-external-parsed-entity')
    if (http_content_type in application_content_types) or \
       (http_content_type.startswith('application/') and http_content_type.endswith('+xml')):
        acceptable_content_type = 1
        true_encoding = http_encoding or xml_encoding or 'utf-8'
    elif (http_content_type in text_content_types) or \
         (http_content_type.startswith('text/')) and http_content_type.endswith('+xml'):
        acceptable_content_type = 1
        true_encoding = http_encoding or 'us-ascii'
    elif http_content_type.startswith('text/'):
        true_encoding = http_encoding or 'us-ascii'
    elif http_headers and (not http_headers.has_key('content-type')):
        true_encoding = xml_encoding or 'iso-8859-1'
    else:
        true_encoding = xml_encoding or 'utf-8'
    return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type
2596
def _toUTF8(data, encoding):
    '''Changes an XML data stream on the fly to specify a new encoding

    data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already
    encoding is a string recognized by encodings.aliases
    '''
    if _debug: sys.stderr.write('entering _toUTF8, trying encoding %s\n' % encoding)
    # strip Byte Order Mark (if present); a BOM overrides the caller's
    # declared encoding because it is unambiguous
    if (len(data) >= 4) and (data[:2] == '\xfe\xff') and (data[2:4] != '\x00\x00'):
        if _debug:
            sys.stderr.write('stripping BOM\n')
            if encoding != 'utf-16be':
                sys.stderr.write('trying utf-16be instead\n')
        encoding = 'utf-16be'
        data = data[2:]
    elif (len(data) >= 4) and (data[:2] == '\xff\xfe') and (data[2:4] != '\x00\x00'):
        if _debug:
            sys.stderr.write('stripping BOM\n')
            if encoding != 'utf-16le':
                sys.stderr.write('trying utf-16le instead\n')
        encoding = 'utf-16le'
        data = data[2:]
    elif data[:3] == '\xef\xbb\xbf':
        if _debug:
            sys.stderr.write('stripping BOM\n')
            if encoding != 'utf-8':
                sys.stderr.write('trying utf-8 instead\n')
        encoding = 'utf-8'
        data = data[3:]
    elif data[:4] == '\x00\x00\xfe\xff':
        if _debug:
            sys.stderr.write('stripping BOM\n')
            if encoding != 'utf-32be':
                sys.stderr.write('trying utf-32be instead\n')
        encoding = 'utf-32be'
        data = data[4:]
    elif data[:4] == '\xff\xfe\x00\x00':
        if _debug:
            sys.stderr.write('stripping BOM\n')
            if encoding != 'utf-32le':
                sys.stderr.write('trying utf-32le instead\n')
        encoding = 'utf-32le'
        data = data[4:]
    # decode (raises if the data is not actually in this encoding) ...
    newdata = unicode(data, encoding)
    if _debug: sys.stderr.write('successfully converted %s data to unicode\n' % encoding)
    # ... then re-encode as UTF-8 with a matching XML declaration
    declmatch = re.compile('^<\?xml[^>]*?>')
    newdecl = '''<?xml version='1.0' encoding='utf-8'?>'''
    if declmatch.search(newdata):
        newdata = declmatch.sub(newdecl, newdata)
    else:
        newdata = newdecl + u'\n' + newdata
    return newdata.encode('utf-8')
2649
def _stripDoctype(data):
2650
'''Strips DOCTYPE from XML document, returns (rss_version, stripped_data)
2652
rss_version may be 'rss091n' or None
2653
stripped_data is the same XML document, minus the DOCTYPE
2655
entity_pattern = re.compile(r'<!ENTITY([^>]*?)>', re.MULTILINE)
2656
data = entity_pattern.sub('', data)
2657
doctype_pattern = re.compile(r'<!DOCTYPE([^>]*?)>', re.MULTILINE)
2658
doctype_results = doctype_pattern.findall(data)
2659
doctype = doctype_results and doctype_results[0] or ''
2660
if doctype.lower().count('netscape'):
2664
data = doctype_pattern.sub('', data)
2665
return version, data
2667
def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=[]):
2668
'''Parse a feed from a URL, file, stream, or string'''
2669
result = FeedParserDict()
2670
result['feed'] = FeedParserDict()
2671
result['entries'] = []
2674
if type(handlers) == types.InstanceType:
2675
handlers = [handlers]
2677
f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers)
2679
except Exception, e:
2681
result['bozo_exception'] = e
2685
# if feed is gzip-compressed, decompress it
2686
if f and data and hasattr(f, 'headers'):
2687
if gzip and f.headers.get('content-encoding', '') == 'gzip':
2689
data = gzip.GzipFile(fileobj=_StringIO(data)).read()
2690
except Exception, e:
2691
# Some feeds claim to be gzipped but they're not, so
2692
# we get garbage. Ideally, we should re-request the
2693
# feed without the 'Accept-encoding: gzip' header,
2696
result['bozo_exception'] = e
2698
elif zlib and f.headers.get('content-encoding', '') == 'deflate':
2700
data = zlib.decompress(data, -zlib.MAX_WBITS)
2701
except Exception, e:
2703
result['bozo_exception'] = e
2707
if hasattr(f, 'info'):
2709
result['etag'] = info.getheader('ETag')
2710
last_modified = info.getheader('Last-Modified')
2712
result['modified'] = _parse_date(last_modified)
2713
if hasattr(f, 'url'):
2714
result['href'] = f.url
2715
result['status'] = 200
2716
if hasattr(f, 'status'):
2717
result['status'] = f.status
2718
if hasattr(f, 'headers'):
2719
result['headers'] = f.headers.dict
2720
if hasattr(f, 'close'):
2723
# there are four encodings to keep track of:
2724
# - http_encoding is the encoding declared in the Content-Type HTTP header
2725
# - xml_encoding is the encoding declared in the <?xml declaration
2726
# - sniffed_encoding is the encoding sniffed from the first 4 bytes of the XML data
2727
# - result['encoding'] is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications
2728
http_headers = result.get('headers', {})
2729
result['encoding'], http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type = \
2730
_getCharacterEncoding(http_headers, data)
2731
if http_headers and (not acceptable_content_type):
2732
if http_headers.has_key('content-type'):
2733
bozo_message = '%s is not an XML media type' % http_headers['content-type']
2735
bozo_message = 'no Content-type specified'
2737
result['bozo_exception'] = NonXMLContentType(bozo_message)
2739
result['version'], data = _stripDoctype(data)
2741
baseuri = http_headers.get('content-location', result.get('href'))
2742
baselang = http_headers.get('content-language', None)
2744
# if server sent 304, we're done
2745
if result.get('status', 0) == 304:
2746
result['version'] = ''
2747
result['debug_message'] = 'The feed has not changed since you last checked, ' + \
2748
'so the server sent no data. This is a feature, not a bug!'
2751
# if there was a problem downloading, we're done
2755
# determine character encoding
2756
use_strict_parser = 0
2758
tried_encodings = []
2759
# try: HTTP encoding, declared XML encoding, encoding sniffed from BOM
2760
for proposed_encoding in (result['encoding'], xml_encoding, sniffed_xml_encoding):
2761
if not proposed_encoding: continue
2762
if proposed_encoding in tried_encodings: continue
2763
tried_encodings.append(proposed_encoding)
2765
data = _toUTF8(data, proposed_encoding)
2766
known_encoding = use_strict_parser = 1
2768
except (SystemExit, KeyboardInterrupt):
2772
# if no luck and we have auto-detection library, try that
2773
if (not known_encoding) and chardet:
2775
proposed_encoding = chardet.detect(data)['encoding']
2776
if proposed_encoding and (proposed_encoding not in tried_encodings):
2777
tried_encodings.append(proposed_encoding)
2778
data = _toUTF8(data, proposed_encoding)
2779
known_encoding = use_strict_parser = 1
2780
except (SystemExit, KeyboardInterrupt):
2784
# if still no luck and we haven't tried utf-8 yet, try that
2785
if (not known_encoding) and ('utf-8' not in tried_encodings):
2787
proposed_encoding = 'utf-8'
2788
tried_encodings.append(proposed_encoding)
2789
data = _toUTF8(data, proposed_encoding)
2790
known_encoding = use_strict_parser = 1
2791
except (SystemExit, KeyboardInterrupt):
2795
# if still no luck and we haven't tried windows-1252 yet, try that
2796
if (not known_encoding) and ('windows-1252' not in tried_encodings):
2798
proposed_encoding = 'windows-1252'
2799
tried_encodings.append(proposed_encoding)
2800
data = _toUTF8(data, proposed_encoding)
2801
known_encoding = use_strict_parser = 1
2802
except (SystemExit, KeyboardInterrupt):
2806
# if still no luck, give up
2807
if not known_encoding:
2809
result['bozo_exception'] = CharacterEncodingUnknown( \
2810
'document encoding unknown, I tried ' + \
2811
'%s, %s, utf-8, and windows-1252 but nothing worked' % \
2812
(result['encoding'], xml_encoding))
2813
result['encoding'] = ''
2814
elif proposed_encoding != result['encoding']:
2816
result['bozo_exception'] = CharacterEncodingOverride( \
2817
'documented declared as %s, but parsed as %s' % \
2818
(result['encoding'], proposed_encoding))
2819
result['encoding'] = proposed_encoding
2821
if not _XML_AVAILABLE:
2822
use_strict_parser = 0
2823
if use_strict_parser:
2824
# initialize the SAX parser
2825
feedparser = _StrictFeedParser(baseuri, baselang, 'utf-8')
2826
saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS)
2827
saxparser.setFeature(xml.sax.handler.feature_namespaces, 1)
2828
saxparser.setContentHandler(feedparser)
2829
saxparser.setErrorHandler(feedparser)
2830
source = xml.sax.xmlreader.InputSource()
2831
source.setByteStream(_StringIO(data))
2832
if hasattr(saxparser, '_ns_stack'):
2833
# work around bug in built-in SAX parser (doesn't recognize xml: namespace)
2834
# PyXML doesn't have this problem, and it doesn't have _ns_stack either
2835
saxparser._ns_stack.append({'http://www.w3.org/XML/1998/namespace':'xml'})
2837
saxparser.parse(source)
2838
except Exception, e:
2841
traceback.print_stack()
2842
traceback.print_exc()
2843
sys.stderr.write('xml parsing failed\n')
2845
result['bozo_exception'] = feedparser.exc or e
2846
use_strict_parser = 0
2847
if not use_strict_parser:
2848
feedparser = _LooseFeedParser(baseuri, baselang, known_encoding and 'utf-8' or '')
2849
feedparser.feed(data)
2850
result['feed'] = feedparser.feeddata
2851
result['entries'] = feedparser.entries
2852
result['version'] = result['version'] or feedparser.version
2853
result['namespaces'] = feedparser.namespacesInUse
2856
if __name__ == '__main__':
2857
if not sys.argv[1:]:
2862
zopeCompatibilityHack()
2863
from pprint import pprint
2872
#1.0 - 9/27/2002 - MAP - fixed namespace processing on prefixed RSS 2.0 elements,
2873
# added Simon Fell's test suite
2874
#1.1 - 9/29/2002 - MAP - fixed infinite loop on incomplete CDATA sections
2876
# JD - use inchannel to watch out for image and textinput elements which can
2877
# also contain title, link, and description elements
2878
# JD - check for isPermaLink='false' attribute on guid elements
2879
# JD - replaced openAnything with open_resource supporting ETag and
2880
# If-Modified-Since request headers
2881
# JD - parse now accepts etag, modified, agent, and referrer optional
2883
# JD - modified parse to return a dictionary instead of a tuple so that any
2884
# etag or modified information can be returned and cached by the caller
2885
#2.0.1 - 10/21/2002 - MAP - changed parse() so that if we don't get anything
# because of etag/modified, return the old etag/modified to the caller to
# indicate why nothing is being returned
#2.0.2 - 10/21/2002 - JB - added the inchannel to the if statement, otherwise it's
# useless. Fixes the problem JD was addressing by adding it.
#2.1 - 11/14/2002 - MAP - added gzip support
#2.2 - 1/27/2003 - MAP - added attribute support, admin:generatorAgent.
# start_admingeneratoragent is an example of how to handle elements with
# only attributes, no content.
#2.3 - 6/11/2003 - MAP - added USER_AGENT for default (if caller doesn't specify);
# also, make sure we send the User-Agent even if urllib2 isn't available.
# Match any variation of backend.userland.com/rss namespace.
#2.3.1 - 6/12/2003 - MAP - if item has both link and guid, return both as-is.
#2.4 - 7/9/2003 - MAP - added preliminary Pie/Atom/Echo support based on Sam Ruby's
# snapshot of July 1 <http://www.intertwingly.net/blog/1506.html>; changed
# project name
#2.5 - 7/25/2003 - MAP - changed to Python license (all contributors agree);
# removed unnecessary urllib code -- urllib2 should always be available anyway;
# return actual url, status, and full HTTP headers (as result['url'],
# result['status'], and result['headers']) if parsing a remote feed over HTTP --
# this should pass all the HTTP tests at <http://diveintomark.org/tests/client/http/>;
# added the latest namespace-of-the-week for RSS 2.0
#2.5.1 - 7/26/2003 - RMK - clear opener.addheaders so we only send our custom
# User-Agent (otherwise urllib2 sends two, which confuses some servers)
#2.5.2 - 7/28/2003 - MAP - entity-decode inline xml properly; added support for
# inline <xhtml:body> and <xhtml:div> as used in some RSS 2.0 feeds
#2.5.3 - 8/6/2003 - TvdV - patch to track whether we're inside an image or
# textInput, and also to return the character encoding (if specified)
#2.6 - 1/1/2004 - MAP - dc:author support (MarekK); fixed bug tracking
# nested divs within content (JohnD); fixed missing sys import (JohanS);
# fixed regular expression to capture XML character encoding (Andrei);
# added support for Atom 0.3-style links; fixed bug with textInput tracking;
# added support for cloud (MartijnP); added support for multiple
# category/dc:subject (MartijnP); normalize content model: 'description' gets
# description (which can come from description, summary, or full content if no
# description), 'content' gets dict of base/language/type/value (which can come
# from content:encoded, xhtml:body, content, or fullitem);
# fixed bug matching arbitrary Userland namespaces; added xml:base and xml:lang
# tracking; fixed bug tracking unknown tags; fixed bug tracking content when
# <content> element is not in default namespace (like Pocketsoap feed);
# resolve relative URLs in link, guid, docs, url, comments, wfw:comment,
# wfw:commentRSS; resolve relative URLs within embedded HTML markup in
# description, xhtml:body, content, content:encoded, title, subtitle,
# summary, info, tagline, and copyright; added support for pingback and
# trackback namespaces
#2.7 - 1/5/2004 - MAP - really added support for trackback and pingback
# namespaces, as opposed to 2.6 when I said I did but didn't really;
# sanitize HTML markup within some elements; added mxTidy support (if
# installed) to tidy HTML markup within some elements; fixed indentation
# bug in _parse_date (FazalM); use socket.setdefaulttimeout if available
# (FazalM); universal date parsing and normalization (FazalM): 'created', 'modified',
# 'issued' are parsed into 9-tuple date format and stored in 'created_parsed',
# 'modified_parsed', and 'issued_parsed'; 'date' is duplicated in 'modified'
# and vice-versa; 'date_parsed' is duplicated in 'modified_parsed' and vice-versa
#2.7.1 - 1/9/2004 - MAP - fixed bug handling &quot; and &apos;. fixed memory
# leak not closing url opener (JohnD); added dc:publisher support (MarekK);
# added admin:errorReportsTo support (MarekK); Python 2.1 dict support (MarekK)
#2.7.4 - 1/14/2004 - MAP - added workaround for improperly formed <br/> tags in
# encoded HTML (skadz); fixed unicode handling in normalize_attrs (ChrisL);
# fixed relative URI processing for guid (skadz); added ICBM support; added
# lat/long support (Andrew Grumet)
#2.7.5 - 1/15/2004 - MAP - added workaround for malformed DOCTYPE (seen on many
# blogspot.com sites); added _debug variable
#2.7.6 - 1/16/2004 - MAP - fixed bug with StringIO importing
#3.0b3 - 1/23/2004 - MAP - parse entire feed with real XML parser (if available);
# added several new supported namespaces; fixed bug tracking naked markup in
# description; added support for enclosure; added support for source; re-added
# support for cloud which got dropped somehow; added support for expirationDate
#3.0b4 - 1/26/2004 - MAP - fixed xml:lang inheritance; fixed multiple bugs tracking
# xml:base URI, one for documents that don't define one explicitly and one for
# documents that define an outer and an inner xml:base that goes out of scope
# before the end of the document
#3.0b5 - 1/26/2004 - MAP - fixed bug parsing multiple links at feed level
#3.0b6 - 1/27/2004 - MAP - added feed type and version detection, result['version']
# will be one of SUPPORTED_VERSIONS.keys() or empty string if unrecognized;
# added support for creativeCommons:license and cc:license; added support for
# full Atom content model in title, tagline, info, copyright, summary; fixed bug
# with gzip encoding (not always telling server we support it when we do)
#3.0b7 - 1/28/2004 - MAP - support Atom-style author element in author_detail
# (dictionary of 'name', 'url', 'email'); map author to author_detail if author
# contains name + email address
#3.0b8 - 1/28/2004 - MAP - added support for contributor
#3.0b9 - 1/29/2004 - MAP - fixed check for presence of dict function; added
# support for summary
#3.0b10 - 1/31/2004 - MAP - incorporated ISO-8601 date parsing routines from
# xml.util.iso8601
#3.0b11 - 2/2/2004 - MAP - added 'rights' to list of elements that can contain
# dangerous markup; fiddled with decodeEntities (not right); liberalized
# date parsing even further
#3.0b12 - 2/6/2004 - MAP - fiddled with decodeEntities (still not right);
# added support to Atom 0.2 subtitle; added support for Atom content model
# in copyright; better sanitizing of dangerous HTML elements with end tags
# (script, frameset)
#3.0b13 - 2/8/2004 - MAP - better handling of empty HTML tags (br, hr, img,
# etc.) in embedded markup, in either HTML or XHTML form (<br>, <br/>, <br />)
#3.0b14 - 2/8/2004 - MAP - fixed CDATA handling in non-wellformed feeds under
# Python 2.1
#3.0b15 - 2/11/2004 - MAP - fixed bug resolving relative links in wfw:commentRSS;
# fixed bug capturing author and contributor URL; fixed bug resolving relative
# links in author and contributor URL; fixed bug resolving relative links in
# generator URL; added support for recognizing RSS 1.0; passed Simon Fell's
# namespace tests, and included them permanently in the test suite with his
# permission; fixed namespace handling under Python 2.1
#3.0b16 - 2/12/2004 - MAP - fixed support for RSS 0.90 (broken in b15)
#3.0b17 - 2/13/2004 - MAP - determine character encoding as per RFC 3023
#3.0b18 - 2/17/2004 - MAP - always map description to summary_detail (Andrei);
# use libxml2 (if available)
#3.0b19 - 3/15/2004 - MAP - fixed bug exploding author information when author
# name was in parentheses; removed ultra-problematic mxTidy support; patch to
# workaround crash in PyXML/expat when encountering invalid entities
# (MarkMoraes); support for textinput/textInput
#3.0b20 - 4/7/2004 - MAP - added CDF support
#3.0b21 - 4/14/2004 - MAP - added Hot RSS support
#3.0b22 - 4/19/2004 - MAP - changed 'channel' to 'feed', 'item' to 'entries' in
# results dict; changed results dict to allow getting values with results.key
# as well as results[key]; work around embedded illformed HTML with half
# a DOCTYPE; work around malformed Content-Type header; if character encoding
# is wrong, try several common ones before falling back to regexes (if this
# works, bozo_exception is set to CharacterEncodingOverride); fixed character
# encoding issues in BaseHTMLProcessor by tracking encoding and converting
# from Unicode to raw strings before feeding data to sgmllib.SGMLParser;
# convert each value in results to Unicode (if possible), even if using
# regex-based parsing
#3.0b23 - 4/21/2004 - MAP - fixed UnicodeDecodeError for feeds that contain
# high-bit characters in attributes in embedded HTML in description (thanks
# Thijs van de Vossen); moved guid, date, and date_parsed to mapped keys in
# FeedParserDict; tweaked FeedParserDict.has_key to return True if asking
# about a mapped key
#3.0fc1 - 4/23/2004 - MAP - made results.entries[0].links[0] and
# results.entries[0].enclosures[0] into FeedParserDict; fixed typo that could
# cause the same encoding to be tried twice (even if it failed the first time);
# fixed DOCTYPE stripping when DOCTYPE contained entity declarations;
# better textinput and image tracking in illformed RSS 1.0 feeds
#3.0fc2 - 5/10/2004 - MAP - added and passed Sam's amp tests; added and passed
# my blink tag tests
#3.0fc3 - 6/18/2004 - MAP - fixed bug in _changeEncodingDeclaration that
# failed to parse utf-16 encoded feeds; made source into a FeedParserDict;
# duplicate admin:generatorAgent/@rdf:resource in generator_detail.url;
# added support for image; refactored parse() fallback logic to try other
# encodings if SAX parsing fails (previously it would only try other encodings
# if re-encoding failed); remove unichr madness in normalize_attrs now that
# we're properly tracking encoding in and out of BaseHTMLProcessor; set
# feed.language from root-level xml:lang; set entry.id from rdf:about;
# send Accept header
#3.0 - 6/21/2004 - MAP - don't try iso-8859-1 (can't distinguish between
# iso-8859-1 and windows-1252 anyway, and most incorrectly marked feeds are
# windows-1252); fixed regression that could cause the same encoding to be
# tried twice (even if it failed the first time)
#3.0.1 - 6/22/2004 - MAP - default to us-ascii for all text/* content types;
# recover from malformed content-type header parameter with no equals sign
# ('text/xml; charset:iso-8859-1')
#3.1 - 6/28/2004 - MAP - added and passed tests for converting HTML entities
# to Unicode equivalents in illformed feeds (aaronsw); added and
# passed tests for converting character entities to Unicode equivalents
# in illformed feeds (aaronsw); test for valid parsers when setting
# XML_AVAILABLE; make version and encoding available when server returns
# a 304; add handlers parameter to pass arbitrary urllib2 handlers (like
# digest auth or proxy support); add code to parse username/password
# out of url and send as basic authentication; expose downloading-related
# exceptions in bozo_exception (aaronsw); added __contains__ method to
# FeedParserDict (aaronsw); added publisher_detail (aaronsw)
#3.2 - 7/3/2004 - MAP - use cjkcodecs and iconv_codec if available; always
# convert feed to UTF-8 before passing to XML parser; completely revamped
# logic for determining character encoding and attempting XML parsing
# (much faster); increased default timeout to 20 seconds; test for presence
# of Location header on redirects; added tests for many alternate character
# encodings; support various EBCDIC encodings; support UTF-16BE and
# UTF16-LE with or without a BOM; support UTF-8 with a BOM; support
# UTF-32BE and UTF-32LE with or without a BOM; fixed crashing bug if no
# XML parsers are available; added support for 'Content-encoding: deflate';
# send blank 'Accept-encoding: ' header if neither gzip nor zlib modules
# are available
#3.3 - 7/15/2004 - MAP - optimize EBCDIC to ASCII conversion; fix obscure
# problem tracking xml:base and xml:lang if element declares it, child
# doesn't, first grandchild redeclares it, and second grandchild doesn't;
# refactored date parsing; defined public registerDateHandler so callers
# can add support for additional date formats at runtime; added support
# for OnBlog, Nate, MSSQL, Greek, and Hungarian dates (ytrewq1); added
# zopeCompatibilityHack() which turns FeedParserDict into a regular
# dictionary, required for Zope compatibility, and also makes command-
# line debugging easier because pprint module formats real dictionaries
# better than dictionary-like objects; added NonXMLContentType exception,
# which is stored in bozo_exception when a feed is served with a non-XML
# media type such as 'text/plain'; respect Content-Language as default
# language if no xml:lang is present; cloud dict is now FeedParserDict;
# generator dict is now FeedParserDict; better tracking of xml:lang,
# including support for xml:lang='' to unset the current language;
# recognize RSS 1.0 feeds even when RSS 1.0 namespace is not the default
# namespace; don't overwrite final status on redirects (scenarios:
# redirecting to a URL that returns 304, redirecting to a URL that
# redirects to another URL with a different type of redirect); add
# support for HTTP 303 redirects
#4.0 - MAP - support for relative URIs in xml:base attribute; fixed
# encoding issue with mxTidy (phopkins); preliminary support for RFC 3229;
# support for Atom 1.0; support for iTunes extensions; new 'tags' for
# categories/keywords/etc. as array of dict
# {'term': term, 'scheme': scheme, 'label': label} to match Atom 1.0
# terminology; parse RFC 822-style dates with no time; lots of other
# bug fixes
#4.1 - MAP - removed socket timeout; added support for chardet library