~ubuntu-branches/ubuntu/feisty/rss2email/feisty : revision 4

1

"""html2text: Turn HTML into equivalent Markdown-structured text."""

2

__version__ = "2.2"

3

__author__ = "Aaron Swartz (me@aaronsw.com)"

4

5

__contributors__ = ["Martin 'Joey' Schulze"]

6

7

# TODO:

8

# Support decoded entities with unifiable.

9

# Word wrap.

10

# Fix :s using buffering

11

# Relative URL resolution

12

13

import re, sys, urllib, htmlentitydefs, codecs, StringIO, types

14

import sgmllib

15

sgmllib.charref = re.compile('&#([xX]?[0-9a-fA-F]+)[^0-9a-fA-F]')

16

17

# textwrap is optional: optwrap() asserts that `wrap` is usable before
# wrapping, so bind the name to None when the module is missing instead of
# leaving it undefined. Only ImportError is expected here; a bare except
# would also hide unrelated failures.
try:
    from textwrap import wrap
except ImportError:
    wrap = None

19

20

# Use Unicode characters instead of their ASCII pseudo-replacements

21

UNICODE_SNOB = 0

22

23

# Put the links after each paragraph instead of at the end.

24

LINKS_EACH_PARAGRAPH = 0

25

26

# Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.)

27

BODY_WIDTH = 0

28

29

### Entity Nonsense ###

30

31

def name2cp(k):

32

if k == 'apos': return ord("'")

33

if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3

34

return htmlentitydefs.name2codepoint[k]

35

else:

36

k = htmlentitydefs.entitydefs[k]

37

if k.startswith("&#") and k.endswith(";"): return int(k[2:-1]) # not in latin-1

38

return ord(codecs.latin_1_decode(k)[0])

39

40

# Entity names mapped to the plain text that charref()/entityref()
# substitute for them while UNICODE_SNOB is off.
unifiable = {
    'rsquo': "'", 'lsquo': "'", 'rdquo': '"', 'ldquo': '"',
    'copy': '(C)', 'mdash': '--', 'nbsp': ' ',
    'rarr': '->', 'larr': '<-', 'middot': '*',
    'ndash': '-', 'oelig': 'oe', 'aelig': 'ae',
    'agrave': 'a', 'aacute': 'a', 'acirc': 'a',
    'atilde': 'a', 'auml': 'a', 'aring': 'a',
    'egrave': 'e', 'eacute': 'e', 'ecirc': 'e', 'euml': 'e',
    'igrave': 'i', 'iacute': 'i', 'icirc': 'i', 'iuml': 'i',
    'ograve': 'o', 'oacute': 'o', 'ocirc': 'o',
    'otilde': 'o', 'ouml': 'o',
    'ugrave': 'u', 'uacute': 'u', 'ucirc': 'u', 'uuml': 'u',
}

# The same replacements keyed by code point, for numeric references.
unifiable_n = {}

for k in unifiable:
    unifiable_n[name2cp(k)] = unifiable[k]

53

54

def charref(name):
    """Return the text for a numeric character reference.

    *name* is the part between "&#" and ";": decimal digits, or "x"/"X"
    followed by hexadecimal digits.

    When UNICODE_SNOB is off and the code point has an entry in
    unifiable_n, that replacement text is returned; otherwise the
    character itself is returned as a unicode string.

    A reference that is not a valid number (for example "&#x;" or
    "&#abc;") or that names a code point unichr() rejects is passed
    through as the literal text "&#" + name instead of raising, mirroring
    how entityref() passes unknown entity names through.
    """
    try:
        if name[:1] in ('x', 'X'):
            c = int(name[1:], 16)
        else:
            c = int(name)
    except ValueError:
        return "&#" + name

    # Direct dict membership; no need to build the key list first.
    if not UNICODE_SNOB and c in unifiable_n:
        return unifiable_n[c]

    try:
        return unichr(c)
    except (ValueError, OverflowError):
        # Code point outside the range this interpreter supports.
        return "&#" + name

64

65

def entityref(c):
    """Return the text for the named entity reference *c*.

    With UNICODE_SNOB off, names listed in `unifiable` yield their
    replacement text. Any other name known to name2cp() yields the
    corresponding unicode character; an unknown name is handed back as
    "&" followed by the name.
    """
    if not UNICODE_SNOB and c in unifiable:
        return unifiable[c]

    try:
        codepoint = name2cp(c)
    except KeyError:
        return "&" + c
    return unichr(codepoint)

72

73

def replaceEntities(s):
    """Substitution callback: turn one matched reference into its text.

    *s* is a match object whose group 1 holds the reference without the
    surrounding "&" and ";". A leading "#" selects charref(); anything
    else goes to entityref().
    """
    body = s.group(1)
    if body[0] == "#":
        return charref(body[1:])
    return entityref(body)

78

79

# Matches "&...;" where the body is an optional "#", an optional "x"/"X",
# then either hex digits or one to eight word characters.
r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")

def unescape(s):
    """Return *s* with every reference matched by r_unescape replaced."""
    replaced = r_unescape.sub(replaceEntities, s)
    return replaced

82

83

def fixattrs(attrs):
    """Return the (name, value) pairs in *attrs* with values unescaped.

    Works around sgmllib.py leaving entity references in attribute
    values. A false *attrs* (None or empty) is returned as-is.
    """
    if not attrs:
        return attrs
    return [(attr[0], unescape(attr[1])) for attr in attrs]

90

91

### End Entity Nonsense ###

92

93

def onlywhite(line):
    """Return True if *line* is non-empty and consists only of whitespace.

    An empty line yields False, which matches the falsy result the
    previous implementation gave for it.

    The old code compared characters with `is`, which tests object
    identity rather than equality, and only ever recognised the space
    character. str.isspace() compares by value and also accepts tabs and
    other whitespace.
    """
    return line.isspace()

99

100

def optwrap(text):
    """Wrap all paragraphs in the provided text.

    Returns *text* unchanged when BODY_WIDTH is 0. Otherwise every
    non-empty line is treated as a paragraph:

    * a line starting with a space, "-" or "*" (indented text, list
      items) is copied as-is, unless it is whitespace only, in which case
      it is dropped;
    * any other line is re-wrapped to BODY_WIDTH columns and followed by
      a blank line.

    Runs of empty lines are limited so that at most two consecutive
    newlines are produced.
    """
    if not BODY_WIDTH:
        return text

    assert wrap # Requires Python 2.3.
    pieces = []
    newlines = 0
    for para in text.split("\n"):
        if len(para) > 0:
            # Compare by value: the previous `is not` identity tests never
            # matched characters taken from a unicode string, so list
            # items and indented lines were wrapped like ordinary text.
            if para[0] not in (' ', '-', '*'):
                for line in wrap(para, BODY_WIDTH):
                    pieces.append(line + "\n")
                pieces.append("\n")
                newlines = 2
            elif not onlywhite(para):
                pieces.append(para + "\n")
                newlines = 1
        elif newlines < 2:
            pieces.append("\n")
            newlines += 1
    return ''.join(pieces)

124

125

def hn(tag):
    """Return the heading level for tags "h1" .. "h9", otherwise 0.

    Always returns an int: the old code returned None for most
    non-heading tags but 0 for some, and raised IndexError on an empty
    tag name.
    """
    if len(tag) != 2 or tag[0] != 'h':
        return 0
    try:
        n = int(tag[1])
    except ValueError:
        return 0
    if 1 <= n <= 9:
        return n
    return 0

131

132

class _html2text(sgmllib.SGMLParser):
    """SGML parser that emits Markdown-structured text while reading HTML.

    Every piece of output is passed to the *out* callable. When *out* is
    None the pieces are collected in self.outtext, which close() returns.
    """

    def __init__(self, out=sys.stdout.write):
        sgmllib.SGMLParser.__init__(self)

        if out is None:
            self.out = self.outtextf
        else:
            self.out = out
        self.outtext = u''    # text collected by outtextf()
        self.quiet = 0        # > 0 while inside head/style/script
        self.p_p = 0          # pending line breaks: 1 from pbr(), 2 from p()
        self.outcount = 0     # number of o() calls that reached self.out(data)
        self.start = 1        # next o() discards pending breaks and space
        self.space = 0        # a collapsed leading space is pending
        self.a = []           # links whose reference lines are not yet written
        self.astack = []      # attribute dicts (or None) of the open <a> tags
        self.acount = 0       # last link reference number handed out
        self.list = []        # open ol/ul lists, innermost last
        self.blockquote = 0   # blockquote nesting depth
        self.pre = 0          # inside <pre>
        self.startpre = 0     # <pre> was opened and nothing written since
        self.lastWasNL = 0    # last data written ended with "\n"

    def outtextf(self, s):
        """Default output sink: append *s* to self.outtext as unicode."""
        if isinstance(s, str):
            # Decode leniently so that bytes which are not valid UTF-8 do
            # not abort the whole conversion with a UnicodeDecodeError.
            s = codecs.utf_8_decode(s, "replace")[0]
        self.outtext += s

    def close(self):
        """Finish parsing, flush pending output and return self.outtext."""
        sgmllib.SGMLParser.close(self)

        self.pbr()
        self.o('', 0, 'end')

        return self.outtext

    def handle_charref(self, c):
        self.o(charref(c))

    def handle_entityref(self, c):
        self.o(entityref(c))

    def unknown_starttag(self, tag, attrs):
        self.handle_tag(tag, attrs, 1)

    def unknown_endtag(self, tag):
        self.handle_tag(tag, None, 0)

    def previousIndex(self, attrs):
        """Return the index in self.a of a link equal to *attrs*, or None.

        Two links are equal when their 'href' values match and either
        both lack a 'title' or both carry the same 'title'.
        """
        if 'href' not in attrs:
            return None

        i = -1
        for a in self.a:
            i += 1
            if 'href' not in a or a['href'] != attrs['href']:
                continue
            if ('title' in a) != ('title' in attrs):
                continue
            if a.get('title') == attrs.get('title'):
                return i
        return None

    def handle_tag(self, tag, attrs, start):
        """Translate one tag into output and parser state changes.

        tag   -- tag name as reported by sgmllib
        attrs -- list of (name, value) pairs for a start tag, None for an
                 end tag
        start -- true for a start tag, false for an end tag
        """
        attrs = fixattrs(attrs)

        level = hn(tag)
        if level:
            self.p()
            if start:
                self.o(level * "#" + ' ')

        if tag in ['p', 'div']:
            self.p()

        if tag == "br" and start:
            # Markdown needs two trailing spaces for a hard line break.
            self.o("  \n")

        if tag == "hr" and start:
            self.p()
            self.o("* * *")
            self.p()

        if tag in ["head", "style", 'script']:
            if start:
                self.quiet += 1
            elif self.quiet > 0:
                # Ignore stray end tags: a negative counter is truthy and
                # would silence every later o() call.
                self.quiet -= 1

        if tag == "blockquote":
            if start:
                self.p()
                self.o('> ', 0, 1)
                self.start = 1
                self.blockquote += 1
            else:
                if self.blockquote > 0:  # ignore stray end tags
                    self.blockquote -= 1
                self.p()

        if tag in ['em', 'i', 'u']:
            self.o("_")
        if tag in ['strong', 'b']:
            self.o("**")
        if tag == "code" and not self.pre:
            self.o('`') #TODO: `` `this` ``

        if tag == "a":
            if start:
                attrs = dict(attrs)
                if 'href' in attrs:
                    self.astack.append(attrs)
                    self.o("[")
                else:
                    # Keep the stack balanced for anchors without href.
                    self.astack.append(None)
            elif self.astack:
                a = self.astack.pop()
                if a:
                    i = self.previousIndex(a)
                    if i is not None:
                        # Reuse the number of an identical pending link.
                        a = self.a[i]
                    else:
                        self.acount += 1
                        a['count'] = self.acount
                        a['outcount'] = self.outcount
                        self.a.append(a)
                    self.o("][" + str(a['count']) + "]")

        if tag == "img" and start:
            attrs = dict(attrs)
            if 'src' in attrs:
                attrs['href'] = attrs['src']
                alt = attrs.get('alt', '')
                alt = re.sub('\n', ' ', alt)
                i = self.previousIndex(attrs)
                if i is not None:
                    attrs = self.a[i]
                else:
                    self.acount += 1
                    attrs['count'] = self.acount
                    attrs['outcount'] = self.outcount
                    self.a.append(attrs)
                self.o("![")
                self.o(alt)
                self.o("][" + str(attrs['count']) + "]")

        if tag in ["ol", "ul"]:
            if start:
                self.list.append({'name': tag, 'num': 0})
            elif self.list:
                # A stray end tag used to raise IndexError here.
                self.list.pop()

            self.p()

        if tag == 'li':
            if start:
                self.pbr()
                if self.list:
                    li = self.list[-1]
                else:
                    # <li> outside any list: treat as an unordered item.
                    li = {'name': 'ul', 'num': 0}
                self.o(" " * len(self.list)) #TODO: line up <ol><li>s > 9 correctly.
                if li['name'] == "ul":
                    self.o("* ")
                elif li['name'] == "ol":
                    li['num'] += 1
                    self.o(str(li['num']) + ". ")
                self.start = 1
            else:
                self.pbr()

        if tag in ['tr']:
            self.pbr()

        if tag == "pre":
            if start:
                self.startpre = 1
                self.pre = 1
            else:
                self.pre = 0
            self.p()

    def pbr(self):
        """Request a single line break unless a break is already pending."""
        if self.p_p == 0:
            self.p_p = 1

    def p(self):
        """Request a paragraph break (two line breaks)."""
        self.p_p = 2

    def o(self, data, puredata=0, force=0):
        """Write *data*, first flushing pending breaks, space and links.

        data     -- text to write
        puredata -- true for document text; whitespace runs are then
                    collapsed to one space unless inside <pre>
        force    -- true to proceed even when *data* is empty; the value
                    'end' marks the final call made by close()

        Does nothing while self.quiet is set.
        """
        if self.quiet:
            return

        if puredata and not self.pre:
            data = re.sub(r'\s+', ' ', data)
            if data and data[0] == ' ':
                self.space = 1
                data = data[1:]
        if not data and not force:
            return

        if self.startpre:
            self.out(" :") #TODO: not output when already one there
            self.startpre = 0

        # Prefix repeated after every line break inside a blockquote.
        bq = ">" * self.blockquote
        if not (force and data and data[0] == ">") and self.blockquote:
            bq += " "

        if self.pre:
            # Markdown code blocks are indented by four spaces.
            bq += "    "
            data = data.replace("\n", "\n" + bq)

        if self.start:
            self.space = 0
            self.p_p = 0
            self.start = 0

        if force == 'end':
            # It's the end.
            self.p_p = 0
            self.out("\n")
            self.space = 0

        if self.p_p:
            self.out(('\n' + bq) * self.p_p)
            self.space = 0

        if self.space:
            if not self.lastWasNL:
                self.out(' ')
            self.space = 0

        if self.a and ((self.p_p == 2 and LINKS_EACH_PARAGRAPH) or force == "end"):
            if force == "end":
                self.out("\n")

            newa = []
            for link in self.a:
                if self.outcount > link['outcount']:
                    self.out(" [" + str(link['count']) + "]: " + link['href']) #TODO: base href
                    if 'title' in link:
                        self.out(" (" + link['title'] + ")")
                    self.out("\n")
                else:
                    newa.append(link)

            if self.a != newa:
                self.out("\n") # Don't need an extra line when nothing was done.

            self.a = newa

        self.p_p = 0
        self.out(data)
        self.lastWasNL = data[-1:] == '\n'
        self.outcount += 1

    def handle_data(self, data):
        self.o(data, 1)

    def unknown_decl(self, data):
        pass

379

380

def html2text_file(html, out=sys.stdout.write):
    """Convert *html*, sending the generated text to the callable *out*.

    Returns whatever the parser's close() returns. With *out* set to None
    the parser collects the text itself, so that value is the converted
    text.
    """
    parser = _html2text(out)
    # The trailing empty feed is kept from the original call sequence.
    for chunk in (html, ""):
        parser.feed(chunk)
    return parser.close()

385

386

def html2text(html):
    """Return *html* converted to text and passed through optwrap()."""
    converted = html2text_file(html, None)
    return optwrap(converted)

388

389

# Command-line use: convert the URL or file named by the first argument,
# or standard input when no argument is given, writing to standard output.
if __name__ == "__main__":
    if sys.argv[1:]:
        arg = sys.argv[1]
        if arg.startswith('http://') or arg.startswith('https://'):
            source = urllib.urlopen(arg)
        else:
            source = open(arg, 'r')
        # Close the handle even if reading fails.
        try:
            data = source.read()
        finally:
            source.close()
    else:
        data = sys.stdin.read()
    html2text_file(data)

1

"""html2text: Turn HTML into equivalent Markdown-structured text."""

2

__version__ = "2.24"

3

__author__ = "Aaron Swartz (me@aaronsw.com)"

4

5

__contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes"]

6

7

# TODO:

8

# Support decoded entities with unifiable.

9

# Relative URL resolution

10

11

if not hasattr(__builtins__, 'True'): True, False = 1, 0

12

import re, sys, urllib, htmlentitydefs, codecs, StringIO, types

13

import sgmllib

14

sgmllib.charref = re.compile('&#([xX]?[0-9a-fA-F]+)[^0-9a-fA-F]')

15

16

# textwrap is optional: optwrap() asserts that `wrap` is usable before
# wrapping, so bind the name to None when the module is missing instead of
# leaving it undefined. Only ImportError is expected here; a bare except
# would also hide unrelated failures.
try:
    from textwrap import wrap
except ImportError:
    wrap = None

18

19

# Use Unicode characters instead of their ASCII pseudo-replacements

20

UNICODE_SNOB = 0

21

22

# Put the links after each paragraph instead of at the end.

23

LINKS_EACH_PARAGRAPH = 0

24

25

# Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.)

26

BODY_WIDTH = 0

27

28

### Entity Nonsense ###

29

30

def name2cp(k):

31

if k == 'apos': return ord("'")

32

if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3

33

return htmlentitydefs.name2codepoint[k]

34

else:

35

k = htmlentitydefs.entitydefs[k]

36

if k.startswith("&#") and k.endswith(";"): return int(k[2:-1]) # not in latin-1

37

return ord(codecs.latin_1_decode(k)[0])

38

39

# Entity names mapped to the plain text that charref()/entityref()
# substitute for them while UNICODE_SNOB is off.
unifiable = {
    'rsquo': "'", 'lsquo': "'", 'rdquo': '"', 'ldquo': '"',
    'copy': '(C)', 'mdash': '--', 'nbsp': ' ',
    'rarr': '->', 'larr': '<-', 'middot': '*',
    'ndash': '-', 'oelig': 'oe', 'aelig': 'ae',
    'agrave': 'a', 'aacute': 'a', 'acirc': 'a',
    'atilde': 'a', 'auml': 'a', 'aring': 'a',
    'egrave': 'e', 'eacute': 'e', 'ecirc': 'e', 'euml': 'e',
    'igrave': 'i', 'iacute': 'i', 'icirc': 'i', 'iuml': 'i',
    'ograve': 'o', 'oacute': 'o', 'ocirc': 'o',
    'otilde': 'o', 'ouml': 'o',
    'ugrave': 'u', 'uacute': 'u', 'ucirc': 'u', 'uuml': 'u',
}

# The same replacements keyed by code point, for numeric references.
unifiable_n = {}

for k in unifiable:
    unifiable_n[name2cp(k)] = unifiable[k]

52

53

def charref(name):
    """Return the text for a numeric character reference.

    *name* is the part between "&#" and ";": decimal digits, or "x"/"X"
    followed by hexadecimal digits.

    When UNICODE_SNOB is off and the code point has an entry in
    unifiable_n, that replacement text is returned; otherwise the
    character itself is returned as a unicode string.

    A reference that is not a valid number (for example "&#x;" or
    "&#abc;") or that names a code point unichr() rejects is passed
    through as the literal text "&#" + name instead of raising, mirroring
    how entityref() passes unknown entity names through.
    """
    try:
        if name[:1] in ('x', 'X'):
            c = int(name[1:], 16)
        else:
            c = int(name)
    except ValueError:
        return "&#" + name

    # Direct dict membership; no need to build the key list first.
    if not UNICODE_SNOB and c in unifiable_n:
        return unifiable_n[c]

    try:
        return unichr(c)
    except (ValueError, OverflowError):
        # Code point outside the range this interpreter supports.
        return "&#" + name

63

64

def entityref(c):
    """Return the text for the named entity reference *c*.

    With UNICODE_SNOB off, names listed in `unifiable` yield their
    replacement text. Any other name known to name2cp() yields the
    corresponding unicode character; an unknown name is handed back as
    "&" followed by the name.
    """
    if not UNICODE_SNOB and c in unifiable:
        return unifiable[c]

    try:
        codepoint = name2cp(c)
    except KeyError:
        return "&" + c
    return unichr(codepoint)

71

72

def replaceEntities(s):
    """Substitution callback: turn one matched reference into its text.

    *s* is a match object whose group 1 holds the reference without the
    surrounding "&" and ";". A leading "#" selects charref(); anything
    else goes to entityref().
    """
    body = s.group(1)
    if body[0] == "#":
        return charref(body[1:])
    return entityref(body)

77

78

# Matches "&...;" where the body is an optional "#", an optional "x"/"X",
# then either hex digits or one to eight word characters.
r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")

def unescape(s):
    """Return *s* with every reference matched by r_unescape replaced."""
    replaced = r_unescape.sub(replaceEntities, s)
    return replaced

81

82

def fixattrs(attrs):
    """Return the (name, value) pairs in *attrs* with values unescaped.

    Works around sgmllib.py leaving entity references in attribute
    values. A false *attrs* (None or empty) is returned as-is.
    """
    if not attrs:
        return attrs
    return [(attr[0], unescape(attr[1])) for attr in attrs]

89

90

### End Entity Nonsense ###

91

92

def onlywhite(line):
    """Return True if *line* is non-empty and consists only of whitespace.

    An empty line yields False, which matches the falsy result the
    previous implementation gave for it.

    The old code compared characters with `is`, which tests object
    identity rather than equality, and only ever recognised the space
    character. str.isspace() compares by value and also accepts tabs and
    other whitespace.
    """
    return line.isspace()

98

99

def optwrap(text):
    """Wrap all paragraphs in the provided text.

    Returns *text* unchanged when BODY_WIDTH is 0. Otherwise every
    non-empty line is treated as a paragraph:

    * a line starting with a space, "-" or "*" (indented text, list
      items) is copied as-is, unless it is whitespace only, in which case
      it is dropped;
    * any other line is re-wrapped to BODY_WIDTH columns and followed by
      a blank line.

    Runs of empty lines are limited so that at most two consecutive
    newlines are produced.
    """
    if not BODY_WIDTH:
        return text

    assert wrap # Requires Python 2.3.
    pieces = []
    newlines = 0
    for para in text.split("\n"):
        if len(para) > 0:
            # Compare by value: the previous `is not` identity tests never
            # matched characters taken from a unicode string, so list
            # items and indented lines were wrapped like ordinary text.
            if para[0] not in (' ', '-', '*'):
                for line in wrap(para, BODY_WIDTH):
                    pieces.append(line + "\n")
                pieces.append("\n")
                newlines = 2
            elif not onlywhite(para):
                pieces.append(para + "\n")
                newlines = 1
        elif newlines < 2:
            pieces.append("\n")
            newlines += 1
    return ''.join(pieces)

123

124

def hn(tag):
    """Return the heading level for tags "h1" .. "h9", otherwise 0.

    Always returns an int: the old code returned None for most
    non-heading tags but 0 for some, and raised IndexError on an empty
    tag name.
    """
    if len(tag) != 2 or tag[0] != 'h':
        return 0
    try:
        n = int(tag[1])
    except ValueError:
        return 0
    if 1 <= n <= 9:
        return n
    return 0

130

131

class _html2text(sgmllib.SGMLParser):
    """SGML parser that emits Markdown-structured text while reading HTML.

    Every piece of output is passed to the *out* callable. When *out* is
    None the pieces are collected in self.outtext, which close() returns.
    """

    def __init__(self, out=sys.stdout.write):
        sgmllib.SGMLParser.__init__(self)

        if out is None:
            self.out = self.outtextf
        else:
            self.out = out
        self.outtext = u''    # text collected by outtextf()
        self.quiet = 0        # > 0 while inside head/style/script
        self.p_p = 0          # pending line breaks: 1 from pbr(), 2 from p()
        self.outcount = 0     # number of o() calls that reached self.out(data)
        self.start = 1        # next o() discards pending breaks and space
        self.space = 0        # a collapsed leading space is pending
        self.a = []           # links whose reference lines are not yet written
        self.astack = []      # attribute dicts (or None) of the open <a> tags
        self.acount = 0       # last link reference number handed out
        self.list = []        # open ol/ul lists, innermost last
        self.blockquote = 0   # blockquote nesting depth
        self.pre = 0          # inside <pre>
        self.startpre = 0     # <pre> was opened and nothing written since
        self.lastWasNL = 0    # last data written ended with "\n"

    def outtextf(self, s):
        """Default output sink: append *s* to self.outtext as unicode.

        Byte strings are decoded as UTF-8 with undecodable bytes replaced.
        """
        if isinstance(s, str):
            s = codecs.utf_8_decode(s, "replace")[0]
        self.outtext += s

    def close(self):
        """Finish parsing, flush pending output and return self.outtext."""
        sgmllib.SGMLParser.close(self)

        self.pbr()
        self.o('', 0, 'end')

        return self.outtext

    def handle_charref(self, c):
        self.o(charref(c))

    def handle_entityref(self, c):
        self.o(entityref(c))

    def unknown_starttag(self, tag, attrs):
        self.handle_tag(tag, attrs, 1)

    def unknown_endtag(self, tag):
        self.handle_tag(tag, None, 0)

    def previousIndex(self, attrs):
        """Return the index in self.a of a link equal to *attrs*, or None.

        Two links are equal when their 'href' values match and either
        both lack a 'title' or both carry the same 'title'.
        """
        if 'href' not in attrs:
            return None

        i = -1
        for a in self.a:
            i += 1
            if 'href' not in a or a['href'] != attrs['href']:
                continue
            if ('title' in a) != ('title' in attrs):
                continue
            if a.get('title') == attrs.get('title'):
                return i
        return None

    def handle_tag(self, tag, attrs, start):
        """Translate one tag into output and parser state changes.

        tag   -- tag name as reported by sgmllib
        attrs -- list of (name, value) pairs for a start tag, None for an
                 end tag
        start -- true for a start tag, false for an end tag
        """
        attrs = fixattrs(attrs)

        level = hn(tag)
        if level:
            self.p()
            if start:
                self.o(level * "#" + ' ')

        if tag in ['p', 'div']:
            self.p()

        if tag == "br" and start:
            # Markdown needs two trailing spaces for a hard line break.
            self.o("  \n")

        if tag == "hr" and start:
            self.p()
            self.o("* * *")
            self.p()

        if tag in ["head", "style", 'script']:
            if start:
                self.quiet += 1
            elif self.quiet > 0:
                # Ignore stray end tags: a negative counter is truthy and
                # would silence every later o() call.
                self.quiet -= 1

        if tag == "blockquote":
            if start:
                self.p()
                self.o('> ', 0, 1)
                self.start = 1
                self.blockquote += 1
            else:
                if self.blockquote > 0:  # ignore stray end tags
                    self.blockquote -= 1
                self.p()

        if tag in ['em', 'i', 'u']:
            self.o("_")
        if tag in ['strong', 'b']:
            self.o("**")
        if tag == "code" and not self.pre:
            self.o('`') #TODO: `` `this` ``

        if tag == "a":
            if start:
                attrs = dict(attrs)
                if 'href' in attrs:
                    self.astack.append(attrs)
                    self.o("[")
                else:
                    # Keep the stack balanced for anchors without href.
                    self.astack.append(None)
            elif self.astack:
                a = self.astack.pop()
                if a:
                    i = self.previousIndex(a)
                    if i is not None:
                        # Reuse the number of an identical pending link.
                        a = self.a[i]
                    else:
                        self.acount += 1
                        a['count'] = self.acount
                        a['outcount'] = self.outcount
                        self.a.append(a)
                    self.o("][" + str(a['count']) + "]")

        if tag == "img" and start:
            attrs = dict(attrs)
            if 'src' in attrs:
                attrs['href'] = attrs['src']
                alt = attrs.get('alt', '')
                alt = re.sub('\n', ' ', alt)
                i = self.previousIndex(attrs)
                if i is not None:
                    attrs = self.a[i]
                else:
                    self.acount += 1
                    attrs['count'] = self.acount
                    attrs['outcount'] = self.outcount
                    self.a.append(attrs)
                self.o("![")
                self.o(alt)
                self.o("][" + str(attrs['count']) + "]")

        # Definition lists.
        if tag == 'dl' and start:
            self.p()
        if tag == 'dt' and not start:
            self.pbr()
        if tag == 'dd':
            if start:
                self.o(' ')
            else:
                self.pbr()

        if tag in ["ol", "ul"]:
            if start:
                self.list.append({'name': tag, 'num': 0})
            elif self.list:
                # Stray end tags with no open list are ignored.
                self.list.pop()

            self.p()

        if tag == 'li':
            if start:
                self.pbr()
                if self.list:
                    li = self.list[-1]
                else:
                    # <li> outside any list: treat as an unordered item.
                    li = {'name': 'ul', 'num': 0}
                self.o(" " * len(self.list)) #TODO: line up <ol><li>s > 9 correctly.
                if li['name'] == "ul":
                    self.o("* ")
                elif li['name'] == "ol":
                    li['num'] += 1
                    self.o(str(li['num']) + ". ")
                self.start = 1
            else:
                self.pbr()

        if tag in ['tr']:
            self.pbr()

        if tag == "pre":
            if start:
                self.startpre = 1
                self.pre = 1
            else:
                self.pre = 0
            self.p()

    def pbr(self):
        """Request a single line break unless a break is already pending."""
        if self.p_p == 0:
            self.p_p = 1

    def p(self):
        """Request a paragraph break (two line breaks)."""
        self.p_p = 2

    def o(self, data, puredata=0, force=0):
        """Write *data*, first flushing pending breaks, space and links.

        data     -- text to write
        puredata -- true for document text; whitespace runs are then
                    collapsed to one space unless inside <pre>
        force    -- true to proceed even when *data* is empty; the value
                    'end' marks the final call made by close()

        Does nothing while self.quiet is set.
        """
        if self.quiet:
            return

        if puredata and not self.pre:
            data = re.sub(r'\s+', ' ', data)
            if data and data[0] == ' ':
                self.space = 1
                data = data[1:]
        if not data and not force:
            return

        # Nothing is written when a <pre> block starts; the flag is only
        # reset so the state stays consistent.
        if self.startpre:
            self.startpre = 0

        # Prefix repeated after every line break inside a blockquote.
        bq = ">" * self.blockquote
        if not (force and data and data[0] == ">") and self.blockquote:
            bq += " "

        if self.pre:
            # Markdown code blocks are indented by four spaces.
            bq += "    "
            data = data.replace("\n", "\n" + bq)

        if self.start:
            self.space = 0
            self.p_p = 0
            self.start = 0

        if force == 'end':
            # It's the end.
            self.p_p = 0
            self.out("\n")
            self.space = 0

        if self.p_p:
            self.out(('\n' + bq) * self.p_p)
            self.space = 0

        if self.space:
            if not self.lastWasNL:
                self.out(' ')
            self.space = 0

        if self.a and ((self.p_p == 2 and LINKS_EACH_PARAGRAPH) or force == "end"):
            if force == "end":
                self.out("\n")

            newa = []
            for link in self.a:
                if self.outcount > link['outcount']:
                    self.out(" [" + str(link['count']) + "]: " + link['href']) #TODO: base href
                    if 'title' in link:
                        self.out(" (" + link['title'] + ")")
                    self.out("\n")
                else:
                    newa.append(link)

            if self.a != newa:
                self.out("\n") # Don't need an extra line when nothing was done.

            self.a = newa

        self.p_p = 0
        self.out(data)
        self.lastWasNL = data[-1:] == '\n'
        self.outcount += 1

    def handle_data(self, data):
        self.o(data, 1)

    def unknown_decl(self, data):
        pass

382

383

def html2text_file(html, out=sys.stdout.write):
    """Convert *html*, sending the generated text to the callable *out*.

    Returns whatever the parser's close() returns. With *out* set to None
    the parser collects the text itself, so that value is the converted
    text.
    """
    parser = _html2text(out)
    # The trailing empty feed is kept from the original call sequence.
    for chunk in (html, ""):
        parser.feed(chunk)
    return parser.close()

388

389

def html2text(html):
    """Return *html* converted to text and passed through optwrap()."""
    converted = html2text_file(html, None)
    return optwrap(converted)

391

392

# Command-line use: convert the URL or file named by the first argument,
# or standard input when no argument is given, writing to standard output.
if __name__ == "__main__":
    if sys.argv[1:]:
        arg = sys.argv[1]
        if arg.startswith('http://') or arg.startswith('https://'):
            source = urllib.urlopen(arg)
        else:
            source = open(arg, 'r')
        # Close the handle even if reading fails.
        try:
            data = source.read()
        finally:
            source.close()
    else:
        data = sys.stdin.read()
    html2text_file(data)

402