1
"""html2text: Turn HTML into equivalent Markdown-structured text."""
3
__author__ = "Aaron Swartz (me@aaronsw.com)"
4
__copyright__ = "(C) 2004 Aaron Swartz. GNU GPL 2."
5
__contributors__ = ["Martin 'Joey' Schulze"]
8
# Support decoded entities with unifiable.
10
# Fix :s using buffering
11
# Relative URL resolution
13
import re, sys, urllib, htmlentitydefs, codecs, StringIO, types
15
sgmllib.charref = re.compile('&#([xX]?[0-9a-fA-F]+)[^0-9a-fA-F]')
17
try: from textwrap import wrap
20
# Use Unicode characters instead of their ASCII pseudo-replacements
23
# Put the links after each paragraph instead of at the end.
24
LINKS_EACH_PARAGRAPH = 0
26
# Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.)
29
### Entity Nonsense ###
32
if k == 'apos': return ord("'")
33
if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3
34
return htmlentitydefs.name2codepoint[k]
36
k = htmlentitydefs.entitydefs[k]
37
if k.startswith("&#") and k.endswith(";"): return int(k[2:-1]) # not in latin-1
38
return ord(codecs.latin_1_decode(k)[0])
40
unifiable = {'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"',
41
'copy':'(C)', 'mdash':'--', 'nbsp':' ', 'rarr':'->', 'larr':'<-', 'middot':'*',
42
'ndash':'-', 'oelig':'oe', 'aelig':'ae',
43
'agrave':'a', 'aacute':'a', 'acirc':'a', 'atilde':'a', 'auml':'a', 'aring':'a',
44
'egrave':'e', 'eacute':'e', 'ecirc':'e', 'euml':'e',
45
'igrave':'i', 'iacute':'i', 'icirc':'i', 'iuml':'i',
46
'ograve':'o', 'oacute':'o', 'ocirc':'o', 'otilde':'o', 'ouml':'o',
47
'ugrave':'u', 'uacute':'u', 'ucirc':'u', 'uuml':'u'}
51
for k in unifiable.keys():
52
unifiable_n[name2cp(k)] = unifiable[k]
55
if name[0] in ['x','X']:
60
if not UNICODE_SNOB and c in unifiable_n.keys():
66
if not UNICODE_SNOB and c in unifiable.keys():
70
except KeyError: return "&" + c
71
else: return unichr(name2cp(c))
73
def replaceEntities(s):
77
else: return entityref(s)
79
r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")
81
return r_unescape.sub(replaceEntities, s)
84
# Fix bug in sgmllib.py
85
if not attrs: return attrs
88
newattrs.append((attr[0], unescape(attr[1])))
91
### End Entity Nonsense ###
94
"""Return true if the line does only consist of whitespace characters."""
96
if c is not ' ' and c is not ' ':
101
"""Wrap all paragraphs in the provided text."""
105
assert wrap # Requires Python 2.3.
108
for para in text.split("\n"):
110
if para[0] is not ' ' and para[0] is not '-' and para[0] is not '*':
111
for line in wrap(para, BODY_WIDTH):
112
result += line + "\n"
116
if not onlywhite(para):
117
result += para + "\n"
126
if tag[0] == 'h' and len(tag) == 2:
129
if n in range(1, 10): return n
130
except ValueError: return 0
132
class _html2text(sgmllib.SGMLParser):
133
def __init__(self, out=sys.stdout.write):
134
sgmllib.SGMLParser.__init__(self)
136
if out is None: self.out = self.outtextf
153
def outtextf(self, s):
154
if type(s) is type(''): s = codecs.utf_8_decode(s)[0]
158
sgmllib.SGMLParser.close(self)
165
def handle_charref(self, c):
168
def handle_entityref(self, c):
171
def unknown_starttag(self, tag, attrs):
    """Pass an opening tag and its attributes on to handle_tag.

    The third argument (1) marks the event as the start of the tag.
    """
    is_start = 1
    self.handle_tag(tag, attrs, is_start)
174
def unknown_endtag(self, tag):
    """Pass a closing tag on to handle_tag.

    Closing tags carry no attributes, so None is supplied, and the
    third argument (0) marks the event as the end of the tag.
    """
    is_start = 0
    self.handle_tag(tag, None, is_start)
177
def previousIndex(self, attrs):
178
""" returns the index of certain set of attributes (of a link) in the
181
If the set of attributes is not found, returns None
183
if not attrs.has_key('href'): return None
190
if a.has_key('href') and a['href'] == attrs['href']:
191
if a.has_key('title') or attrs.has_key('title'):
192
if (a.has_key('title') and attrs.has_key('title') and
193
a['title'] == attrs['title']):
200
def handle_tag(self, tag, attrs, start):
201
attrs = fixattrs(attrs)
205
if start: self.o(hn(tag)*"#" + ' ')
207
if tag in ['p', 'div']: self.p()
209
if tag == "br" and start: self.o(" \n")
211
if tag == "hr" and start:
216
if tag in ["head", "style", 'script']:
217
if start: self.quiet += 1
218
else: self.quiet -= 1
220
if tag == "blockquote":
222
self.p(); self.o('> ', 0, 1); self.start = 1
228
if tag in ['em', 'i', 'u']: self.o("_")
229
if tag in ['strong', 'b']: self.o("**")
230
if tag == "code" and not self.pre: self.o('`') #TODO: `` `this` ``
235
for (x, y) in attrs: attrsD[x] = y
237
if attrs.has_key('href'):
238
self.astack.append(attrs)
241
self.astack.append(None)
244
a = self.astack.pop()
246
i = self.previousIndex(a)
251
a['count'] = self.acount
252
a['outcount'] = self.outcount
254
self.o("][" + `a['count']` + "]")
256
if tag == "img" and start:
258
for (x, y) in attrs: attrsD[x] = y
260
if attrs.has_key('src'):
261
attrs['href'] = attrs['src']
262
alt = attrs.get('alt', '')
263
alt = re.sub('\n', ' ', alt)
264
i = self.previousIndex(attrs)
269
attrs['count'] = self.acount
270
attrs['outcount'] = self.outcount
274
self.o("]["+`attrs['count']`+"]")
276
if tag in ["ol", "ul"]:
278
self.list.append({'name':tag, 'num':0})
287
if self.list: li = self.list[-1]
288
else: li = {'name':'ul', 'num':0}
289
self.o(" "*len(self.list)) #TODO: line up <ol><li>s > 9 correctly.
290
if li['name'] == "ul": self.o("* ")
291
elif li['name'] == "ol":
293
self.o(`li['num']`+". ")
298
if tag in ['tr']: self.pbr()
309
if self.p_p == 0: self.p_p = 1
311
def p(self):
    """Request a paragraph break by setting the pending-break counter to 2."""
    self.p_p = 2
314
def o(self, data, puredata=0, force=0):
316
if puredata and not self.pre:
317
data = re.sub('\s+', ' ', data)
318
if data and data[0] == ' ':
321
if not data and not force: return
324
self.out(" :") #TODO: not output when already one there
327
bq = (">" * self.blockquote)
328
if not (force and data and data[0] == ">") and self.blockquote: bq += " "
332
data = data.replace("\n", "\n"+bq)
347
self.out(('\n'+bq)*self.p_p)
351
if not self.lastWasNL: self.out(' ')
354
if self.a and ((self.p_p == 2 and LINKS_EACH_PARAGRAPH) or force == "end"):
355
if force == "end": self.out("\n")
359
if self.outcount > link['outcount']:
360
self.out(" ["+`link['count']`+"]: " + link['href']) #TODO: base href
361
if link.has_key('title'): self.out(" ("+link['title']+")")
366
if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done.
372
self.lastWasNL = data and data[-1] == '\n'
375
def handle_data(self, data):
378
def unknown_decl(self, data):
    """Deliberately a no-op: the declaration text in `data` is discarded."""
    return None
380
def html2text_file(html, out=sys.stdout.write):
387
return optwrap(html2text_file(html, None))
389
if __name__ == "__main__":
392
if arg.startswith('http://'):
393
data = urllib.urlopen(arg).read()
395
data = open(arg, 'r').read()
397
data = sys.stdin.read()
1
"""html2text: Turn HTML into equivalent Markdown-structured text."""
3
__author__ = "Aaron Swartz (me@aaronsw.com)"
4
__copyright__ = "(C) 2004 Aaron Swartz. GNU GPL 2."
5
__contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes"]
8
# Support decoded entities with unifiable.
9
# Relative URL resolution
11
if not hasattr(__builtins__, 'True'): True, False = 1, 0
12
import re, sys, urllib, htmlentitydefs, codecs, StringIO, types
14
sgmllib.charref = re.compile('&#([xX]?[0-9a-fA-F]+)[^0-9a-fA-F]')
16
try: from textwrap import wrap
19
# Use Unicode characters instead of their ASCII pseudo-replacements
22
# Put the links after each paragraph instead of at the end.
23
LINKS_EACH_PARAGRAPH = 0
25
# Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.)
28
### Entity Nonsense ###
31
if k == 'apos': return ord("'")
32
if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3
33
return htmlentitydefs.name2codepoint[k]
35
k = htmlentitydefs.entitydefs[k]
36
if k.startswith("&#") and k.endswith(";"): return int(k[2:-1]) # not in latin-1
37
return ord(codecs.latin_1_decode(k)[0])
39
unifiable = {'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"',
40
'copy':'(C)', 'mdash':'--', 'nbsp':' ', 'rarr':'->', 'larr':'<-', 'middot':'*',
41
'ndash':'-', 'oelig':'oe', 'aelig':'ae',
42
'agrave':'a', 'aacute':'a', 'acirc':'a', 'atilde':'a', 'auml':'a', 'aring':'a',
43
'egrave':'e', 'eacute':'e', 'ecirc':'e', 'euml':'e',
44
'igrave':'i', 'iacute':'i', 'icirc':'i', 'iuml':'i',
45
'ograve':'o', 'oacute':'o', 'ocirc':'o', 'otilde':'o', 'ouml':'o',
46
'ugrave':'u', 'uacute':'u', 'ucirc':'u', 'uuml':'u'}
50
for k in unifiable.keys():
51
unifiable_n[name2cp(k)] = unifiable[k]
54
if name[0] in ['x','X']:
59
if not UNICODE_SNOB and c in unifiable_n.keys():
65
if not UNICODE_SNOB and c in unifiable.keys():
69
except KeyError: return "&" + c
70
else: return unichr(name2cp(c))
72
def replaceEntities(s):
76
else: return entityref(s)
78
r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")
80
return r_unescape.sub(replaceEntities, s)
83
# Fix bug in sgmllib.py
84
if not attrs: return attrs
87
newattrs.append((attr[0], unescape(attr[1])))
90
### End Entity Nonsense ###
93
"""Return true if the line does only consist of whitespace characters."""
95
if c is not ' ' and c is not ' ':
100
"""Wrap all paragraphs in the provided text."""
104
assert wrap # Requires Python 2.3.
107
for para in text.split("\n"):
109
if para[0] is not ' ' and para[0] is not '-' and para[0] is not '*':
110
for line in wrap(para, BODY_WIDTH):
111
result += line + "\n"
115
if not onlywhite(para):
116
result += para + "\n"
125
if tag[0] == 'h' and len(tag) == 2:
128
if n in range(1, 10): return n
129
except ValueError: return 0
131
class _html2text(sgmllib.SGMLParser):
132
def __init__(self, out=sys.stdout.write):
133
sgmllib.SGMLParser.__init__(self)
135
if out is None: self.out = self.outtextf
152
def outtextf(self, s):
153
if type(s) is type(''): s = codecs.utf_8_decode(s, "replace")[0]
157
sgmllib.SGMLParser.close(self)
164
def handle_charref(self, c):
167
def handle_entityref(self, c):
170
def unknown_starttag(self, tag, attrs):
    """Pass an opening tag and its attributes on to handle_tag.

    The third argument (1) marks the event as the start of the tag.
    """
    is_start = 1
    self.handle_tag(tag, attrs, is_start)
173
def unknown_endtag(self, tag):
    """Pass a closing tag on to handle_tag.

    Closing tags carry no attributes, so None is supplied, and the
    third argument (0) marks the event as the end of the tag.
    """
    is_start = 0
    self.handle_tag(tag, None, is_start)
176
def previousIndex(self, attrs):
177
""" returns the index of certain set of attributes (of a link) in the
180
If the set of attributes is not found, returns None
182
if not attrs.has_key('href'): return None
189
if a.has_key('href') and a['href'] == attrs['href']:
190
if a.has_key('title') or attrs.has_key('title'):
191
if (a.has_key('title') and attrs.has_key('title') and
192
a['title'] == attrs['title']):
199
def handle_tag(self, tag, attrs, start):
200
attrs = fixattrs(attrs)
204
if start: self.o(hn(tag)*"#" + ' ')
206
if tag in ['p', 'div']: self.p()
208
if tag == "br" and start: self.o(" \n")
210
if tag == "hr" and start:
215
if tag in ["head", "style", 'script']:
216
if start: self.quiet += 1
217
else: self.quiet -= 1
219
if tag == "blockquote":
221
self.p(); self.o('> ', 0, 1); self.start = 1
227
if tag in ['em', 'i', 'u']: self.o("_")
228
if tag in ['strong', 'b']: self.o("**")
229
if tag == "code" and not self.pre: self.o('`') #TODO: `` `this` ``
234
for (x, y) in attrs: attrsD[x] = y
236
if attrs.has_key('href'):
237
self.astack.append(attrs)
240
self.astack.append(None)
243
a = self.astack.pop()
245
i = self.previousIndex(a)
250
a['count'] = self.acount
251
a['outcount'] = self.outcount
253
self.o("][" + `a['count']` + "]")
255
if tag == "img" and start:
257
for (x, y) in attrs: attrsD[x] = y
259
if attrs.has_key('src'):
260
attrs['href'] = attrs['src']
261
alt = attrs.get('alt', '')
262
alt = re.sub('\n', ' ', alt)
263
i = self.previousIndex(attrs)
268
attrs['count'] = self.acount
269
attrs['outcount'] = self.outcount
273
self.o("]["+`attrs['count']`+"]")
275
if tag == 'dl' and start: self.p()
276
if tag == 'dt' and not start: self.pbr()
277
if tag == 'dd' and start: self.o(' ')
278
if tag == 'dd' and not start: self.pbr()
280
if tag in ["ol", "ul"]:
282
self.list.append({'name':tag, 'num':0})
284
if self.list: self.list.pop()
291
if self.list: li = self.list[-1]
292
else: li = {'name':'ul', 'num':0}
293
self.o(" "*len(self.list)) #TODO: line up <ol><li>s > 9 correctly.
294
if li['name'] == "ul": self.o("* ")
295
elif li['name'] == "ol":
297
self.o(`li['num']`+". ")
302
if tag in ['tr']: self.pbr()
313
if self.p_p == 0: self.p_p = 1
315
def p(self):
    """Request a paragraph break by setting the pending-break counter to 2."""
    self.p_p = 2
317
def o(self, data, puredata=0, force=0):
319
if puredata and not self.pre:
320
data = re.sub('\s+', ' ', data)
321
if data and data[0] == ' ':
324
if not data and not force: return
327
#self.out(" :") #TODO: not output when already one there
330
bq = (">" * self.blockquote)
331
if not (force and data and data[0] == ">") and self.blockquote: bq += " "
335
data = data.replace("\n", "\n"+bq)
350
self.out(('\n'+bq)*self.p_p)
354
if not self.lastWasNL: self.out(' ')
357
if self.a and ((self.p_p == 2 and LINKS_EACH_PARAGRAPH) or force == "end"):
358
if force == "end": self.out("\n")
362
if self.outcount > link['outcount']:
363
self.out(" ["+`link['count']`+"]: " + link['href']) #TODO: base href
364
if link.has_key('title'): self.out(" ("+link['title']+")")
369
if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done.
375
self.lastWasNL = data and data[-1] == '\n'
378
def handle_data(self, data):
381
def unknown_decl(self, data):
    """Deliberately a no-op: the declaration text in `data` is discarded."""
    return None
383
def html2text_file(html, out=sys.stdout.write):
390
return optwrap(html2text_file(html, None))
392
if __name__ == "__main__":
395
if arg.startswith('http://'):
396
data = urllib.urlopen(arg).read()
398
data = open(arg, 'r').read()
400
data = sys.stdin.read()