~ubuntu-branches/ubuntu/feisty/rss2email/feisty

« back to all changes in this revision

Viewing changes to html2text.py

  • Committer: Bazaar Package Importer
  • Author(s): Joey Hess
  • Date: 2006-12-13 14:34:42 UTC
  • mfrom: (2.1.7 feisty)
  • Revision ID: james.westby@ubuntu.com-20061213143442-bdyafysycipud6q6
Tags: 1:2.60-3
Check exit status of sendmail, and die if it fails. Closes: #402725 

Show diffs side-by-side

added added

removed removed

Lines of Context:
1
 
"""html2text: Turn HTML into equivalent Markdown-structured text."""
2
 
__version__ = "2.2"
3
 
__author__ = "Aaron Swartz (me@aaronsw.com)"
4
 
__copyright__ = "(C) 2004 Aaron Swartz. GNU GPL 2."
5
 
__contributors__ = ["Martin 'Joey' Schulze"]
6
 
 
7
 
# TODO:
8
 
#   Support decoded entities with unifiable.
9
 
#       Word wrap. 
10
 
#       Fix :s using buffering
11
 
#       Relative URL resolution
12
 
 
13
 
import re, sys, urllib, htmlentitydefs, codecs, StringIO, types
14
 
import sgmllib
15
 
sgmllib.charref = re.compile('&#([xX]?[0-9a-fA-F]+)[^0-9a-fA-F]')
16
 
 
17
 
try: from textwrap import wrap
18
 
except: pass
19
 
 
20
 
# Use Unicode characters instead of their ASCII pseudo-replacements
21
 
UNICODE_SNOB = 0
22
 
 
23
 
# Put the links after each paragraph instead of at the end.
24
 
LINKS_EACH_PARAGRAPH = 0
25
 
 
26
 
# Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.)
27
 
BODY_WIDTH = 0
28
 
 
29
 
### Entity Nonsense ###
30
 
 
31
 
def name2cp(k):
        """Return the integer code point for the HTML entity name *k*.

        'apos' is answered directly; every other name is looked up in
        htmlentitydefs.  An unknown name raises KeyError from that lookup.
        """
        if k == 'apos':
                return ord("'")
        if not hasattr(htmlentitydefs, "name2codepoint"):
                # No name2codepoint table (it requires Python 2.3): derive the
                # code point from the older entitydefs mapping instead.
                definition = htmlentitydefs.entitydefs[k]
                if definition.startswith("&#") and definition.endswith(";"):
                        # not in latin-1: stored as a numeric character reference
                        return int(definition[2:-1])
                return ord(codecs.latin_1_decode(definition)[0])
        return htmlentitydefs.name2codepoint[k]
39
 
 
40
 
# Entity names that have a plain-ASCII stand-in.  charref() and entityref()
# use these replacements unless UNICODE_SNOB is set.
unifiable = {'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"', 
'copy':'(C)', 'mdash':'--', 'nbsp':' ', 'rarr':'->', 'larr':'<-', 'middot':'*',
'ndash':'-', 'oelig':'oe', 'aelig':'ae',
'agrave':'a', 'aacute':'a', 'acirc':'a', 'atilde':'a', 'auml':'a', 'aring':'a', 
'egrave':'e', 'eacute':'e', 'ecirc':'e', 'euml':'e', 
'igrave':'i', 'iacute':'i', 'icirc':'i', 'iuml':'i',
'ograve':'o', 'oacute':'o', 'ocirc':'o', 'otilde':'o', 'ouml':'o', 
'ugrave':'u', 'uacute':'u', 'ucirc':'u', 'uuml':'u'}

# The same replacements keyed by integer code point, so that numeric
# character references can be looked up as well.
unifiable_n = {}

for k in unifiable.keys():
        unifiable_n[name2cp(k)] = unifiable[k]
53
 
 
54
 
def charref(name):
        """Turn the body of a numeric character reference into text.

        *name* is the reference without its leading '#': decimal digits, or
        'x'/'X' followed by hexadecimal digits.  Unless UNICODE_SNOB is set,
        code points listed in unifiable_n yield their ASCII replacement;
        everything else yields the character itself.
        """
        if name[0] in ('x', 'X'):
                codepoint = int(name[1:], 16)
        else:
                codepoint = int(name)

        if UNICODE_SNOB or codepoint not in unifiable_n.keys():
                return unichr(codepoint)
        return unifiable_n[codepoint]
64
 
 
65
 
def entityref(c):
        """Turn the entity name *c* (without '&' and ';') into text.

        Unless UNICODE_SNOB is set, names listed in unifiable yield their
        ASCII replacement.  Names that name2cp() does not know are returned
        as '&' + c.
        """
        if not UNICODE_SNOB and c in unifiable.keys():
                return unifiable[c]
        try:
                codepoint = name2cp(c)
        except KeyError:
                # Unknown entity: hand the reference back without the ';'.
                return "&" + c
        return unichr(codepoint)
72
 
 
73
 
def replaceEntities(s):
        """Substitution callback for r_unescape.

        *s* is a match object whose group 1 is the reference text between
        '&' and ';'.  Numeric references go to charref(), named ones to
        entityref().
        """
        body = s.group(1)
        if body[0] != "#":
                return entityref(body)
        return charref(body[1:])
78
 
 
79
 
# Matches one '&...;' reference; group 1 is the text between '&' and ';'.
r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")
def unescape(s):
        """Return *s* with every reference matched by r_unescape replaced."""
        replaced = r_unescape.sub(replaceEntities, s)
        return replaced
82
 
        
83
 
def fixattrs(attrs):
        """Return the (name, value) pairs in *attrs* with values unescaped.

        A false *attrs* (None or an empty sequence) is returned unchanged.
        """
        # Fix bug in sgmllib.py
        if not attrs:
                return attrs
        return [(pair[0], unescape(pair[1])) for pair in attrs]
90
 
 
91
 
### End Entity Nonsense ###
92
 
 
93
 
def onlywhite(line):
        """Return True if *line* consists only of whitespace characters.

        Spaces and tabs count as whitespace.  An empty line is reported as
        whitespace-only; the caller in this file, optwrap(), only passes
        non-empty lines.

        The previous version compared characters with ``is`` (object
        identity) and tested each single character against a two-character
        literal, which can never match -- presumably a tab that was expanded
        to spaces.  It also returned the input string instead of a boolean.
        """
        for c in line:
                if c != ' ' and c != '\t':
                        return False
        return True
99
 
 
100
 
def optwrap(text):
        """Wrap all paragraphs in the provided text.

        Returns *text* unchanged when BODY_WIDTH is 0.  Otherwise each line
        of *text* is treated as a paragraph: lines starting with ' ', '-' or
        '*' are copied through unwrapped (and dropped when they are only
        whitespace), all other lines are wrapped to BODY_WIDTH columns and
        followed by a blank line.  Runs of empty lines are limited so that
        at most two newlines follow each other.
        """
        if not BODY_WIDTH:
                return text

        assert wrap # Requires Python 2.3.
        pieces = []     # output fragments, joined once at the end
        newlines = 0    # newlines emitted since the last text
        for para in text.split("\n"):
                if len(para) > 0:
                        # Compare by value; the old identity test ("is not")
                        # only worked by accident of string interning.
                        if para[0] not in (' ', '-', '*'):
                                for line in wrap(para, BODY_WIDTH):
                                        pieces.append(line + "\n")
                                pieces.append("\n")
                                newlines = 2
                        else:
                                if not onlywhite(para):
                                        pieces.append(para + "\n")
                                        newlines = 1
                else:
                        if newlines < 2:
                                pieces.append("\n")
                                newlines += 1
        return ''.join(pieces)
124
 
 
125
 
def hn(tag):
        """Return the heading level of *tag*, or 0 if it is not a heading.

        A heading tag is 'h' followed by one digit from 1 to 9.  Every other
        tag -- including the empty string, 'hr' and 'h0' -- yields 0.  The
        previous version returned None for some of these and raised
        IndexError on an empty tag.
        """
        if len(tag) != 2 or tag[0] != 'h':
                return 0
        try:
                n = int(tag[1])
        except ValueError:
                return 0
        if 1 <= n <= 9:
                return n
        return 0
131
 
 
132
 
class _html2text(sgmllib.SGMLParser):
        """sgmllib-based parser that turns HTML into Markdown-style text.

        Tags and character data are translated piece by piece and handed to
        the ``out`` callable.  Links and images are written in reference
        style ("[text][n]"); o() prints the collected references at the end,
        or after paragraphs when LINKS_EACH_PARAGRAPH is set.
        """

        def __init__(self, out=sys.stdout.write):
                """Set up the parser state.

                out -- callable receiving each piece of output text.  Pass
                       None to collect the text in self.outtext instead.
                """
                sgmllib.SGMLParser.__init__(self)

                if out is None: self.out = self.outtextf
                else: self.out = out
                self.outtext = u''      # text collected by outtextf()
                self.quiet = 0          # non-zero suppresses all output (head/style/script)
                self.p_p = 0            # line breaks pending before the next output
                self.outcount = 0       # number of pieces written by o()
                self.start = 1          # set at the beginning and after list markers / '> '
                self.space = 0          # a collapsed space is pending
                self.a = []             # link/image attribute dicts not yet listed as references
                self.astack = []        # attribute dict (or None) for each open <a>
                self.acount = 0         # last reference number handed out
                self.list = []          # stack of open lists: {'name': tag, 'num': counter}
                self.blockquote = 0     # blockquote nesting depth
                self.pre = 0            # set while inside <pre>
                self.startpre = 0       # set by <pre>, cleared by the next output
                self.lastWasNL = 0      # true if the last piece written ended in '\n'

        def outtextf(self, s):
                """Default sink: append *s* to self.outtext (byte strings are decoded as UTF-8)."""
                if type(s) is type(''): s = codecs.utf_8_decode(s)[0]
                self.outtext += s

        def close(self):
                """Finish parsing, write the trailing output and return self.outtext."""
                sgmllib.SGMLParser.close(self)

                self.pbr()
                self.o('', 0, 'end')

                return self.outtext

        def handle_charref(self, c):
                """Output the text for a numeric character reference."""
                self.o(charref(c))

        def handle_entityref(self, c):
                """Output the text for a named entity reference."""
                self.o(entityref(c))

        def unknown_starttag(self, tag, attrs):
                """Route every opening tag to handle_tag() with start=1."""
                self.handle_tag(tag, attrs, 1)

        def unknown_endtag(self, tag):
                """Route every closing tag to handle_tag() with start=0."""
                self.handle_tag(tag, None, 0)

        def previousIndex(self, attrs):
                """ returns the index of certain set of attributes (of a link) in the
                        self.a list

                        If the set of attributes is not found, returns None
                """
                if 'href' not in attrs: return None

                i = -1
                for a in self.a:
                        i += 1
                        match = 0

                        if 'href' in a and a['href'] == attrs['href']:
                                if 'title' in a or 'title' in attrs:
                                        # Titles must be present on both and equal.
                                        if ('title' in a and 'title' in attrs and
                                            a['title'] == attrs['title']):
                                                match = True
                                else:
                                        match = True

                        if match: return i

                return None

        def handle_tag(self, tag, attrs, start):
                """Translate one tag into Markdown output and state changes.

                tag   -- tag name as reported by sgmllib
                attrs -- list of (name, value) pairs, or None for closing tags
                start -- 1 for an opening tag, 0 for a closing tag
                """
                attrs = fixattrs(attrs)

                if hn(tag):
                        self.p()
                        if start: self.o(hn(tag)*"#" + ' ')

                if tag in ['p', 'div']: self.p()

                if tag == "br" and start: self.o("  \n")

                if tag == "hr" and start:
                        self.p()
                        self.o("* * *")
                        self.p()

                if tag in ["head", "style", 'script']:
                        if start: self.quiet += 1
                        else: self.quiet -= 1

                if tag == "blockquote":
                        if start:
                                self.p(); self.o('> ', 0, 1); self.start = 1
                                self.blockquote += 1
                        else:
                                self.blockquote -= 1
                                self.p()

                if tag in ['em', 'i', 'u']: self.o("_")
                if tag in ['strong', 'b']: self.o("**")
                if tag == "code" and not self.pre: self.o('`') #TODO: `` `this` ``

                if tag == "a":
                        if start:
                                attrsD = {}
                                for (x, y) in attrs: attrsD[x] = y
                                attrs = attrsD
                                if 'href' in attrs:
                                        self.astack.append(attrs)
                                        self.o("[")
                                else:
                                        # Anchor without href: remember it so the
                                        # closing tag pops the right entry.
                                        self.astack.append(None)
                        else:
                                if self.astack:
                                        a = self.astack.pop()
                                        if a:
                                                i = self.previousIndex(a)
                                                if i is not None:
                                                        # Reuse the number of an identical link.
                                                        a = self.a[i]
                                                else:
                                                        self.acount += 1
                                                        a['count'] = self.acount
                                                        a['outcount'] = self.outcount
                                                        self.a.append(a)
                                                self.o("][" + repr(a['count']) + "]")

                if tag == "img" and start:
                        attrsD = {}
                        for (x, y) in attrs: attrsD[x] = y
                        attrs = attrsD
                        if 'src' in attrs:
                                attrs['href'] = attrs['src']
                                alt = attrs.get('alt', '')
                                alt = re.sub('\n', ' ', alt)
                                i = self.previousIndex(attrs)
                                if i is not None:
                                        attrs = self.a[i]
                                else:
                                        self.acount += 1
                                        attrs['count'] = self.acount
                                        attrs['outcount'] = self.outcount
                                        self.a.append(attrs)
                                self.o("![")
                                self.o(alt)
                                self.o("]["+repr(attrs['count'])+"]")

                if tag in ["ol", "ul"]:
                        if start:
                                self.list.append({'name':tag, 'num':0})
                        elif self.list:
                                # Guard against a closing tag with no open list;
                                # an unconditional pop() raised IndexError here.
                                self.list.pop()

                        self.p()

                if tag == 'li':
                        if start:
                                self.pbr()
                                if self.list: li = self.list[-1]
                                else: li = {'name':'ul', 'num':0}
                                self.o("  "*len(self.list)) #TODO: line up <ol><li>s > 9 correctly.
                                if li['name'] == "ul": self.o("* ")
                                elif li['name'] == "ol":
                                        li['num'] += 1
                                        self.o(repr(li['num'])+". ")
                                self.start = 1
                        else:
                                self.pbr()

                if tag in ['tr']: self.pbr()

                if tag == "pre":
                        if start:
                                self.startpre = 1
                                self.pre = 1
                        else:
                                self.pre = 0
                        self.p()

        def pbr(self):
                """Request a single line break unless a break is already pending."""
                if self.p_p == 0: self.p_p = 1

        def p(self):
                """Request a paragraph break (two line breaks)."""
                self.p_p = 2

        def o(self, data, puredata=0, force=0):
                """Write *data*, preceded by any pending breaks, prefixes and spaces.

                data     -- text to write
                puredata -- true for document text: whitespace runs are
                            collapsed (outside <pre>) and a leading space is
                            deferred via self.space
                force    -- true to write even when data is empty; the value
                            'end' additionally flushes the reference list
                """
                if not self.quiet:
                        if puredata and not self.pre:
                                data = re.sub(r'\s+', ' ', data)
                                if data and data[0] == ' ':
                                        self.space = 1
                                        data = data[1:]
                        if not data and not force: return

                        if self.startpre:
                                self.out(" :") #TODO: not output when already one there
                                self.startpre = 0

                        # Prefix repeated after every line break inside quotes / pre.
                        bq = (">" * self.blockquote)
                        if not (force and data and data[0] == ">") and self.blockquote: bq += " "

                        if self.pre:
                                bq += "    "
                                data = data.replace("\n", "\n"+bq)

                        if self.start:
                                # Nothing precedes this piece: drop pending breaks and space.
                                self.space = 0
                                self.p_p = 0
                                self.start = 0

                        if force == 'end':
                                # It's the end.
                                self.p_p = 0
                                self.out("\n")
                                self.space = 0

                        if self.p_p:
                                self.out(('\n'+bq)*self.p_p)
                                self.space = 0

                        if self.space:
                                if not self.lastWasNL: self.out(' ')
                                self.space = 0

                        if self.a and ((self.p_p == 2 and LINKS_EACH_PARAGRAPH) or force == "end"):
                                if force == "end": self.out("\n")

                                newa = []
                                for link in self.a:
                                        if self.outcount > link['outcount']:
                                                self.out("    ["+repr(link['count'])+"]: " + link['href']) #TODO: base href
                                                if 'title' in link: self.out(" ("+link['title']+")")
                                                self.out("\n")
                                        else:
                                                newa.append(link)

                                if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done.

                                self.a = newa

                        self.p_p = 0
                        self.out(data)
                        self.lastWasNL = data and data[-1] == '\n'
                        self.outcount += 1

        def handle_data(self, data):
                """Output character data from the document as pure text."""
                self.o(data, 1)

        def unknown_decl(self, data):
                """Ignore declarations the parser does not recognise."""
                pass
379
 
                
380
 
def html2text_file(html, out=sys.stdout.write):
        """Convert *html* and send the result to *out*.

        *out* is passed to _html2text; with None the text is collected and
        returned, otherwise the parser's (empty) collected text is returned.
        """
        parser = _html2text(out)
        for chunk in (html, ""):
                parser.feed(chunk)
        return parser.close()
385
 
 
386
 
def html2text(html):
        """Return *html* converted to Markdown-style text, wrapped by optwrap()."""
        markdown = html2text_file(html, None)
        return optwrap(markdown)
388
 
 
389
 
if __name__ == "__main__":
        # Command-line use: convert the URL or file named by the first
        # argument, or standard input when no argument is given, and write
        # the result to standard output.
        if sys.argv[1:]:
                arg = sys.argv[1]
                if arg.startswith('http://') or arg.startswith('https://'):
                        source = urllib.urlopen(arg)
                else:
                        source = open(arg, 'r')
                # try/finally rather than "with": the file targets old Pythons.
                try:
                        data = source.read()
                finally:
                        source.close()
        else:
                data = sys.stdin.read()
        html2text_file(data)
 
1
"""html2text: Turn HTML into equivalent Markdown-structured text."""
 
2
__version__ = "2.24"
 
3
__author__ = "Aaron Swartz (me@aaronsw.com)"
 
4
__copyright__ = "(C) 2004 Aaron Swartz. GNU GPL 2."
 
5
__contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes"]
 
6
 
 
7
# TODO:
 
8
#   Support decoded entities with unifiable.
 
9
#       Relative URL resolution
 
10
 
 
11
if not hasattr(__builtins__, 'True'): True, False = 1, 0
 
12
import re, sys, urllib, htmlentitydefs, codecs, StringIO, types
 
13
import sgmllib
 
14
sgmllib.charref = re.compile('&#([xX]?[0-9a-fA-F]+)[^0-9a-fA-F]')
 
15
 
 
16
try: from textwrap import wrap
 
17
except: pass
 
18
 
 
19
# Use Unicode characters instead of their ASCII pseudo-replacements
 
20
UNICODE_SNOB = 0
 
21
 
 
22
# Put the links after each paragraph instead of at the end.
 
23
LINKS_EACH_PARAGRAPH = 0
 
24
 
 
25
# Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.)
 
26
BODY_WIDTH = 0
 
27
 
 
28
### Entity Nonsense ###
 
29
 
 
30
def name2cp(k):
        """Return the integer code point for the HTML entity name *k*.

        'apos' is answered directly; every other name is looked up in
        htmlentitydefs.  An unknown name raises KeyError from that lookup.
        """
        if k == 'apos':
                return ord("'")
        if not hasattr(htmlentitydefs, "name2codepoint"):
                # No name2codepoint table (it requires Python 2.3): derive the
                # code point from the older entitydefs mapping instead.
                definition = htmlentitydefs.entitydefs[k]
                if definition.startswith("&#") and definition.endswith(";"):
                        # not in latin-1: stored as a numeric character reference
                        return int(definition[2:-1])
                return ord(codecs.latin_1_decode(definition)[0])
        return htmlentitydefs.name2codepoint[k]
 
38
 
 
39
# Entity names that have a plain-ASCII stand-in.  charref() and entityref()
# use these replacements unless UNICODE_SNOB is set.
unifiable = {'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"', 
'copy':'(C)', 'mdash':'--', 'nbsp':' ', 'rarr':'->', 'larr':'<-', 'middot':'*',
'ndash':'-', 'oelig':'oe', 'aelig':'ae',
'agrave':'a', 'aacute':'a', 'acirc':'a', 'atilde':'a', 'auml':'a', 'aring':'a', 
'egrave':'e', 'eacute':'e', 'ecirc':'e', 'euml':'e', 
'igrave':'i', 'iacute':'i', 'icirc':'i', 'iuml':'i',
'ograve':'o', 'oacute':'o', 'ocirc':'o', 'otilde':'o', 'ouml':'o', 
'ugrave':'u', 'uacute':'u', 'ucirc':'u', 'uuml':'u'}

# The same replacements keyed by integer code point, so that numeric
# character references can be looked up as well.
unifiable_n = {}

for k in unifiable.keys():
        unifiable_n[name2cp(k)] = unifiable[k]
 
52
 
 
53
def charref(name):
        """Turn the body of a numeric character reference into text.

        *name* is the reference without its leading '#': decimal digits, or
        'x'/'X' followed by hexadecimal digits.  Unless UNICODE_SNOB is set,
        code points listed in unifiable_n yield their ASCII replacement;
        everything else yields the character itself.
        """
        if name[0] in ('x', 'X'):
                codepoint = int(name[1:], 16)
        else:
                codepoint = int(name)

        if UNICODE_SNOB or codepoint not in unifiable_n.keys():
                return unichr(codepoint)
        return unifiable_n[codepoint]
 
63
 
 
64
def entityref(c):
        """Turn the entity name *c* (without '&' and ';') into text.

        Unless UNICODE_SNOB is set, names listed in unifiable yield their
        ASCII replacement.  Names that name2cp() does not know are returned
        as '&' + c.
        """
        if not UNICODE_SNOB and c in unifiable.keys():
                return unifiable[c]
        try:
                codepoint = name2cp(c)
        except KeyError:
                # Unknown entity: hand the reference back without the ';'.
                return "&" + c
        return unichr(codepoint)
 
71
 
 
72
def replaceEntities(s):
        """Substitution callback for r_unescape.

        *s* is a match object whose group 1 is the reference text between
        '&' and ';'.  Numeric references go to charref(), named ones to
        entityref().
        """
        body = s.group(1)
        if body[0] != "#":
                return entityref(body)
        return charref(body[1:])
 
77
 
 
78
# Matches one '&...;' reference; group 1 is the text between '&' and ';'.
r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")
def unescape(s):
        """Return *s* with every reference matched by r_unescape replaced."""
        replaced = r_unescape.sub(replaceEntities, s)
        return replaced
 
81
        
 
82
def fixattrs(attrs):
        """Return the (name, value) pairs in *attrs* with values unescaped.

        A false *attrs* (None or an empty sequence) is returned unchanged.
        """
        # Fix bug in sgmllib.py
        if not attrs:
                return attrs
        return [(pair[0], unescape(pair[1])) for pair in attrs]
 
89
 
 
90
### End Entity Nonsense ###
 
91
 
 
92
def onlywhite(line):
        """Return True if *line* consists only of whitespace characters.

        Spaces and tabs count as whitespace.  An empty line is reported as
        whitespace-only; the caller in this file, optwrap(), only passes
        non-empty lines.

        The previous version compared characters with ``is`` (object
        identity) and tested each single character against a two-character
        literal, which can never match -- presumably a tab that was expanded
        to spaces.  It also returned the input string instead of a boolean.
        """
        for c in line:
                if c != ' ' and c != '\t':
                        return False
        return True
 
98
 
 
99
def optwrap(text):
        """Wrap all paragraphs in the provided text.

        Returns *text* unchanged when BODY_WIDTH is 0.  Otherwise each line
        of *text* is treated as a paragraph: lines starting with ' ', '-' or
        '*' are copied through unwrapped (and dropped when they are only
        whitespace), all other lines are wrapped to BODY_WIDTH columns and
        followed by a blank line.  Runs of empty lines are limited so that
        at most two newlines follow each other.
        """
        if not BODY_WIDTH:
                return text

        assert wrap # Requires Python 2.3.
        pieces = []     # output fragments, joined once at the end
        newlines = 0    # newlines emitted since the last text
        for para in text.split("\n"):
                if len(para) > 0:
                        # Compare by value; the old identity test ("is not")
                        # only worked by accident of string interning.
                        if para[0] not in (' ', '-', '*'):
                                for line in wrap(para, BODY_WIDTH):
                                        pieces.append(line + "\n")
                                pieces.append("\n")
                                newlines = 2
                        else:
                                if not onlywhite(para):
                                        pieces.append(para + "\n")
                                        newlines = 1
                else:
                        if newlines < 2:
                                pieces.append("\n")
                                newlines += 1
        return ''.join(pieces)
 
123
 
 
124
def hn(tag):
        """Return the heading level of *tag*, or 0 if it is not a heading.

        A heading tag is 'h' followed by one digit from 1 to 9.  Every other
        tag -- including the empty string, 'hr' and 'h0' -- yields 0.  The
        previous version returned None for some of these and raised
        IndexError on an empty tag.
        """
        if len(tag) != 2 or tag[0] != 'h':
                return 0
        try:
                n = int(tag[1])
        except ValueError:
                return 0
        if 1 <= n <= 9:
                return n
        return 0
 
130
 
 
131
class _html2text(sgmllib.SGMLParser):
 
132
        def __init__(self, out=sys.stdout.write):
 
133
                sgmllib.SGMLParser.__init__(self)
 
134
                
 
135
                if out is None: self.out = self.outtextf
 
136
                else: self.out = out
 
137
                self.outtext = u''
 
138
                self.quiet = 0
 
139
                self.p_p = 0
 
140
                self.outcount = 0
 
141
                self.start = 1
 
142
                self.space = 0
 
143
                self.a = []
 
144
                self.astack = []
 
145
                self.acount = 0
 
146
                self.list = []
 
147
                self.blockquote = 0
 
148
                self.pre = 0
 
149
                self.startpre = 0
 
150
                self.lastWasNL = 0
 
151
        
 
152
        def outtextf(self, s): 
 
153
                if type(s) is type(''): s = codecs.utf_8_decode(s, "replace")[0]
 
154
                self.outtext += s
 
155
        
 
156
        def close(self):
 
157
                sgmllib.SGMLParser.close(self)
 
158
                
 
159
                self.pbr()
 
160
                self.o('', 0, 'end')
 
161
                
 
162
                return self.outtext
 
163
                
 
164
        def handle_charref(self, c):
 
165
                self.o(charref(c))
 
166
 
 
167
        def handle_entityref(self, c):
 
168
                self.o(entityref(c))
 
169
                        
 
170
        def unknown_starttag(self, tag, attrs):
 
171
                self.handle_tag(tag, attrs, 1)
 
172
        
 
173
        def unknown_endtag(self, tag):
 
174
                self.handle_tag(tag, None, 0)
 
175
                
 
176
        def previousIndex(self, attrs):
 
177
                """ returns the index of certain set of attributes (of a link) in the
 
178
                        self.a list
 
179
 
 
180
                        If the set of attributes is not found, returns None
 
181
                """
 
182
                if not attrs.has_key('href'): return None
 
183
                
 
184
                i = -1
 
185
                for a in self.a:
 
186
                        i += 1
 
187
                        match = 0
 
188
                        
 
189
                        if a.has_key('href') and a['href'] == attrs['href']:
 
190
                                if a.has_key('title') or attrs.has_key('title'):
 
191
                                                if (a.has_key('title') and attrs.has_key('title') and
 
192
                                                    a['title'] == attrs['title']):
 
193
                                                        match = True
 
194
                                else:
 
195
                                        match = True
 
196
 
 
197
                        if match: return i
 
198
 
 
199
        def handle_tag(self, tag, attrs, start):
 
200
                attrs = fixattrs(attrs)
 
201
        
 
202
                if hn(tag):
 
203
                        self.p()
 
204
                        if start: self.o(hn(tag)*"#" + ' ')
 
205
 
 
206
                if tag in ['p', 'div']: self.p()
 
207
                
 
208
                if tag == "br" and start: self.o("  \n")
 
209
 
 
210
                if tag == "hr" and start:
 
211
                        self.p()
 
212
                        self.o("* * *")
 
213
                        self.p()
 
214
 
 
215
                if tag in ["head", "style", 'script']: 
 
216
                        if start: self.quiet += 1
 
217
                        else: self.quiet -= 1
 
218
                
 
219
                if tag == "blockquote":
 
220
                        if start: 
 
221
                                self.p(); self.o('> ', 0, 1); self.start = 1
 
222
                                self.blockquote += 1
 
223
                        else:
 
224
                                self.blockquote -= 1
 
225
                                self.p()
 
226
                
 
227
                if tag in ['em', 'i', 'u']: self.o("_")
 
228
                if tag in ['strong', 'b']: self.o("**")
 
229
                if tag == "code" and not self.pre: self.o('`') #TODO: `` `this` ``
 
230
                
 
231
                if tag == "a":
 
232
                        if start:
 
233
                                attrsD = {}
 
234
                                for (x, y) in attrs: attrsD[x] = y
 
235
                                attrs = attrsD
 
236
                                if attrs.has_key('href'): 
 
237
                                        self.astack.append(attrs)
 
238
                                        self.o("[")
 
239
                                else:
 
240
                                        self.astack.append(None)
 
241
                        else:
 
242
                                if self.astack:
 
243
                                        a = self.astack.pop()
 
244
                                        if a:
 
245
                                                i = self.previousIndex(a)
 
246
                                                if i is not None:
 
247
                                                        a = self.a[i]
 
248
                                                else:
 
249
                                                        self.acount += 1
 
250
                                                        a['count'] = self.acount
 
251
                                                        a['outcount'] = self.outcount
 
252
                                                        self.a.append(a)
 
253
                                                self.o("][" + `a['count']` + "]")
 
254
                
 
255
                if tag == "img" and start:
 
256
                        attrsD = {}
 
257
                        for (x, y) in attrs: attrsD[x] = y
 
258
                        attrs = attrsD
 
259
                        if attrs.has_key('src'):
 
260
                                attrs['href'] = attrs['src']
 
261
                                alt = attrs.get('alt', '')
 
262
                                alt = re.sub('\n', ' ', alt)
 
263
                                i = self.previousIndex(attrs)
 
264
                                if i is not None:
 
265
                                        attrs = self.a[i]
 
266
                                else:
 
267
                                        self.acount += 1
 
268
                                        attrs['count'] = self.acount
 
269
                                        attrs['outcount'] = self.outcount
 
270
                                        self.a.append(attrs)
 
271
                                self.o("![")
 
272
                                self.o(alt)
 
273
                                self.o("]["+`attrs['count']`+"]")
 
274
                
 
275
                if tag == 'dl' and start: self.p()
 
276
                if tag == 'dt' and not start: self.pbr()
 
277
                if tag == 'dd' and start: self.o('    ')
 
278
                if tag == 'dd' and not start: self.pbr()
 
279
 
 
280
                if tag in ["ol", "ul"]:
 
281
                        if start:
 
282
                                self.list.append({'name':tag, 'num':0})
 
283
                        elif self.list:
 
284
                                if self.list: self.list.pop()
 
285
                        
 
286
                        self.p()
 
287
                
 
288
                if tag == 'li':
 
289
                        if start:
 
290
                                self.pbr()
 
291
                                if self.list: li = self.list[-1]
 
292
                                else: li = {'name':'ul', 'num':0}
 
293
                                self.o("  "*len(self.list)) #TODO: line up <ol><li>s > 9 correctly.
 
294
                                if li['name'] == "ul": self.o("* ")
 
295
                                elif li['name'] == "ol":
 
296
                                        li['num'] += 1
 
297
                                        self.o(`li['num']`+". ")
 
298
                                self.start = 1
 
299
                        else:
 
300
                                self.pbr()
 
301
                
 
302
                if tag in ['tr']: self.pbr()
 
303
                
 
304
                if tag == "pre":
 
305
                        if start:
 
306
                                self.startpre = 1
 
307
                                self.pre = 1
 
308
                        else:
 
309
                                self.pre = 0
 
310
                        self.p()
 
311
                        
 
312
        def pbr(self):
 
313
                if self.p_p == 0: self.p_p = 1
 
314
 
 
315
        def p(self): self.p_p = 2
 
316
        
 
317
        def o(self, data, puredata=0, force=0):
 
318
                if not self.quiet: 
 
319
                        if puredata and not self.pre:
 
320
                                data = re.sub('\s+', ' ', data)
 
321
                                if data and data[0] == ' ':
 
322
                                        self.space = 1
 
323
                                        data = data[1:]
 
324
                        if not data and not force: return
 
325
                        
 
326
                        if self.startpre:
 
327
                                #self.out(" :") #TODO: not output when already one there
 
328
                                self.startpre = 0
 
329
                        
 
330
                        bq = (">" * self.blockquote)
 
331
                        if not (force and data and data[0] == ">") and self.blockquote: bq += " "
 
332
                        
 
333
                        if self.pre:
 
334
                                bq += "    "
 
335
                                data = data.replace("\n", "\n"+bq)
 
336
                        
 
337
                        if self.start:
 
338
                                self.space = 0
 
339
                                self.p_p = 0
 
340
                                self.start = 0
 
341
 
 
342
                        if force == 'end':
 
343
                                # It's the end.
 
344
                                self.p_p = 0
 
345
                                self.out("\n")
 
346
                                self.space = 0
 
347
 
 
348
 
 
349
                        if self.p_p:
 
350
                                self.out(('\n'+bq)*self.p_p)
 
351
                                self.space = 0
 
352
                                
 
353
                        if self.space:
 
354
                                if not self.lastWasNL: self.out(' ')
 
355
                                self.space = 0
 
356
 
 
357
                        if self.a and ((self.p_p == 2 and LINKS_EACH_PARAGRAPH) or force == "end"):
 
358
                                if force == "end": self.out("\n")
 
359
 
 
360
                                newa = []
 
361
                                for link in self.a:
 
362
                                        if self.outcount > link['outcount']:
 
363
                                                self.out("   ["+`link['count']`+"]: " + link['href']) #TODO: base href
 
364
                                                if link.has_key('title'): self.out(" ("+link['title']+")")
 
365
                                                self.out("\n")
 
366
                                        else:
 
367
                                                newa.append(link)
 
368
 
 
369
                                if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done.
 
370
 
 
371
                                self.a = newa
 
372
 
 
373
                        self.p_p = 0
 
374
                        self.out(data)
 
375
                        self.lastWasNL = data and data[-1] == '\n'
 
376
                        self.outcount += 1
 
377
 
 
378
        def handle_data(self, data):
 
379
                self.o(data, 1)
 
380
        
 
381
        def unknown_decl(self, data): pass
 
382
                
 
383
def html2text_file(html, out=sys.stdout.write):
        """Run `html` through a fresh _html2text parser built around `out`.

        The document is fed first, followed by an empty string, and whatever
        the parser's close() gives back is returned to the caller.
        """
        parser = _html2text(out)
        for chunk in (html, ""):
                parser.feed(chunk)
        result = parser.close()
        return result
 
388
 
 
389
def html2text(html):
        """Convert `html` and return the result after passing it through optwrap.

        html2text_file is called with None as its output callable.
        """
        converted = html2text_file(html, None)
        return optwrap(converted)
 
391
 
 
392
if __name__ == "__main__":
        # Command-line entry point: read HTML from the URL or file named by
        # the first argument, or from standard input when no argument is
        # given, then convert it with html2text_file (which by default writes
        # to standard output).
        if sys.argv[1:]:
                arg = sys.argv[1]
                # Treat both plain and secure web addresses as URLs; anything
                # else is taken to be a local file name.
                if arg.startswith('http://') or arg.startswith('https://'):
                        src = urllib.urlopen(arg)
                else:
                        src = open(arg, 'r')
                # Close the handle explicitly instead of leaving it to the
                # garbage collector.
                try:
                        data = src.read()
                finally:
                        src.close()
        else:
                data = sys.stdin.read()
        html2text_file(data)
 
402