#
# $Id: TidyTools.py 1862 2004-06-18 07:31:02Z Fredrik $
#
# tools to run the "tidy" command on an HTML or XHTML file, and return
# the contents as an XHTML element tree.
#
# 2002-10-19 fl added to ElementTree library; added getzonebody function
#
# Copyright (c) 1999-2004 by Fredrik Lundh. All rights reserved.
#
# fredrik@pythonware.com
# http://www.pythonware.com
#
# Tools to build element trees from HTML, using the external <b>tidy</b>
# command line utility.
22
import glob, string, os, sys
24
from ElementTree import ElementTree, Element
26
NS_XHTML = "{http://www.w3.org/1999/xhtml}"
##
# Convert an HTML or HTML-like file to XHTML, using the <b>tidy</b>
# command line utility.
# <p>
# The converted document is written to <i>file</i>.out and the
# diagnostics from tidy to <i>file</i>.err.  Both files are removed
# when the output parses as XML; they are left in place on failure
# so that the diagnostics can be inspected.
#
# @param file Filename.
# @param new_inline_tags An optional list of valid but non-standard
#     inline tags.
# @return An element tree, or None if not successful.

def tidy(file, new_inline_tags=None):
    # local import, so the module-level import list is left untouched
    import subprocess

    command = ["tidy", "-qn", "-asxml"]

    if new_inline_tags:
        command.append("--new-inline-tags")
        command.append(",".join(new_inline_tags))

    # FIXME: support more tidy options!

    out_name = file + ".out"
    err_name = file + ".err"

    # convert.  the command is passed as an argument list (no shell),
    # so filenames containing spaces or shell metacharacters are
    # handed to tidy unchanged.  tidy's exit status is ignored; the
    # XML check below decides whether the conversion worked.
    try:
        out = open(out_name, "wb")
        try:
            err = open(err_name, "wb")
            try:
                subprocess.call(command + [file], stdout=out, stderr=err)
            finally:
                err.close()
        finally:
            out.close()
    except (IOError, OSError):
        # output files could not be created, or tidy could not be started
        print("*** %s:%s" % sys.exc_info()[:2])
        print("*** could not run tidy on %s" % file)
        return None

    # check that the result is valid XML
    try:
        tree = ElementTree()
        tree.parse(out_name)
    except Exception:
        print("*** %s:%s" % sys.exc_info()[:2])
        print("*** %s is not valid XML "
              "(check %s.err for info)" % (file, file))
        return None

    # success; the intermediate files are no longer needed
    for name in (out_name, err_name):
        if os.path.isfile(name):
            os.remove(name)

    return tree
##
# Get document body from an HTML or HTML-like file.  This function
# uses the <b>tidy</b> function to convert HTML to XHTML, and cleans
# up the resulting XML tree.
#
# @param file Filename.
# @param **options Keyword options, passed on to <b>tidy</b>.
# @return A <b>body</b> element, or None if not successful.

def getbody(file, **options):
    # get xhtml tree
    tree = tidy(file, **options)
    if tree is None:
        # tidy has already reported the problem
        return None

    # remove namespace uris, so that elements can be addressed by
    # their plain tag names (e.g. "body")
    prefix_len = len(NS_XHTML)
    for node in tree.getiterator():
        if node.tag.startswith(NS_XHTML):
            node.tag = node.tag[prefix_len:]

    # find returns None if the document has no body element
    return tree.getroot().find("body")
##
# Same as <b>getbody</b>, but turns plain text at the start of the
# document into an H1 tag.  This function can be used to parse zone
# documents.
#
# @param file Filename.
# @param **options Keyword options, passed on to <b>getbody</b>.
# @return A <b>body</b> element, or None if not successful.

def getzonebody(file, **options):

    body = getbody(file, **options)
    if body is None:
        return None

    # text that appears before the first child element of the body
    heading = (body.text or "").strip()
    if heading:
        title = Element("h1")
        title.text = heading
        body.insert(0, title)
        # the text now lives in the heading; drop the original so it
        # does not appear twice in the document
        body.text = None

    return body
if __name__ == "__main__":

    # script usage: run tidy on every file matching the patterns given
    # on the command line, and print the result for each file
    for arg in sys.argv[1:]:
        for filename in glob.glob(arg):
            # the name is written before tidy runs, so any diagnostics
            # printed by tidy follow the file they belong to
            sys.stdout.write("%s ... " % filename)
            sys.stdout.write("%s\n" % (tidy(filename),))