4
This file converts simple HTML text into a DocBook XML variant.
5
The mapping of markups and links is far from perfect. But all we
6
want is the docbook-to-pdf converter and similar technology being
7
present in the world of docbook-to-anything converters. """
9
from datetime import date
15
class htm2dbk_conversion_base:
# Holds the HTML -> DocBook rewrite rules as class-level data.
# Every rule is written as  m()(pattern[, flags]) >> replacement .
# NOTE(review): `m` is not defined in this class - presumably a regex helper
# whose `>>` pairs a pattern with its substitution text; confirm against
# its definition.
# NOTE(review): the first group of rules uses plain (non-raw) string literals.
# In such a literal "\b" is the backspace character (0x08), not the regex
# word-boundary escape, so patterns like "<dt\b..." may never match ordinary
# markup - verify, and consider raw r"..." literals as used in the later group.
17
# --- headings: <h2> opens a <sect1> whose id is the "--filename--" placeholder;
# --- the text following </h2> is captured into a <subtitle>
m()("</[hH]2>(.*)", "m") >> "</title>\n<subtitle>\\1</subtitle>",
18
m()("<[hH]2>") >> "<sect1 id=\"--filename--\"><title>",
19
# --- paragraphs and preformatted text
m()("<[Pp]([> ])","m") >> "<para\\1",
20
m()("</[Pp]>") >> "</para>",
21
m()("<(pre|PRE)>") >> "<screen>",
22
m()("</(pre|PRE)>") >> "</screen>",
23
# --- <h3> becomes a <sect2>; the closing </sect2> is placed after the text
# --- that follows the heading, stopping before the next <sect2>
m()("<[hH]3>") >> "<sect2><title>",
24
m()("</[hH]3>((?:.(?!<sect2>))*.?)", "s") >> "</title>\\1</sect2>",
25
# --- drop any HTML doctype declaration (lower- and upper-case spelling)
m()("<!doctype [^<>]*>","s") >> "",
26
m()("<!DOCTYPE [^<>]*>","s") >> "",
27
# --- put double quotes around unquoted attribute values (width=NN%, name=digits)
m()("(<\w+\b[^<>]*\swidth=)(\d+\%)","s") >> "\\1\"\\2\"",
28
m()("(<\w+\b[^<>]*\s\w+=)(\d+)","s") >> "\\1\"\\2\"",
29
# --- escape bare "&&", "$<" and "&word)" / "&word," sequences as entities
# NOTE(review): "\&" and "\;" are not Python escape sequences, so the backslashes
# stay in the replacement text unless `m` strips them - confirm.
m()("&&") >> "\&amp\;\&amp\;",
30
m()("\$\<") >> "\$\&lt\;",
31
m()("&(\w+[\),])") >> "\&amp\;\\1",
32
# --- inline and list element renames
m()("(</?)span(\s[^<>]*)?>","s") >> "\\1phrase\\2>",
33
m()("(</?)small(\s[^<>]*)?>","s") >> "\\1note\\2>",
34
m()("(</?)(b|em|i)>")>> "\\1emphasis>",
35
m()("(</?)(li)>") >> "\\1listitem>",
36
m()("(</?)(ul)>") >> "\\1itemizedlist>",
37
m()("(</?)(ol)>") >> "\\1orderedlist>",
38
m()("(</?)(dl)>") >> "\\1variablelist>",
39
# --- definition lists: <dt> -> <varlistentry><term>, <dd> -> <listitem>
m()("<dt\b([^<>]*)>","s") >> "<varlistentry\\1><term>",
40
m()("</dt\b([^<>]*)>","s") >> "</term>",
41
m()("<dd\b([^<>]*)>","s") >> "<listitem\\1>",
42
m()("</dd\b([^<>]*)>","s") >> "</listitem></varlistentry>",
43
# --- tables: <table> -> <informaltable> with a fixed two-column <tgroup>
m()("<table\b([^<>]*)>","s")
44
>> "<informaltable\\1><tgroup cols=\"2\"><tbody>",
45
m()("</table\b([^<>]*)>","s") >> "</tbody></tgroup></informaltable>",
46
m()("(</?)tr(\s[^<>]*)?>","s") >> "\\1row\\2>",
47
m()("(</?)td(\s[^<>]*)?>","s") >> "\\1entry\\2>",
48
# --- a table nested directly inside the first cell of another table
# NOTE(review): no `>> replacement` follows this opening pattern before the
# next rule starts - confirm the replacement text is present.
m()("<informaltable\b[^<>]*>\s*<tgroup\b[^<>]*>\s*<tbody>"+
49
"\s*<row\b[^<>]*>\s*<entry\b[^<>]*>\s*<informaltable\b","s")
51
# --- matching close sequence of such a nested table collapses to one close tag
m()("</informaltable>\s*</entry>\s*</row>"+
52
"\s*</tbody>\s*</tgroup>\s*</informaltable>", "s")
53
>> "</informaltable>",
54
# --- width="100%" tables additionally get pgwide="1"
m()("(<informaltable[^<>]*\swidth=\"100\%\")","s") >> "\\1 pgwide=\"1\"",
55
# --- a first cell with width="50%" gets two equal-width <colspec> elements
# --- inserted in front of the <tbody>
m()("(<tbody>\s*<row[^<>]*>\s*<entry[^<>]*\s)(width=\"50\%\")","s")
56
>> "<colspec colwidth=\"1*\" /><colspec colwidth=\"1*\" />\n\\1\\2",
57
# --- <nobr> combined with <tt>/<code> (optionally hugging quote characters)
# --- becomes <cmdsynopsis>
m()("<nobr>([\'\`]*)<tt>") >> "<cmdsynopsis>\\1",
58
m()("</tt>([\'\`]*)</nobr>") >> "\\1</cmdsynopsis>",
59
m()("<nobr><(?:tt|code)>([\`\"\'])") >> "<cmdsynopsis>\\1",
60
m()("<(?:tt|code)><nobr>([\`\"\'])") >> "<cmdsynopsis>\\1",
61
m()("([\`\"\'])</(?:tt|code)></nobr>") >> "\\1</cmdsynopsis>",
62
m()("([\`\"\'])</nobr></(?:tt|code)>") >> "\\1</cmdsynopsis>",
63
# --- remaining <tt> -> <constant>, <code> -> <literal>
m()("(</?)tt>") >> "\\1constant>",
64
m()("(</?)code>") >> "\\1literal>",
65
# --- text ending in <br> is wrapped in <highlights>; any other <br> is
# --- rewritten as the self-closing <br />
m()(">([^<>]+)<br>","s") >> "><highlights>\\1</highlights>",
66
m()("<br>") >> "<br />",
67
# m()("<date>") >> "<sect1info><date>",
68
# m()("</date>") >> "</date></sect1info>",
69
# NOTE(review): meaning of the trailing `>> 1` is not visible here -
# presumably a replacement count; confirm against `m`.
m()("<reference>") >> "<reference id=\"reference\">" >> 1,
70
# --- hyperlinks: external http/ftp/mailto targets become <ulink>
m()("<a\s+href=\"((?:http|ftp|mailto):[^<>]+)\"\s*>((?:.(?!</a>))*.)</a>"
71
,"s") >> "<ulink url=\"\\1\">\\2</ulink>",
72
# --- links into zziplib.html become internal <link> elements
# NOTE(review): the next two replacements use Perl-style "$1"/"$2" while every
# other rule uses "\\1"/"\\2" - Python's re does not treat "$1" as a group
# reference, so these may emit the text "$1"/"$2" literally; confirm.
m()("<a\s+href=\"zziplib.html\#([\w_]+)\"\s*>((?:.(?!</a>))*.)</a>","s")
73
>> "<link linkend=\"$1\">$2</link>",
74
m()("<a\s+href=\"(zziplib.html)\"\s*>((?:.(?!</a>))*.)</a>","s")
75
>> "<link linkend=\"reference\">$2</link>",
76
# --- other local *.html pages link by file name; *.h/*.c/*.am/*.txt and
# --- UPPERCASE.EXT files become file: <ulink> targets
m()("<a\s+href=\"([\w-]+[.]html)\"\s*>((?:.(?!</a>))*.)</a>","s")
77
>> "<link linkend=\"\\1\">\\2</link>",
78
m()("<a\s+href=\"([\w-]+[.](?:h|c|am|txt))\"\s*>((?:.(?!</a>))*.)</a>"
79
,"s") >> "<ulink url=\"file:\\1\">\\2</ulink>",
80
m()("<a\s+href=\"([A-Z0-9]+[.][A-Z0-9]+)\"\s*>((?:.(?!</a>))*.)</a>","s")
81
>> "<ulink url=\"file:\\1\">\\2</ulink>"
82
# m()("(</?)subtitle>") >> "\\1para>"
83
# $_ .= "</sect1>" if /<sect1[> ]/
86
# --- second group of rules, written with raw string literals:
# --- <br> removed, <em> -> <emphasis>, <code> -> <userinput>,
# --- <link> -> <function>, whitespace before </screen> stripped,
# --- <ul>/<li> -> <itemizedlist>/<listitem><para>
m()(r"<br\s*/?>") >> "",
87
m()(r"(</?)em>") >> r"\1emphasis>",
88
m()(r"<code>") >> "<userinput>",
89
m()(r"</code>") >> "</userinput>",
90
m()(r"<link>") >> "<function>",
91
m()(r"</link>") >> "</function>",
92
m()(r"(?s)\s*</screen>") >> "</screen>",
93
# m()(r"<ul>") >> "</para><programlisting>\n",
94
# m()(r"</ul>") >> "</programlisting><para>",
95
m()(r"<ul>") >> "<itemizedlist>",
96
m()(r"</ul>") >> "</itemizedlist>",
99
m()(r"<li>") >> "<listitem><para>",
100
m()(r"</li>") >> "</para></listitem>\n",
102
class htm2dbk_conversion(htm2dbk_conversion_base):
# Text converter built on the rules of htm2dbk_conversion_base.
104
# Text substituted for the "<!--VERSION-->" marker; empty by default
# (the date-based value is left commented out).
self.version = "" # str(date.today)
106
# convert(text): substitute the version marker, loop over self.regexlist
# (presumably the rule list of the base class - confirm), and finally
# replace the "--filename--" placeholder with self.filename.
def convert(self,text): # $text
107
txt = text.replace("<!--VERSION-->", self.version)
108
for conv in self.regexlist:
110
return txt.replace("--filename--", self.filename)
111
# convert2(text): starts like convert() - version marker first, then a loop
# over the rules.
# NOTE(review): this loops over the same self.regexlist as convert();
# confirm that a different rule list was not intended here.
def convert2(self,text): # $text
112
txt = text.replace("<!--VERSION-->", self.version)
113
for conv in self.regexlist:
117
class htm2dbk_document(htm2dbk_conversion):
118
""" create document, add(text) and get the value() """
120
# DOCTYPE declaration for DocBook XML V4.1.2, assembled from string pieces
'<!DOCTYPE book PUBLIC "-//OASIS//DTD'+
121
' DocBook XML V4.1.2//EN"'+"\n"+
122
' "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd">'+
124
# Fixed markup placed around the converted text: the book opens with one
# chapter titled "Documentation"; the chapter and book close tags are kept
# separately so the chapter can be closed early.
book_start = '<book><chapter><title>Documentation</title>'+"\n"
125
book_end_chapters = '</chapter>'+"\n"
126
book_end = '</book>'+"\n"
128
# Initialise the converter part, then start the document text with the
# doctype followed by the book opening.
htm2dbk_conversion.__init__(self)
129
self.text = self.doctype + self.book_start
131
# When the text collected so far contains "<reference" (presumably `&` with
# an m() pattern is a match test - confirm), append the chapter close tag
# and clear it on the instance so it is not appended a second time.
if self.text & m()("<reference"):
132
self.text += self.book_end_chapters ; self.book_end_chapters = ""
133
# Append the converted text; further rules turn <link> into <function> and
# remove the <sect1info> wrapper around a <date> inside <refentryinfo>.
self.text += self.convert(text).replace(
135
m()("<link>([^<>]*)</link>") >> "<function>\\1</function>") & (
136
m()("(?s)(<refentryinfo>\s*)<sect1info>" +
137
"(<date>[^<>]*</date>)</sect1info>") >> "\\1\\2")
139
# Complete document: collected text plus the closing tags still pending.
return self.text + self.book_end_chapters + self.book_end
141
def htm2dbk_files(args):
# Feed the files named in `args` into a single htm2dbk_document.
142
doc = htm2dbk_document()
143
for filename in args:
145
f = open(filename, "r")
146
# Record the current file name on the document; convert() substitutes
# self.filename for the "--filename--" placeholder.
doc.filename = filename
150
# Python 2 "print >>" form: the message goes to stderr, not stdout.
print >> sys.stderr, "can not open "+filename
153
def html2docbook(text):
    """ Simulate HTML markup from a C comment with docbook tags.

    The text is passed through convert2() of a freshly created
    htm2dbk_conversion and the converted text is returned.
    """
    converter = htm2dbk_conversion()
    docbook_text = converter.convert2(text)
    return docbook_text
157
if __name__ == "__main__":
    # Command-line entry point: every argument after the script name is
    # handed to htm2dbk_files() and its result is written to stdout.
    # The parenthesised call form parses under Python 3 as well; with a
    # single argument it prints the same text under Python 2.
    print(htm2dbk_files(sys.argv[1:]))