3
# This is a shell script that calls a perl script;
4
if [ -e /usr/local/bin/perl ] ; then
5
exec /usr/local/bin/perl -x $0 ${1+"$@"}
7
exec perl -x $0 ${1+"$@"}
10
# The reason for this is because MaraDNS is built on Centos 3.X
11
# (RHEL 3 clone) and the Perl Centos comes with takes over 30 seconds to make
12
# the changelog; Perl 5.8.7 does it in under 4 seconds.
14
# /usr/local/bin/perl is a local compile of Perl which, on MaraDNS' build
15
# system, is able to quickly process the changelog. The script looks for
16
# Perl at /usr/local/bin/perl then anywhere else in the user's path.
19
# Convert an ej-formatted doc in to a man page
20
# Input: First argument or standard input
21
# Output: Standard output
23
$FILENAME = shift || "/////";
25
# Tmp dir (used for running iconv on non-8859-1 pages)
26
$TMP = $ENV{'HOME'} . "/tmp";
27
# Make this just /tmp at your own risk. You have been warned.
30
die "Fatal: Please create a directory entitled " . $TMP . "\n";
35
# This makes the script happy when run with both Perl 5.8.0 and
36
# Perl 5.8.8; basically 5.8.0 had a lot of unhappy Unicode bugs
37
# and so they changed the behavior for later releases. It is possible
38
# to make a Unicode-happy script that runs unchanged Unicode-happy
39
# on both 5.8.0 and 5.8.8, but the contortions I had to do were
42
# I would like to thank all of the helpful people in the newsgroup
43
# comp.lang.perl.misc for their assistance; I couldn't have done it
47
if($FILENAME ne "/////") {
49
open(STDIN,"< $FILENAME");
52
binmode(STDIN,":utf8");
54
while(<STDIN>){$doc .= $_}
56
#$* = 1; # Match multiple lines
58
# Get rid of <!-- ... --> comments
59
$doc =~ s|<\!\-\-.*?\-\->||msg;
62
if($doc =~ m|<head>(.*?)</head>|ims) {
66
die "Fatal: Document must have a heading section\n";
69
# Make sure the header has
70
# <meta HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=XXX">
71
# Where XXX is any character set
73
m|meta\s+http\-equiv\=\"content\-type\"\s+content\=\"text\/html\;\s+charset=|i)
75
print "Please have somthing like this:\n";
77
'<META HTTP-EQUIV="Content-Type" CONTENT="text/html; CHARSET=utf-8">';
79
die "Fatal: Header must declare charset\n";
82
if($header =~ /charset=([^"]+)/i) {
84
#print "Charset: $charset\n";
87
die "Fatal: Error determining charset of document\n";
91
# Sanitize charset; only allow letters, numbers, and the dash
92
$charset =~ s/[^A-Za-z0-9\-]//g;
94
#$doc = conv_to_latin1($doc); # Not in Fedora core 2
96
# OK, see if we have a DTWIDTH header in the ej document. If so, use that
97
# value to determine how wide to make "dt" values in the formatted man page
99
if($header =~ m|<dtwidth>(.*?)</dtwidth>|ims) {
102
die "Fatal: DTWIDTH tag can only have a numeric argument.\n";
107
# The nroff to convert a DT tag in to
111
# OK, the header looks kosher. Start generating nroff
113
print '.\" Do *not* edit this file; it was automatically generated by ej2man';
115
print '.\" Look for a name.ej file with the same name as this filename';
118
#print '.\" Process this file with the following on iso-8859-1 terminals:'."\n";
119
#print '.\" nroff -man -Tlatin1 maradns.8' . "\n";
120
print '.\" Process this file with the following' . "\n";
121
print '.\" nroff -man -Tutf8 maradns.8 | tr \'\020\' \' \'' . "\n";
124
$ts = localtime(time());
125
print '.\" Last updated ' . $ts . "\n";
128
# OK, see if we have a TH header in the ej document. If so, add that to the
129
# man page. If not, generate a generic TH
131
if($header =~ m|<th>(.*?)</th>|ims) {
138
print '.\" We don\'t want hyphenation (it\'s too ugly)' . "\n";
139
print '.\" We also disable justification when using nroff' . "\n";
140
print '.\" Due to the way the -mandoc macro works, this needs to be placed'
142
print '.\" after the .TH heading' . "\n";
146
print '.\" We need the following stuff so that we can have single quotes' .
148
print '.\" In both groff and other UNIX *roff processors' . "\n";
149
print '.if \n(.g .mso www.tmac' . "\n";
150
print '.ds aq \(aq' . "\n";
151
print '.if !\\n(.g .if \'\\(aq\'\' .ds aq \\\'' . "\n";
154
# Enough of header processing; let's get to the body of the document
157
if($doc =~ m|<body>(.*?)</body>|ims) {
161
die "Fatal: Document must have a body section\n";
164
$body = process_body($body,0);
172
# And this processes the body (we do this way so we can recursively handle
173
# those pesky PRE flags)
175
my($body,$inrecurse) = @_;
176
my($hack,$rest,$filename);
180
while($body =~ m|\<include\s+\"([^"]+)\"\s*\>|ims) {
182
open(FILE,"< $filename") || die "Can not find file $filename\n";
184
while(<FILE>) {$hack .= $_}
186
#$hack = conv_to_latin1($hack);
187
#$hack = process_body($hack);
188
$body =~ s|\<include\s+\"([^"]+)\"\s*\>|$hack|ims;
191
# Get rid of any </?BLOCKQUOTE> tags in bulletted lists; the NROFF macros
192
# can not handle nesting
193
if($inrecurse == 0) {
194
@parts = split(m|</?ul>|im,$body);
197
for($hack = 0; $hack <= $#parts; $hack++) {
198
if($hack % 2 == 0) { # If we are not in a bulleted list
199
$body .= $parts[$hack];
200
if($hack < $#parts) {
205
$parts[$hack] =~ s|</?blockquote>||g;
206
$body .= $parts[$hack];
207
$body .= "\n</ul>\n";
213
# is made a simple whitespace
214
while($body =~ m|\ \;|ims) {
215
$body =~ s|\ \;| |ims;
218
# The HIBIT tag (replace with ALT text then remove)
219
while($body =~ m|<hibit alt=\"([^"]*)\">(.*?)</hibit>|ims) {
220
$body =~ s|<hibit alt=\"([^"]*)\">(.*?)</hibit>|$1|ims;
222
$body =~ s|<hibit[^>]*>(.*?)</hibit>||imsg;
223
# Because of how PRE is handled, we need to delete anything starting
224
# with <hibit> to the end of string and anything from the beginning
225
# of the string to </hibit>, since we may only be processing a subpart
226
# of the whole text. (Disabled because this is above the PRE processing
227
#$body =~ s|<hibit alt=\"([^"]*)\">.*$|$1|is;
228
#$body =~ s|<hibit[^>]*>.*$||is;
229
#$body =~ s|^.*</hibit>||is;
232
@parts = split(m|</?pre>|im,$body);
235
for($hack=0;$hack <= $#parts; $hack++) {
236
if($hack %2 == 0) { # If we are not in a <pre> section
237
$body .= process_body($parts[$hack],1);
241
# Deal with back slashes
242
$parts[$hack] =~ s/\\/\\\\/g;
243
# Make single quotes literal single quotes ('\(aq' in troff)
244
$parts[$hack] =~ s/\'/\\\(aq/g;
245
# Handle the á character (Debian's lint complains if the
246
# man page has raw hi-bit characters)
247
$parts[$hack] =~ s|\xc3\xa1|\\\(\'a|g;
248
$body .= $parts[$hack];
253
# Make á \('a so *roff can digest this. We have to put this here
254
# so the sequence remains a *roff command
255
$body =~ s|á|\\\(\'a|g;
260
# Backslashes need to be escaped in *roff source
261
$body =~ s/\\/\\\\/g;
262
# As do single quotes
263
$body =~ s/\'/\\\(aq/g;
266
while($body =~ m|<h1>(.*?)</h1>|ims) {
270
$body =~ s|<h1>(.*?)</h1>|\n.SH "$hack"\n.PP\n|ims;
274
while($body =~ m|<h2>(.*?)</h2>|ims) {
278
$body =~ s|<h2>(.*?)</h2>|\n.PP\n.in -3\n\\fB$hack\\fR\n.PP\n|ims;
281
# The A tag (and /A closer)
282
$body =~ s|</?a[^>]+>||img;
283
$body =~ s|</?a>||img;
285
# The TT tag (and /TT closer)
286
$body =~ s|</?tt>||img;
289
$body =~ s|<hr>|\n.PP\n.RS 28\n* * *\n.RE\n.PP\n|img;
292
$body =~ s|<hinclude[^>]+>||img;
295
$body =~ s|<blockquote>|\n.PP\n.RS 4\n|img;
296
$body =~ s|</blockquote>|\n.RE\n.PP\n|img;
299
while($body =~ m|<b>(.*?)</b>(\S+)?|ims) {
302
if($rest =~ /[<>]/) {
303
die "ej2man can't handle a tag immediately after a B tag\nthe offending text is $rest\n";
305
if($hack =~ m|<\?i>|) {
306
die "No I tags are allowed inside B tags\n";
312
$body =~ s|<b>(.*?)</b>\S+|\n.BR "$hack" "$rest"\n|ims;
315
$body =~ s|<b>(.*?)</b>|\n.B "$hack"\n|ims;
320
while($body =~ m|<i>(.*?)</i>(\S+)?|ims) {
323
if($rest =~ /[<>]/) {
324
print "The stuff in the I: $hack\n";
325
die "ej2man can't handle a tag immediately after a I tag\nthe offending text is $rest\n";
327
if($hack =~ m|<\?b>|) {
328
die "No B tags are allowed inside I tags\n";
334
$body =~ s|<i>(.*?)</i>\S+|\n.IR "$hack" "$rest"\n|ims;
337
$body =~ s|<i>(.*?)</i>|\n.I "$hack"\n|ims;
341
# Get rid of any multiple newlines
342
$body =~ s/\n(\s*)\n/\n/msg;
345
$body =~ s/<p>\s*/\n.PP\n/img;
347
# The UL and tags (just nuke them)
348
$body =~ s/<[du]l>//img;
351
#$body =~ s/<li>\n?/\n.TP 2\n•\n/img; # Can't do because of groff bug
352
#$body =~ s/<li>\n?/\n.TP 2\nʘ\n/img; # World isn't ready for unicode
353
$body =~ s/<li>\n?/\n.TP 2\n*\n/img;
356
while($body =~ /<dt>(.*?)<dd>\n?/sim) {
359
$body =~ s/<dt>(.*?)<dd>\n?/\n$DTROFF\n$hack\n/sim;
362
# The /DT and /DD tags (just nuke them)
363
$body =~ s|</d[td]>||img;
365
# The /UL and /DL tag (which we don't ignore)
366
$body =~ s|</[du]l>|\n.PP\n|img;
368
# Get rid of leading space; this confuses nroff
369
$body =~ s/\n[ \t]+/\n/msg;
371
# Get rid of empty lines before a .TP or .PP flag; this never looks nice
372
$body =~ s/\n+(\n\.[TP]P)/$1/msg;
374
# Get rid of empty lines at the beginning of the segment which come
375
# before a .TP; this covers the case of a </pre> before a <li> in the
377
$body =~ s/^\n+(\.[TP]P)/$1/ms;
379
# Same with empty lines before an .RE flag; this does not look nice
380
$body =~ s/^\n+(\.RE)/$1/ms;
382
# Get rid of multiple empty lines together; this never looks nice
383
# when formatted by Nroff
384
$body =~ s/\n\n\n+/\n\n/msg;
386
# Get rid of empty lines at the end of a segment after a .TP or .PP
387
# flag to work around how <pre> tags are handled
388
$body =~ s/(\n\.[TP]P)\s*$/$1/ms;
390
# Put a newline before the .RE flag; this looks nicer
391
$body =~ s/(\n\.RE)/\n$1/msg;
392
$body =~ s/^(\.RE)/\n$1/msg;
394
# Get rid of empty lines after a .RE flag; this does not look nice either
395
$body =~ s/(\n\.RE.*?\n)\n+/$1/msg;
397
# Put a newline before the .in flag; this looks nicer
398
$body =~ s/(\n\.in)/\n$1/msg;
400
# Get rid of empty lines after a .TP or .PP flag; this never looks nice
401
$body =~ s/(\n\.[TP]P.*?\n)\n+/$1/msg;
403
# Get rid of multiple spaces; nroff (unlike EJ) honors them
404
$body =~ s/[ \t]+/ /sg;
406
# The TABLE tags (TABLE, TD, TR, /TABLE)
407
$body =~ s|<table>|.ta +5 +7 +7|ig;
408
$body =~ s|<td>|\t|ig;
409
# We also process .br tags
410
$body =~ s|<[tb]r>\n?|\n.br\n|ig;
411
$body =~ s|</table>||ig;
413
# Break long lines so the nroff source is more legible
419
# This takes a string, and braks any lines longer than 75 columns; otherwise
420
# it performs no other formatting
421
# Input: The string to format
422
# Output: The formatted string
426
my($place,$lastspace,$column,$linebegin);
428
$place = $lastspace = $column = $linebegin = 0;
430
# Get rid of trailing white space, which confuses this algorithm
431
$input =~ s/[ \t]+\n/\n/sg;
434
while($place < length($input)) {
435
# If we hit a whitespace, remember that this is where the last
436
# (previous) space character is
437
if(substr($input,$place,1) =~ /[ \t]/) {
440
# If we hit the end of a line reset the counters which tell us when
442
if(substr($input,$place,1) =~ /\n/) {
444
$linebegin = $lastspace = $place + 1;
446
# This adds the newline as needed. Note that we do not break
447
# lines which start with a .; this means the line has a man macro
448
# and breaking the line will change the formatting of the page
449
if($column > 70 && $linebegin != $lastspace &&
450
substr($input,$linebegin,1) !~ /\./) {
451
substr($input,$lastspace,1,"\n");
454
$linebegin = $lastspace = $place + 1;
463
# Convert a given string from whatever encoding to latin1, since groff is
464
# still in the dark ages and can not handle utf-8 input (not true in
466
# Input: String to convert
467
# Output: Converted string
468
# Global variables used: $charset (assumed to already be sanitized)
469
# $TMP (also assumed to be sanitized)
472
if($charset =~ /latin1/i || $charset =~ /8859.1/) {
475
open(TFILE,"> $TMP/conv.$$") || die "Can not open $TMP/conv.$$: $!\n";
478
# if($charset !~ /utf.?8/i) {
479
system("iconv -f $charset -t latin1 $TMP/conv.$$ > $TMP/converted.$$");
482
# system("utf8tol1 < $TMP/conv.$$ > $TMP/converted.$$");
484
open(TFILE,"< $TMP/converted.$$") ||
485
die "Can not open $TMP/converted.$$: $!\n";
493
#unlink("$TMP/conv.$$") || die "Can not erase $TMP/conv.$$: $!\n";
494
#unlink("$TMP/converted.$$") ||die "Can not erase $TMP/convierted.$$: $!\n";