2
eval 'exec perl -S $0 "$@"'
3
if $running_under_some_shell; # never set: the eval/exec above only fires when a shell (not perl) runs this file
6
# Convert raw text to something with a little HTML formatting
8
# Written by Seth Golub <seth@cs.wustl.edu>
9
# http://www.cs.wustl.edu/~seth/txt2html/
11
# Modified to serve the formatting requirements of GraphicsMagick by
12
# Bob Friesenhahn <bfriesen@simple.dallas.tx.us>
15
# $Date: 1994/12/28 20:10:25 $
19
# $Log: txt2html.pl,v $
20
# Revision 1.10 1994/12/28 20:10:25 seth
21
# * Added --extract, etc.
23
# Revision 1.9 94/12/13 15:16:23 15:16:23 seth (Seth Golub)
24
# * Changed from #!/usr/local/bin/perl to the more clever version in
25
# the man page. (How did I manage not to read this for so long?)
26
# * Swapped hrule & header back to handle double lines. Why should
27
# this order screw up headers?
29
# Revision 1.8 1994/11/30 21:07:03 seth
30
# * put mail_anchor back in. (Why did I take this out?)
31
# * Finally added handling of lettered lists (ordered lists marked with
33
# * Added title option (--title, -t)
34
# * Shortline now looks at how long the line was before txt2html
35
# started adding tags. ($line_length)
36
# * Changed list references to scalars where appropriate. (@foo[0] -> $foo[0])
37
# * Added untabify() to homogenize leading indentation for list
38
# prefixes and functions that use line length
39
# * Added "underline tolerance" for when underlines are not exactly the
40
# same length as what they underline.
41
# * Added error message for unrecognized options
42
# * removed \w matching on --capstag
43
# * Tagline now removes leading & trailing whitespace before tagging
44
# * swapped order of caps & heading in main loop
45
# * Cleaned up code for speed and to get rid of warnings
46
# * Added more restrictions to something being a mail header
47
# * Added indentation for lists, just to make the output more readable.
48
# * Fixed major bug in lists: $OL and $UL were never set, so when a
49
# list was ended "</UL>" was *always* used!
50
# * swapped order of hrule & header to properly handle long underlines
52
# Revision 1.7 94/10/28 13:16:11 13:16:11 seth (Seth Golub)
53
# * Added to comments in options section
54
# * renamed blank to is_blank
55
# * Page break is converted to horizontal rule <HR>
56
# * moved usage subroutine up top so people who look through code see
59
# Revision 1.6 94/10/28 12:43:46 12:43:46 seth (Seth Golub)
60
# * Creates anchors at each heading
62
# Revision 1.5 94/07/14 17:43:59 17:43:59 seth (Seth Golub)
63
# * Fixed minor bug in Headers
64
# * Preformatting can be set to only start/stop when TWO lines of
65
# [non]formatted-looking-text are encountered. Old behavior is still
66
# possible through command line options (-pb 1 -pe 1).
67
# * Can preformat entire document (-pb 0) or disable preformatting
69
# * Fixed minor bug in CAPS handling (paragraph breaks broke)
70
# * Puts paragraph tags *before* paragraphs, not just between them.
72
# Revision 1.4 94/06/20 16:42:55 16:42:55 seth (Seth Golub)
73
# * Allow ':' for numbered lists (e.g. "1: Figs")
74
# * Whitespace at end of line will not start or end preformatting
75
# * Mailmode is now off by default
76
# * Doesn't break short lines if they are the first line in a list
77
# item. It *should* break them anyway if the next line is a
78
# continuation of the list item, but I haven't dealt with this yet.
79
# * Added action on lines that are all capital letters. You can change
80
# how these lines get tagged, as well as the minimum number of
81
# consecutive capital letters required to fire off this action.
83
# Revision 1.3 94/05/17 15:58:58 15:58:58 seth (Seth Golub)
84
# * Tiny bugfix in unhyphenation
86
# Revision 1.2 94/05/16 18:15:16 18:15:16 seth (Seth Golub)
87
# * Added unhyphenation
89
# Revision 1.1 94/05/16 16:19:03 16:19:03 seth (Seth Golub)
93
# 1.02 Allow '-' in mail headers
94
# Added handling for multiline mail headers
98
# Oscar Nierstrasz has a nice script for hypertextifying URLs.
100
# http://cui_www.unige.ch/ftp/PUBLIC/oscar/scripts/html.pl
103
#########################
104
# Configurable options
107
# [-s <n> ] | [--shortline <n> ]
108
$short_line_length = 40; # Lines this short (or shorter) must be
109
# intentionally broken and are kept
112
# [-p <n> ] | [--prewhite <n> ]
113
$preformat_whitespace_min = 5; # Minimum number of consecutive leading
114
# whitespace characters to trigger
116
# NOTE: Tabs are now expanded to
117
# spaces before this check is made.
118
# That means if $tab_width is 8 and
119
# this is 5, then one tab is expanded
120
# to 8 spaces, which is enough to
121
# trigger preformatting.
123
# [-pb <n> ] | [--prebegin <n> ]
124
$preformat_trigger_lines = 2; # How many lines of preformatted-looking
125
# text are needed to switch to <PRE>
126
# <= 0 : Preformat entire document
127
# 1 : one line triggers
128
# >= 2 : two lines trigger
130
# [-pe <n> ] | [--preend <n> ]
131
$endpreformat_trigger_lines = 2; # How many lines of unpreformatted-looking
132
# text are needed to switch from <PRE>
133
# <= 0 : Never preformat within document
134
# 1 : one line triggers
135
# >= 2 : two lines trigger
136
# NOTE for --prebegin and --preend:
137
# A zero takes precedence. If one is zero, the other is ignored.
138
# If both are zero, entire document is preformatted.
141
# [-r <n> ] | [--hrule <n> ]
142
$hrule_min = 4; # Min number of ---s for an HRule.
144
# [-c <n> ] | [--caps <n> ]
145
$min_caps_length = 3; # min sequential CAPS for an all-caps line
147
# [-ct <tag> ] | [--capstag <tag> ]
148
$caps_tag = "STRONG"; # Tag to put around all-caps lines
150
# [-m/+m ] | [--mail / --nomail ]
151
$mailmode = 0; # Deal with mail headers & quoted text
153
# [-u/+u ] | [--unhyphen / --nounhyphen ]
154
$unhyphenation = 1; # Enables unhyphenation of text.
156
# [-a <file> ] | [--append <file> ]
157
# [+a ] | [--noappend ]
158
$append_file = 0; # If you want something appended by
159
# default, put the filename here.
160
# The appended text will not be
161
# processed at all, so make sure it's
162
# plain text or decent HTML. i.e. do
163
# not have things like:
164
# Seth Golub <seth@cs.wustl.edu>
166
# Seth Golub <seth@cs.wustl.edu>
168
# [-t <title>] | [--title <title> ]
169
$title = 0; # You can specify a title.
170
# Otherwise it won't put one in.
172
# [-ul <n> ] | [--underlinelong <n> ]
173
$underline_tolerance_long = 1; # How much longer can underlines
174
# be and still be underlines?
176
# [-us <n> ] | [--underlineshort <n> ]
177
$underline_tolerance_short = 1; # How much shorter can underlines
178
# be and still be underlines?
180
# [-tw <n> ] | [--tabwidth <n> ]
181
$tab_width = 8; # How many spaces equal a tab?
184
# [-iw <n> ] | [--indent <n> ]
185
$indent_width = 2; # Indents this many spaces for each
188
# [-e/+e ] | [--extract / --noextract ]
189
$extract = 0; # Extract Mode (suitable for inserting)
191
# END OF CONFIGURABLE OPTIONS
192
########################################
195
########################################
196
# Definitions (Don't change these)
216
local($s) = " " x length($0);
222
$s [-v ] | [--version ]
224
$s [-s <n> ] | [--shortline <n> ]
225
$s [-p <n> ] | [--prewhite <n> ]
226
$s [-pb <n> ] | [--prebegin <n> ]
227
$s [-pe <n> ] | [--preend <n> ]
228
$s [-e/+e ] | [--extract / --noextract ]
229
$s [-r <n> ] | [--hrule <n> ]
230
$s [-c <n> ] | [--caps <n> ]
231
$s [-ct <tag> ] | [--capstag <tag> ]
232
$s [-m/+m ] | [--mail / --nomail ]
233
$s [-u/+u ] | [--unhyphen / --nounhyphen ]
234
$s [-a <file> ] | [--append <file> ]
235
$s [+a ] | [--noappend ]
236
$s [-t <title>] | [--title <title> ]
237
$s [-tw <n> ] | [--tabwidth <n> ]
238
$s [-iw <n> ] | [--indent <n> ]
239
$s [-ul <n> ] | [--underlinelong <n> ]
240
$s [-us <n> ] | [--underlineshort <n> ]
242
More complete explanations of these options can be found in
243
comments near the beginning of the script.
249
sub deal_with_options
251
while ($ARGV[0] =~ /^[-+].+/)
253
if (($ARGV[0] eq "-r" || $ARGV[0] eq "--hrule") &&
256
$hrule_min = $ARGV[1];
261
if (($ARGV[0] eq "-s" || $ARGV[0] eq "--shortline") &&
264
$short_line_length = $ARGV[1];
269
if (($ARGV[0] eq "-p" || $ARGV[0] eq "--prewhite") &&
272
$preformat_whitespace_min = $ARGV[1];
277
if (($ARGV[0] eq "-pb" || $ARGV[0] eq "--prebegin") &&
280
$preformat_trigger_lines = $ARGV[1];
285
if (($ARGV[0] eq "-pe" || $ARGV[0] eq "--preend") &&
288
$endpreformat_trigger_lines = $ARGV[1];
293
if (($ARGV[0] eq "-e" || $ARGV[0] eq "--extract"))
300
if (($ARGV[0] eq "+e" || $ARGV[0] eq "--noextract"))
307
if (($ARGV[0] eq "-c" || $ARGV[0] eq "--caps") &&
310
$min_caps_length = $ARGV[1];
315
if (($ARGV[0] eq "-ct" || $ARGV[0] eq "--capstag") &&
318
$caps_tag = $ARGV[1];
323
if ($ARGV[0] eq "-m" || $ARGV[0] eq "--mail")
329
if ($ARGV[0] eq "+m" || $ARGV[0] eq "--nomail")
335
if ($ARGV[0] eq "-u" || $ARGV[0] eq "--unhyphen")
341
if ($ARGV[0] eq "+u" || $ARGV[0] eq "--nounhyphen")
347
if (($ARGV[0] eq "-a" || $ARGV[0] eq "--append") &&
351
$append_file = $ARGV[1];
353
print STDERR "Can't find or read $ARGV[1].\n";
359
if ($ARGV[0] eq "+a" || $ARGV[0] eq "--noappend")
365
if (($ARGV[0] eq "-t" || $ARGV[0] eq "--title") &&
373
if (($ARGV[0] eq "-ul" || $ARGV[0] eq "--underlinelong") &&
376
$underline_tolerance_long = $ARGV[1];
381
if (($ARGV[0] eq "-us" || $ARGV[0] eq "--underlineshort") &&
384
$underline_tolerance_short = $ARGV[1];
389
if (($ARGV[0] eq "-tw" || $ARGV[0] eq "--tabwidth") &&
392
$tab_width = $ARGV[1];
397
# Accept "--indent" (the spelling shown in the usage text) as well as the
# historical "--indentwidth".
if (($ARGV[0] eq "-iw" || $ARGV[0] eq "--indent" || $ARGV[0] eq "--indentwidth") &&
400
$indent_width = $ARGV[1];
405
if ($ARGV[0] eq "-v" || $ARGV[0] eq "--version")
407
print '$Header: /users/hilco/seth/projects/txt2html/txt2html.pl,v 1
408
.10 1994/12/28 20:10:25 seth Exp seth $ ';
413
if ($ARGV[0] eq "-h" || $ARGV[0] eq "--help")
419
print STDERR "Unrecognized option: $ARGV[0]\n";
420
print STDERR " or bad parameter: $ARGV[1]\n" if($ARGV[1]);
430
$preformat_trigger_lines = 0 if ($preformat_trigger_lines < 0);
431
$preformat_trigger_lines = 2 if ($preformat_trigger_lines > 2);
433
$endpreformat_trigger_lines = 1 if ($preformat_trigger_lines == 0);
434
$endpreformat_trigger_lines = 0 if ($endpreformat_trigger_lines < 0);
435
$endpreformat_trigger_lines = 2 if ($endpreformat_trigger_lines > 2);
437
$underline_tolerance_long = 0 if $underline_tolerance_long < 0;
438
$underline_tolerance_short = 0 if $underline_tolerance_short < 0;
443
return $_[0] =~ /^\s*$/;
448
$line =~ s/&/&amp;/g; # Escape literal ampersands as the HTML entity
451
$line =~ s/\014/\n<HR>\n/g; # Form feeds (octal 014, page breaks) become horizontal rules
456
if ($line =~ /^\s*([-_~=\*]\s*){$hrule_min,}$/)
460
$line_action |= $HRULE;
466
if (!($mode & $PRE) &&
468
($line_length < $short_line_length) &&
469
!&is_blank($nextline) &&
470
!($line_action & ($HEADER | $HRULE | $BREAK | $LIST)))
473
$line_action |= $BREAK;
479
if ((($line =~ /^\w*>/) || # Handle "FF> Werewolves."
480
($line =~ /^\w*\|/))&& # Handle "Igor| There wolves."
481
!&is_blank($nextline))
484
$line_action |= $BREAK | $MAILQUOTE;
485
} elsif (($line =~ /^[\w\-]*:/) # Handle "Some-Header: blah"
486
&& (($previous_action & $MAILHEADER) || &is_blank($prev))
487
&& !&is_blank($nextline))
489
&anchor_mail if !($previous_action & $MAILHEADER);
491
$line_action |= $BREAK | $MAILHEADER;
492
} elsif (($line =~ /^\s+\S/) && # Handle multi-line mail headers
493
($previous_action & $MAILHEADER) &&
494
!&is_blank($nextline))
497
$line_action |= $BREAK | $MAILHEADER;
504
$line_action |= $PAR;
510
local($prefix, $number, $rawprefix);
512
return (0,0,0) if (!($line =~ /^\s*[-=\*o]\s+\S/ ) &&
513
!($line =~ /^\s*(\d+|[a-zA-Z])[\.\)\]:]\s+\S/ ));
515
($number) = $line =~ /^\s*(\d+|[a-zA-Z])/;
517
# That slippery exception of "o" as a bullet
518
# (This ought to be determined more through the context of what lists
519
# we have in progress, but this will probably work well enough.)
520
if($line =~ /^\s*o\s/)
527
($rawprefix) = $line =~ /^(\s*(\d+|[a-zA-Z]).)/;
528
$prefix = $rawprefix;
529
$prefix =~ s/(\d+|[a-zA-Z])//; # Take the number out
531
($rawprefix) = $line =~ /^(\s*[-=o\*].)/;
532
$prefix = $rawprefix;
534
($prefix, $number, $rawprefix);
539
local($prefix, $number, $rawprefix) = @_;
541
$listprefix[$listnum] = $prefix;
544
# It doesn't start with 1,a,A. Let's not screw with it.
545
if (($number != 1) && ($number ne "a") && ($number ne "A"))
549
$prev .= "$list_indent<OL>\n";
550
$list[$listnum] = $OL;
552
$prev .= "$list_indent<UL>\n";
553
$list[$listnum] = $UL;
556
$list_indent = " " x $listnum x $indent_width;
557
$line_action |= $LIST;
562
sub endlist # End N lists
565
for(; $n > 0; $n--, $listnum--)
567
$list_indent = " " x ($listnum-1) x $indent_width;
568
if($list[$listnum-1] == $UL)
570
$prev .= "$list_indent</UL>\n";
571
} elsif($list[$listnum-1] == $OL)
573
$prev .= "$list_indent</OL>\n";
576
print STDERR "Encountered list of unknown type\n";
579
$line_action |= $END;
580
$mode ^= ($LIST & $mode) if (!$listnum);
585
$line =~ s/^\s*[-=o\*]\s*/$list_indent<LI> / if $list[$listnum-1] == $UL;
586
$line =~ s/^\s*(\d+|[a-zA-Z]).\s*/$list_indent<LI> / if $list[$listnum-1
588
$line_action |= $LIST;
595
local($prefix, $number, $rawprefix) = &listprefix($line);
600
return if !&is_blank($prev); # inside a list item
602
# This ain't no list. We'll want to end all of them.
603
return if !($mode & $LIST); # This just speeds up the inevitable
607
# Maybe we're going back up to a previous list
608
$i-- while (($prefix ne $listprefix[$i-1]) && ($i >= 0));
611
if (($i >= 0) && ($i != $listnum))
613
&endlist($listnum - $i);
614
} elsif (!$listnum || $i != $listnum)
616
&startlist($prefix, $number, $rawprefix);
619
&continuelist($prefix, $number, $rawprefix) if ($mode & $LIST);
624
if(!($line =~ /\s{$preformat_whitespace_min,}\S+/) &&
625
($endpreformat_trigger_lines == 1 ||
626
!($nextline =~ /\s{$preformat_whitespace_min,}\S+/)))
628
$prev =~ s#$#\n</PRE>#;
629
$mode ^= ($PRE & $mode);
630
$line_action |= $END;
636
if($preformat_trigger_lines == 0 ||
637
(($line =~ /\s{$preformat_whitespace_min,}\S+/) &&
638
($preformat_trigger_lines == 1 ||
639
$nextline =~ /\s{$preformat_whitespace_min,}\S+/)))
641
$line =~ s/^/<PRE>\n/;
644
$line_action |= $PRE;
656
local($text) = $line =~ /\S+: *(.*) *$/;
657
local($anchor) = &make_new_anchor($text);
658
$line =~ s/(.*)/<A NAME="$anchor">$1<\/A>/;
663
local($heading) = @_;
664
local($anchor) = &make_new_anchor($heading);
665
$line =~ s/(<H.>.*<\/H.>)/<A NAME="$anchor">$1<\/A>/;
670
local($hindent, $heading) = $line =~ /^(\s*)(.+)$/;
671
$hindent = 0; # This isn't used yet, but Perl warns of
672
# "possible typo" if I declare a var
673
# and never reference it.
675
# This is now taken care of in main()
676
# $heading =~ s/\s+$//; # get rid of trailing whitespace.
678
local($underline) = $nextline =~ /^\s*(\S+)\s*$/;
680
if((length($heading) > (length($underline) + $underline_tolerance_short))
681
|| (length($heading) < (length($underline) -$underline_tolerance_long)))
686
# $underline =~ s/(^.).*/$1/; # Could I do this any less efficiently?
687
$underline = substr($underline,0,1);
690
$hlevel = 1 if $underline eq "*";
691
$hlevel = 2 if $underline eq "=";
692
$hlevel = 3 if $underline eq "+";
693
$hlevel = 4 if $underline eq "-";
694
$hlevel = 5 if $underline eq "~";
695
$hlevel = 6 if $underline eq ".";
698
$nextline = <STDIN>; # Eat the underline
699
&tagline("H${hlevel}");
700
&anchor_heading($heading);
701
$line_action |= $HEADER;
708
# This looks hairy because of all the quoted characters.
709
# All I'm doing is pulling out the word that begins the next line.
710
# Along with it, I pull out any punctuation that follows.
711
# Preceding whitespace is preserved. We don't want to screw up
712
# our own guessing systems that rely on indentation.
713
($second) = $nextline =~ /^\s*([a-zA-Z]+[\)\}\]\.,:;\'\"\>]*\s*)/; # "
714
$nextline =~ s/^(\s*)[a-zA-Z]+[\)\}\]\.,:;\'\"\>]*\s*/$1/; # "
715
# (The silly comments are for my less-than-perfect code hilighter)
717
$line =~ s/\-\s*$/$second/;
723
local($oldws) = $line =~ /^([ \011]+)/;
724
local($oldlen) = (length($oldws));
727
for($i=0, $column = 0; $i < $oldlen; $i++)
729
if(substr($oldws, $i, 1) eq " ")
733
$column += $tab_width - ($column % $tab_width);
736
$line = (" " x $column) . substr($line, $oldlen);
742
$line =~ s/^\s*(.*)\s*$/<$tag>$1<\/$tag>\n/;
747
if($line =~ /^[^a-z<]*[A-Z]{$min_caps_length,}[^a-z<]*$/)
750
$line_action |= $CAPS;
762
print q(<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN") . "\n";
763
print q( "http://www.w3.org/TR/html4/loose.dtd">) . "\n";
767
# It'd be nice if we could guess a title from the first header,
768
# but even that would be too late if we're doing this in one pass.
769
print "<TITLE>$title</TITLE>\n" if($title);
773
@page { size: 8.5in 11in }
774
TD P { color: #000000; font-family: "Verdana", "Arial", "Helvetica", sans-serif; font-size: 12pt }
775
P { color: #000000; font-family: "Verdana", "Arial", "Helvetica", sans-serif; font-size: 12pt }
776
A:link { color: #0085c0 }
777
A:visited { color: #800080 }
784
print q(<body LANG="en-US" TEXT="#000000" LINK="#0085c0" VLINK="#800080" BGCOLOR="#ffffff">) . "\n";
787
print q(<table BORDER=0 WIDTH="100%" >) . "\n";
788
print q(<tr>) . "\n";
789
print q(<td BGCOLOR="#52799E"><img SRC="../images/right_triangle.png" ALT=">" height=14 width=15>) . "\n";
790
print q(<b><font face="Helvetica, Arial"><font color="#FFFFFF"><font size="+1">);
792
print q(</font></font></font></b></td>) . "\n";
793
print q(</tr>) . "\n";
794
print q(</table>) . "\n";
803
$line =~ s/[ \011]*$//; # Chop trailing whitespace
805
&untabify; # Change leading whitespace into spaces
807
$line_length = length($line); # Do this before tags go in
811
&endpreformat if (($mode & $PRE) && ($preformat_trigger_lines != 0));
813
&hrule if !($mode & $PRE);
815
&heading if (!($mode & $PRE) &&
816
$nextline =~ /^\s*[=\-\*\.~\+]+$/);
818
&caps if !($mode & $PRE);
820
&liststuff if (!($mode & $PRE) &&
823
&mailstuff if ($mailmode &&
825
!($line_action & $HEADER));
827
&preformat if (!($line_action & ($HEADER | $LIST | $MAILHEADER)) &&
828
!($mode & ($LIST | $PRE)) &&
829
($endpreformat_trigger_lines != 0));
831
&paragraph if ((&is_blank($prev) || ($line_action & $END)) &&
833
!($mode & ($LIST | $PRE)) && # paragraphs in lists
834
# *should* be allowed.
836
($line_action & ($CAPS | $END | $MAILQUOTE))));
840
&unhyphenate if ($unhyphenation &&
841
($line =~ /[a-zA-Z]\-$/) && # ends in hyphen
842
# next line starts w/letters
843
($nextline =~ /^\s*[a-zA-Z]/) &&
844
!($mode & ($PRE | $HEADER | $MAILHEADER | $BREAK)));
847
# Print it out and move on.
851
if (!&is_blank($nextline))
853
$previous_action = $line_action;
854
$line_action = $NONE;
860
} until (!$nextline && !$line && !$prev);
863
&endlist($listnum) if ($mode & $LIST); # End all lists
868
print "</PRE>\n" if ($mode & $PRE);
874
open(APPEND, $append_file); # NOTE(review): two-argument open takes its mode from the name itself, and $append_file can come from the command line -- confirm it is validated before this point
875
print while <APPEND>;
877
print STDERR "Can't find or read file $append_file to append.\n";
880
print q(<hr>) . "\n";
881
print q(<P ALIGN=CENTER><FONT FACE="Verdana, Arial, Helvetica, sans-serif"><A HREF="Copyright.html">Copyright</A>
882
<FONT FACE="Abadi Mt Condensed Extra Bold">&copy;</FONT>
883
GraphicsMagick Group 2002, 2003, 2004</FONT></P>);