2
# html2texi.pl -- Convert HTML documentation to Texinfo format
3
# Michael Ernst <mernst@cs.washington.edu>
4
# Time-stamp: <1999-01-12 21:34:27 mernst>
6
# This program converts HTML documentation trees into Texinfo format.
7
# Given the name of a main (or contents) HTML file, it processes that file,
8
# and other files (transitively) referenced by it, into a Texinfo file
9
# (whose name is chosen from the file or directory name of the argument).
11
# html2texi.pl api/index.html
12
# produces file "api.texi".
14
# Texinfo format can be easily converted to Info format (for browsing in
15
# Emacs or the standalone Info browser), to a printed manual, or to HTML.
16
# Thus, html2texi.pl permits conversion of HTML files to Info format, and
17
# secondarily enables producing printed versions of Web page hierarchies.
19
# Unlike HTML, Info format is searchable. Since Info is integrated into
20
# Emacs, one can read documentation without starting a separate Web
21
# browser. Additionally, Info browsers (including Emacs) contain
22
# convenient features missing from Web browsers, such as easy index lookup
23
# and mouse-free browsing.
26
# html2texi.pl is currently tuned to latex2html output (and it corrects
27
# several latex2html bugs), but should be extensible to arbitrary HTML
28
# documents. It will be most useful for HTML with a hierarchical structure
29
# and an index, and it recognizes those features as created by latex2html
30
# (and possibly by some other tools). The HTML tree to be traversed must
31
# be on local disk, rather than being accessed via HTTP.
32
# This script requires the use of "checkargs.pm". To eliminate that
33
# dependence, replace calls to check_args* by @_ (which is always the last
34
# argument to those functions).
35
# Also see the "to do" section, below.
36
# Comments, suggestions, bug fixes, and enhancements are welcome.
39
# Malformed HTML can cause this program to abort, so
40
# you should check your HTML files to make sure they are legal.
44
### Typical usage for the Python documentation:
47
# (Actually, most of this is in a Makefile instead.)
48
# The resulting Info format Python documentation is currently available at
49
# ftp://ftp.cs.washington.edu/homes/mernst/python-info.tar.gz
51
# Fix up HTML problems, eg <DT><DL COMPACT><DD> should be <DT><DD><DL COMPACT>.
53
# html2texi.pl /homes/fish/mernst/tmp/python-doc/html/api/index.html
54
# html2texi.pl /homes/fish/mernst/tmp/python-doc/html/ext/index.html
55
# html2texi.pl /homes/fish/mernst/tmp/python-doc/html/lib/index.html
56
# html2texi.pl /homes/fish/mernst/tmp/python-doc/html/mac/index.html
57
# html2texi.pl /homes/fish/mernst/tmp/python-doc/html/ref/index.html
58
# html2texi.pl /homes/fish/mernst/tmp/python-doc/html/tut/index.html
60
# Edit the generated .texi files:
61
# * change @setfilename to prefix "python-"
62
# * fix up any sectioning, such as for Abstract
63
# * make Texinfo menus
64
# * perhaps remove the @detailmenu ... @end detailmenu
65
# In Emacs, to do all this:
66
# (progn (goto-char (point-min)) (replace-regexp "\\(@setfilename \\)\\([-a-z]*\\)$" "\\1python-\\2.info") (replace-string "@node Front Matter\n@chapter Abstract\n" "@node Abstract\n@section Abstract\n") (progn (mark-whole-buffer) (texinfo-master-menu 'update-all-nodes)) (save-buffer))
77
### Structure of the code
87
# Source and destination languages
88
# --------------------------------
90
# The goal is Info files; I create Texinfo, so I don't have to worry about
91
# the finer details of Info file creation. (I'm not even sure of its exact format.)
94
# Why not start from LaTeX rather than HTML?
95
# I could hack latex2html itself to produce Texinfo instead, or fix up
96
# partparse.py (which already translates LaTeX to Texinfo).
98
# * has high-level information such as index entries, original formatting
100
# * those programs are complicated to read and understand
101
# * those programs try to handle arbitrary LaTeX input, track catcodes,
102
# and more: I don't want to go to that effort. HTML isn't as powerful
103
# as LaTeX, so there are fewer subtleties.
104
# * the result wouldn't work for arbitrary HTML documents; it would be
105
# nice to eventually extend this program to HTML produced from Docbook,
111
# I don't want to view the text as a linear stream; I'd rather parse the
112
# whole thing and then do pattern matching over the parsed representation (to
113
# find idioms such as indices, lists of child nodes, etc.).
114
# * Perl provides HTML::TreeBuilder, which does just what I want.
115
# * libwww-perl: http://www.linpro.no/lwp/
116
# * TreeBuilder: HTML-Tree-0.51.tar.gz
117
# * Python Parsers, Formatters, and Writers don't really provide the right
118
# interface (and the version in Grail doesn't correspond to another
119
# distributed version, so I'm confused about which to be using). I could
120
# write something in Python that creates a parse tree, but why bother?
122
# Other implementation language issues:
123
# * Python lacks variable declarations, reasonable scoping, and static
124
# checking tools. I've written some of the latter for myself that make
125
# my Perl programming a lot safer than my Python programming will be until
126
# I have a similar suite for that language.
129
###########################################################################
134
# Fix the problem with multiple sections in a single file (eg, Abstract in
135
# Front Matter section).
136
# Deal with cross-references, as in /homes/fish/mernst/tmp/python-doc/html/ref/types.html:310
138
# Perhaps double-check that every tag mentioned in the index is found
140
# Python: email to python-docs@python.org, to get their feedback.
141
# Compare to existing lib/ Info manual
142
# Write the hooks into info-look; replace pyliblookup1-1.tar.gz.
143
# Postpass to remove extra quotation marks around typography already in
144
# a different font (to avoid double delimiters as in "`code'"); or
145
# perhaps consider using only font-based markup so that we don't get
146
# the extra *bold* and `code' markup in Info.
148
## Perhaps don't rely on automatic means for adding up, next, prev; I have
149
## all that info available to me already, so it's not so much trouble to
150
## add it. (Right?) But it is *so* easy to use Emacs instead...
153
###########################################################################
157
# man HTML::TreeBuilder
161
# require HTML::ParserWComment;
162
require HTML::Parser;
163
require HTML::TreeBuilder;
164
require HTML::Element;
174
###########################################################################
178
# Stack of node titles for the sections enclosing the one being processed;
# process_section_file truncates it to the current depth and pushes the new title.
my @section_stack = (); # elements are chapter/section/subsec nodetitles (I think)
179
# Reference to a list that is indexed below as [1] (depth) and [2] (file).
my $current_ref_tdf; # for the file currently being processed;
180
# used in error messages
184
# First element should not be used.
185
# Texinfo sectioning command names; output_body indexes this by scalar(@section_stack)-1.
my @sectionmarker = ("manual", "chapter", "section", "subsection", "subsubsection");
187
# Maps an HTML inline tag name to the Texinfo command emitted as "@command{...}".
my %inline_markup = ("b" => "strong",
192
"strong" => "strong",
196
# Index commands queued by label_add_index_entries and flushed by do_deferred_index_entries.
my @deferred_index_entries = ();
198
# NOTE(review): process_child_links pushes plain titles onto this list, which does
# not match the "(filename, type) lists" description below -- confirm which is intended.
my @index_titles = (); # list of (filename, type) lists
199
# Maps an index page title to [indexing command, suffix]; the suffix is printed after "@defindex".
my %index_info = ("Index" => ["\@blindex", "bl"],
200
"Concept Index" => ["\@cindex", "cp"],
201
"Module Index" => ["\@mdindex", "md"]);
204
###########################################################################
205
### Main/contents page
208
# Process first-level page on its own, or just a contents page? Well, I do
209
# want the title, author, etc., and the front matter... For now, just add
210
# that by hand at the end.
213
# data structure possibilities:
214
# * tree-like (need some kind of stack when processing (or parent pointers))
215
# * list of name and depth; remember old and new depths.
217
# Each element is a reference to a list of (nodetitle, depth, filename).
218
# Global table of contents, built up by merge_contents_lists.
my @contents_list = ();
220
# The problem with doing fixups on the fly is that some sections may have
221
# already been processed (and no longer available) by the time we notice
222
# others with the same name. It's probably better to fully construct the
223
# contents list (reading in all files of interest) upfront; that will also
224
# let me do a better job with cross-references, because again, all files
225
# will already be read in.
226
# Set of titles already placed in @contents_list (value is always 1).
my %contents_hash = ();
227
# Set of titles seen more than once; process_section_file prefixes such node
# titles with the first word of the enclosing section's title.
my %contents_fixups = ();
229
# Contents entries gathered from the file currently being processed; each element
# is pushed as [ title, depth, href ] by increment_current_contents_list.
my @current_contents_list = ();
231
# Merge @current_contents_list into @contents_list,
232
# and set @current_contents_list to be empty.
233
# Walks @current_contents_list by index and reconciles each entry with the entry
# at the same index of @contents_list: entries beyond the end are appended,
# deeper entries with a different title and file are spliced in, and any other
# disagreement is fatal. Titles are recorded in %contents_hash; a title that is
# already recorded is flagged in %contents_fixups. Takes no arguments and
# empties @current_contents_list when finished.
# Dies if @current_contents_list is empty on entry.
# NOTE(review): several brace-only and `else` lines are not visible in this
# listing, so the exact nesting of the branches below should be confirmed.
sub merge_contents_lists ( )
236
# Three possibilities:
237
# * @contents_list is empty: replace it by @current_contents_list.
238
# * prefixes of the two lists are identical: do nothing
239
# * @current_contents_list is all at lower level than $contents_list[0];
240
# prefix @contents_list by @current_contents_list
242
if (scalar(@current_contents_list) == 0)
243
{ die "empty current_contents_list"; }
245
# if (scalar(@contents_list) == 0)
246
# { @contents_list = @current_contents_list;
247
# @current_contents_list = ();
250
# if (($ {$contents_list[0]}[1]) < ($ {$current_contents_list[0]}[1]))
251
# { unshift @contents_list, @current_contents_list;
252
# @current_contents_list = ();
255
for (my $i=0; $i<scalar(@current_contents_list); $i++)
256
{ my $ref_c_tdf = $current_contents_list[$i];
257
# Past the end of the main list: append the new entry.
if ($i >= scalar(@contents_list))
258
{ push @contents_list, $ref_c_tdf;
259
my $title = $ {$ref_c_tdf}[0];
260
# A title seen before needs a node-name fixup later.
if (defined $contents_hash{$title})
261
{ $contents_fixups{$title} = 1; }
263
{ $contents_hash{$title} = 1; }
265
# Compare the existing entry with the current one, field by field.
my $ref_tdf = $contents_list[$i];
266
my ($title, $depth, $file) = @{$ref_tdf};
267
my ($c_title, $c_depth, $c_file) = @{$ref_c_tdf};
269
# Different title and file at a greater depth: insert the current entry
# in front of the existing one.
if (($title ne $c_title)
270
&& ($depth < $c_depth)
271
&& ($file ne $c_file))
272
{ splice @contents_list, $i, 0, $ref_c_tdf;
273
if (defined $contents_hash{$c_title})
274
{ $contents_fixups{$c_title} = 1; }
276
{ $contents_hash{$c_title} = 1; }
279
# Any remaining difference means the two lists are inconsistent.
if (($title ne $c_title)
280
|| ($depth != $c_depth)
281
|| ($file ne $c_file))
282
{ die ("while processing $ {$current_ref_tdf}[2] at depth $ {$current_ref_tdf}[1], mismatch at index $i:",
283
"\n main: <<<$title>>> $depth $file",
284
"\n curr: <<<$c_title>>> $c_depth $c_file"); }
286
@current_contents_list = ();
291
# Set @current_contents_list to a list of (title, href, sectionlevel);
292
# then merge that list into @contents_list.
293
# Maybe this function should also produce a map
294
# from title (or href) to sectionlevel (eg "chapter"?).
295
# Builds @current_contents_list from the element $he (one entry per <LI>, via
# increment_current_contents_list), renumbers the entry depths so that they
# continue consecutively below the current section depth, drops or processes
# special entries, and finally calls merge_contents_lists.
# Argument: $he -- the element whose subtree is traversed.
# Dies if @current_contents_list is not empty on entry, or if the depth implied
# by @section_stack disagrees with the depth stored in $current_ref_tdf.
# Side effects: may call process_index_file, prints "@defindex" lines to TEXI,
# and appends to @index_titles.
sub process_child_links ( $ )
296
{ my ($he) = check_args(1, @_);
299
if (scalar(@current_contents_list) != 0)
300
{ die "current_contents_list nonempty: @current_contents_list"; }
301
$he->traverse(\&increment_current_contents_list, 'ignore text');
303
# Normalize the depths; for instance, convert 1,3,5 into 0,1,2.
305
# Collect the distinct depth values as hash keys.
# NOTE(review): the declaration of %depths is not visible in this listing.
for my $ref_tdf (@current_contents_list)
306
{ $depths{$ {$ref_tdf}[1]} = 1; }
307
# NOTE(review): plain `sort` compares as strings, so a depth of 10 would sort
# before 2; a numeric comparison may be what is intended -- confirm.
my @sorted_depths = sort keys %depths;
308
my $current_depth = scalar(@section_stack)-1;
309
my $current_depth_2 = $ {$current_ref_tdf}[1];
310
if ($current_depth != $current_depth_2)
311
{ die "mismatch in current depths: $current_depth $current_depth_2; ", join(", ", @section_stack); }
312
# Map each original depth to its rank, offset to sit below the current depth.
for (my $i=0; $i<scalar(@sorted_depths); $i++)
313
{ $depths{$sorted_depths[$i]} = $i + $current_depth+1; }
314
for my $ref_tdf (@current_contents_list)
315
{ $ {$ref_tdf}[1] = $depths{$ {$ref_tdf}[1]}; }
317
# Eliminate uninteresting sections. Hard-coded hack for now.
318
if ($ {$current_contents_list[-1]}[0] eq "About this document ...")
319
{ pop @current_contents_list; }
320
# If the second entry is "Contents", remove it by shifting the first entry
# off and storing it back over the (now first) "Contents" entry.
if ((scalar(@current_contents_list) > 1)
321
&& ($ {$current_contents_list[1]}[0] eq "Contents"))
322
{ my $ref_first_tdf = shift @current_contents_list;
323
$current_contents_list[0] = $ref_first_tdf; }
325
# Entries whose title is a key of %index_info are index pages: process them
# as indices and remove them from the contents list.
for (my $i=0; $i<scalar(@current_contents_list); $i++)
326
{ my $ref_tdf = $current_contents_list[$i];
327
my $title = $ {$ref_tdf}[0];
328
if (exists $index_info{$title})
329
{ my $index_file = $ {$ref_tdf}[2];
330
my ($indexing_command, $suffix) = @{$index_info{$title}};
331
process_index_file($index_file, $indexing_command);
332
print TEXI "\n\@defindex $suffix\n";
333
push @index_titles, $title;
334
# NOTE(review): removing element $i while iterating by index skips the
# next element unless $i is also adjusted; the line following this one
# is not visible here -- confirm.
splice @current_contents_list, $i, 1;
336
elsif ($title =~ /\bIndex$/)
337
{ print STDERR "Warning: \"$title\" might be an index; if so, edit \%index_info.\n"; } }
339
merge_contents_lists();
341
# print_contents_list();
342
# print_index_info();
346
# Traversal callback (passed to ->traverse by process_child_links). For an <LI>
# element, takes its first child, which must be an <A>, and appends
# [ title, depth, href ] to @current_contents_list.
# Arguments: $he -- element being visited; $startflag -- not used in the
# visible code; $depth -- stored as the entry's depth.
# Dies if the first child of the <LI> is not an <A>.
sub increment_current_contents_list ( $$$ )
347
{ my ($he, $startflag, $depth) = check_args(3, @_);
351
if ($he->tag eq "li")
352
{ my @li_content = @{$he->content};
353
if ($li_content[0]->tag ne "a")
354
{ die "first element of <LI> should be <A>"; }
355
# Only $href is used below; $name and @content are ignored.
my ($name, $href, @content) = anchor_info($li_content[0]);
357
# The title is all text under the anchor, concatenated, then cleaned up.
my $title = join("", collect_texts($li_content[0]));
358
$title = texi_remove_punctuation($title);
359
# The problem with these is that they are formatted differently in
363
$title =~ s/ -- / /g;
364
push @current_contents_list, [ $title, $depth, $href ]; }
368
# Simple version for section titles
369
# Converts the element $he to a Texinfo string. For a tag listed in
# %inline_markup it opens "@command{" and appends the recursive conversion of
# each child; an unhandled tag is fatal.
# NOTE(review): the lines that assign $tag, handle plain-text arguments, close
# the brace and return the result are not visible in this listing.
sub html_to_texi ( $ )
370
{ my ($he) = check_args(1, @_);
375
if (exists $inline_markup{$tag})
376
{ my $result = "\@$inline_markup{$tag}\{";
377
for my $elt (@{$he->content})
378
{ $result .= html_to_texi($elt); }
383
die "html_to_texi confused by <$tag>"; }
388
# Debugging aid: writes a heading and then one "title depth file" line per
# entry of @contents_list to STDERR. Takes no arguments.
sub print_contents_list ()
390
print STDERR "Contents list:\n";
391
for my $ref_tdf (@contents_list)
392
{ my ($title, $depth, $file) = @{$ref_tdf};
393
print STDERR "$title $depth $file\n"; }
398
###########################################################################
402
# Anchor name that marks a broken index link: an index HREF whose fragment is
# exactly this string is filed under %file_index_entries_broken.
my $l2h_broken_link_name = "l2h-";
405
# map from file to (map from anchor name to (list of index texts))
406
# (The list is needed when a single LaTeX command like \envvar
407
# expands to multiple \index commands.)
408
my %file_index_entries = ();
409
# Copy of the %file_index_entries entry for the file being processed
# (set in process_section_file).
my %this_index_entries; # map from anchor name to (list of index texts)
411
my %file_index_entries_broken = (); # map from file to (list of index texts)
412
# Copy of the %file_index_entries_broken entry for the file being processed.
my @this_index_entries_broken;
414
# Text prepended to index entries found inside nested <DL COMPACT> lists, and
# the stack used to save and restore it around each nested list.
my $index_prefix = "";
415
my @index_prefixes = ();
417
# Indexing command in effect while process_index_file traverses an index page.
my $this_indexing_command;
419
# Debugging aid: dumps %file_index_entries and %file_index_entries_broken to
# STDERR, files and anchor names in sorted order. Takes no arguments.
sub print_index_info ()
422
for my $file (sort keys %file_index_entries)
423
{ my %index_entries = %{$file_index_entries{$file}};
424
print STDERR "file: $file\n";
425
for my $aname (sort keys %index_entries)
426
{ my @entries = @{$index_entries{$aname}};
427
# A single entry goes on one line with its anchor name.
if (scalar(@entries) == 1)
428
{ print STDERR " $aname : $entries[0]\n"; }
430
# Several entries: one per line, padded by the length of the anchor name.
{ print STDERR " $aname : ", join("\n " . (" " x length($aname)), @entries), "\n"; } } }
431
for my $file (sort keys %file_index_entries_broken)
432
{ my @entries = @{$file_index_entries_broken{$file}};
433
print STDERR "file: $file\n";
434
for my $entry (@entries)
435
{ print STDERR " $entry\n"; }
440
# Parses the index page $file (relative to $html_directory) and traverses it
# with process_if_index_dl_compact, which records its index entries.
# Arguments: $file -- file name of the index page; $indexing_command -- the
# command string prefixed to every entry recorded from this page.
# $this_indexing_command is set only for the duration of the traversal.
sub process_index_file ( $$ )
441
{ my ($file, $indexing_command) = check_args(2, @_);
442
# print "process_index_file $file $indexing_command\n";
444
my $he = file_to_tree($html_directory . $file);
447
$this_indexing_command = $indexing_command;
448
$he->traverse(\&process_if_index_dl_compact, 'ignore text');
449
undef $this_indexing_command;
450
# print "process_index_file done\n";
454
# Traversal callback: when the visited element is a <DL> that has a COMPACT
# attribute, hands it to process_index_dl_compact.
# NOTE(review): $startflag is unpacked but its use is not visible in this listing.
sub process_if_index_dl_compact ( $$$ )
455
{ my ($he, $startflag) = (check_args(3, @_))[0,1]; # ignore depth argument
459
if (($he->tag() eq "dl") && (defined $he->attr('compact')))
460
{ process_index_dl_compact($he);
467
# The elements of a <DL COMPACT> list from a LaTeX2HTML index:
468
# * a single space: text to be ignored
469
# * <DT> elements with an optional <DD> element following each one
470
# Two types of <DT> elements:
471
# * Followed by a <DD> element: the <DT> contains a single
472
# string, and the <DD> contains a whitespace string to be ignored, a
473
# <DL COMPACT> to be recursively processed (with the <DT> string as a
474
# prefix), and a whitespace string to be ignored.
475
# * Not followed by a <DD> element: contains a list of anchors
476
# and texts (ignore the texts, which are only whitespace and commas).
477
# Optionally contains a <DL COMPACT> to be recursively processed (with
478
# the <DT> string as a prefix)
479
# Processes the children of the <DL COMPACT> element $h. A <DT> that is
# immediately followed by a <DD> is handled as a pair by
# process_index_dt_and_dd; otherwise the <DT> goes to process_index_lone_dt.
# Dies with "Expected <DT> tag" (the guarding condition is partly not visible).
sub process_index_dl_compact ( $ )
480
{ my ($h) = check_args(1, @_);
481
my @content = @{$h->content()};
482
for (my $i = 0; $i < scalar(@content); $i++)
483
{ my $this_he = $content[$i];
484
if ($this_he->tag ne "dt")
486
die "Expected <DT> tag: " . $this_he->tag; }
487
# Look ahead one element to see whether this <DT> has a <DD> partner.
if (($i < scalar(@content) - 1) && ($content[$i+1]->tag eq "dd"))
488
{ process_index_dt_and_dd($this_he, $content[$i+1]);
491
{ process_index_lone_dt($this_he); } } }
495
# Argument is a <DT> element. If it contains more than one anchor, then
496
# the texts of all subsequent ones are "[Link]". Example:
498
# <A HREF="embedding.html#l2h-201">
501
# <A HREF="embedding.html#l2h-205">
503
# Optionally contains a <DL COMPACT> as well. Example:
505
# <A HREF="types.html#l2h-616">
509
# <A HREF="assignment.html#l2h-3074">
512
# <A HREF="assignment.html#l2h-3099">
515
# <A HREF="types.html#l2h-">
516
# "assignment, class"
518
# Records the index entries of a <DT> element that has no following <DD>.
# Each anchor child contributes one entry "$this_indexing_command <text>",
# filed by the anchor's HREF: under %file_index_entries_broken{file} when the
# fragment equals $l2h_broken_link_name, otherwise under
# %file_index_entries{file}{fragment}. A nested list child is processed
# recursively with the anchor text appended to $index_prefix.
# Argument: $dt -- the <DT> element.
# Dies on unexpected content (see the individual messages).
# NOTE(review): the declarations of $acontent and $acontent_suffix and several
# condition/`else` lines are not visible in this listing.
sub process_index_lone_dt ( $ )
519
{ my ($dt) = check_args(1, @_);
520
my @dtcontent = @{$dt->content()};
523
for my $a (@dtcontent)
528
die "Unexpected <DT> string element: $a"; }
531
# Nested list: extend the prefix with the anchor text seen so far, recurse,
# then restore the previous prefix.
{ push @index_prefixes, $index_prefix;
532
if (!defined $acontent_suffix)
533
{ die "acontent_suffix not yet defined"; }
534
$index_prefix .= $acontent_suffix . ", ";
535
process_index_dl_compact($a);
536
$index_prefix = pop(@index_prefixes);
542
die "Expected anchor in lone <DT>"; }
544
my ($aname, $ahref, @acontent) = anchor_info($a);
546
# The anchor must contain exactly one child, and it must be a plain string.
if (scalar(@acontent) != 1)
547
{ die "Expected just one content of <A> in <DT>: @acontent"; }
548
if (ref $acontent[0])
549
{ $acontent[0]->dump;
550
die "Expected string content of <A> in <DT>: $acontent[0]"; }
551
# The first anchor fixes the entry text; later anchors must either read
# "[Link]" or repeat the same text.
if (!defined($acontent))
552
{ $acontent = $index_prefix . $acontent[0];
553
$acontent_suffix = $acontent[0]; }
554
elsif (($acontent[0] ne "[Link]") && ($acontent ne ($index_prefix . $acontent[0])))
555
{ die "Differing content: <<<$acontent>>>, <<<$acontent[0]>>>"; }
559
die "no HREF in nachor in <DT>"; }
560
# Split "file#name" into the target file and the anchor name within it.
my ($ahref_file, $ahref_name) = split(/\#/, $ahref);
561
if (!defined $ahref_name)
562
{ # Reference to entire file
565
# Broken link: remember the entry per file, without an anchor name.
if ($ahref_name eq $l2h_broken_link_name)
566
{ if (!exists $file_index_entries_broken{$ahref_file})
567
{ $file_index_entries_broken{$ahref_file} = []; }
568
push @{$file_index_entries_broken{$ahref_file}}, "$this_indexing_command $acontent";
571
# Ordinary link: create the per-file hash and per-anchor list on demand.
if (!exists $file_index_entries{$ahref_file})
572
{ $file_index_entries{$ahref_file} = {}; }
573
# Don't do this! It appears to make a copy, which is not desired.
574
# my %index_entries = %{$file_index_entries{$ahref_file}};
575
if (!exists $ {$file_index_entries{$ahref_file}}{$ahref_name})
576
{ $ {$file_index_entries{$ahref_file}}{$ahref_name} = []; }
577
# { my $oldcontent = $ {$file_index_entries{$ahref_file}}{$ahref_name};
578
# if ($acontent eq $oldcontent)
579
# { die "Multiple identical index entries?"; }
580
# die "Trying to add $acontent, but already have index entry pointing at $ahref_file\#$ahref_name: ${$file_index_entries{$ahref_file}}{$ahref_name}"; }
582
push @{$ {$file_index_entries{$ahref_file}}{$ahref_name}}, "$this_indexing_command $acontent";
583
# print STDERR "keys: ", keys %{$file_index_entries{$ahref_file}}, "\n";
587
# Handles a <DT> that is followed by a <DD>: the <DT> must hold a single string
# and the <DD> a single <DL>. The string (trailing spaces removed) plus ", " is
# appended to $index_prefix while the nested <DL> is processed, then the
# previous prefix is restored.
# Arguments: $dt -- the <DT> element; $dd -- the <DD> element after it.
# Dies if either element does not have the expected content.
# NOTE(review): the declarations of $dtcontent and $ddcontent are not visible
# in this listing.
sub process_index_dt_and_dd ( $$ )
588
{ my ($dt, $dd) = check_args(2, @_);
590
{ my @dtcontent = @{$dt->content()};
591
if ((scalar(@dtcontent) != 1) || (ref $dtcontent[0]))
594
die "Expected single string (actual size = " . scalar(@dtcontent) . ") in content of <DT>: @dtcontent"; }
595
$dtcontent = $dtcontent[0];
596
$dtcontent =~ s/ +$//; }
598
{ my @ddcontent = @{$dd->content()};
599
if (scalar(@ddcontent) != 1)
600
{ die "Expected single <DD> content, got ", scalar(@ddcontent), " elements:\n", join("\n", @ddcontent), "\n "; }
601
$ddcontent = $ddcontent[0]; }
602
if ($ddcontent->tag ne "dl")
603
{ die "Expected <DL> as content of <DD>, but saw: $ddcontent"; }
605
# Save the prefix, extend it for the nested list, and restore it afterwards.
push @index_prefixes, $index_prefix;
606
$index_prefix .= $dtcontent . ", ";
607
process_index_dl_compact($ddcontent);
608
$index_prefix = pop(@index_prefixes);
612
###########################################################################
613
### Ordinary sections
616
# Converts one section's HTML file to Texinfo output.
# Arguments: $file -- HTML file name, used as-is when it starts with "/" and
# otherwise prefixed with $html_directory; $depth -- nesting depth, used to
# truncate @section_stack; $nodetitle -- title pushed onto @section_stack.
# Steps: parse the file, update @section_stack, traverse for child links and
# footnotes, load this file's index entries, check for <HTML><HEAD><BODY>
# structure, then traverse the body with output_body.
# Dies if the top-level structure is not as expected.
sub process_section_file ( $$$ )
617
{ my ($file, $depth, $nodetitle) = check_args(3, @_);
618
my $he = file_to_tree(($file =~ /^\//) ? $file : $html_directory . $file);
620
# print STDERR "process_section_file: $file $depth $nodetitle\n";
623
# while ($depth >= scalar(@section_stack)) { pop(@section_stack); }
624
# Keep only the first $depth titles (the ancestors of this section).
@section_stack = @section_stack[0..$depth-1];
626
# Not a great nodename fixup scheme; need a more global view
627
# Titles flagged in %contents_fixups get the first word of the parent title
# prepended.
if ((defined $contents_fixups{$nodetitle})
628
&& (scalar(@section_stack) > 0))
629
{ my $up_title = $section_stack[$#section_stack];
630
# hack for Python Standard Library
631
$up_title =~ s/^(Built-in|Standard) Module //g;
632
my ($up_first_word) = split(/ /, $up_title);
633
$nodetitle = "$up_first_word $nodetitle";
636
push @section_stack, $nodetitle;
637
# print STDERR "new section_stack: ", join(", ", @section_stack), "\n";
639
$he->traverse(\&process_if_child_links, 'ignore text');
642
$he->traverse(\&process_if_footnotes, 'ignore text');
646
# Copy this file's index entries (or reset to empty) into the per-file
# variables used while printing the body.
if (exists $file_index_entries{$file})
647
{ %this_index_entries = %{$file_index_entries{$file}};
648
# print STDERR "this_index_entries:\n ", join("\n ", keys %this_index_entries), "\n";
651
{ # print STDERR "Warning: no index entries for file $file\n";
652
%this_index_entries = (); }
654
if (exists $file_index_entries_broken{$file})
655
{ @this_index_entries_broken = @{$file_index_entries_broken{$file}}; }
657
{ # print STDERR "Warning: no index entries for file $file\n";
658
@this_index_entries_broken = (); }
661
# Structural sanity checks before printing the body.
if ($he->tag() ne "html")
662
{ die "Expected <HTML> at top level"; }
663
my @content = @{$he->content()};
664
if ((!ref $content[0]) or ($content[0]->tag ne "head"))
666
die "<HEAD> not first element of <HTML>"; }
667
if ((!ref $content[1]) or ($content[1]->tag ne "body"))
669
die "<BODY> not second element of <HTML>"; }
671
$content[1]->traverse(\&output_body);
674
# stack of things we're inside that are preventing indexing from occurring now.
675
# These are "h1", "h2", "h3", "h4", "h5", "h6", "dt" (and possibly others?)
676
my @index_deferrers = ();
678
# Maintains @index_deferrers as a stack of tag names: one branch pushes $tag,
# the other pops the top, checks that it equals $tag, and then calls
# do_deferred_index_entries.
# Arguments: $tag -- tag name; $startflag -- presumably selects push versus
# pop (the test itself is not visible in this listing -- confirm).
# Dies if the popped tag differs from $tag.
sub push_or_pop_index_deferrers ( $$ )
679
{ my ($tag, $startflag) = check_args(2, @_);
681
{ push @index_deferrers, $tag; }
683
{ my $old_deferrer = pop @index_deferrers;
684
if ($tag ne $old_deferrer)
685
{ die "Expected $tag at top of index_deferrers but saw $old_deferrer; remainder = ", join(" ", @index_deferrers); }
686
do_deferred_index_entries(); }
690
# Queues onto @deferred_index_entries the index entries that belong at the
# anchor named $label.
# Arguments: $label -- anchor name; $he -- optional anchor element, used only
# for the broken-link guess.
# If %this_index_entries has entries for $label they are queued directly. If
# $label equals $l2h_broken_link_name, candidate texts taken from the anchor
# and from text preceding it in its parent are matched as substrings against
# @this_index_entries_broken, and matching entries are queued.
sub label_add_index_entries ( $;$ )
691
{ my ($label, $he) = check_args_range(1, 2, @_);
692
# print ((exists $this_index_entries{$label}) ? "*" : " "), " label_add_index_entries $label\n";
693
# $he is the anchor element
694
if (exists $this_index_entries{$label})
695
{ push @deferred_index_entries, @{$this_index_entries{$label}};
698
if ($label eq $l2h_broken_link_name)
699
{ # Try to find some text to use in guessing which links should point here
700
# I should probably only look at the previous element, or if that is
701
# all punctuation, the one before it; collecting all the previous texts
702
# is a bit of overkill.
703
my @anchor_texts = collect_texts($he);
704
# Texts under the parent, collected up to the stop point $he.
my @previous_texts = collect_texts($he->parent, $he);
705
# 4 elements is arbitrary; ought to filter out punctuation and small words
706
# first, then perhaps keep fewer. Perhaps also filter out formatting so
707
# that we can see a larger chunk of text? (Probably not.)
708
# Also perhaps should do further chunking into words, in case the
709
# index term isn't a chunk of its own (eg, was in <tt>...</tt>).
710
# Anchor texts first, then up to four of the previous texts, nearest first.
my @candidate_texts = (@anchor_texts, (reverse(@previous_texts))[0..min(3,$#previous_texts)]);
713
for my $text (@candidate_texts)
714
{ # my $orig_text = $text;
715
# Tests for punctuation-only and very short candidates; the actions taken
# are not visible in this listing.
if ($text =~ /^[\"\`\'().?! ]*$/)
717
if (length($text) <= 2)
719
# hack for Python manual; maybe defer until failure first time around?
720
$text =~ s/^sys\.//g;
721
for my $iterm (@this_index_entries_broken)
722
{ # I could test for zero: LaTeX2HTML's failures in the Python
723
# documentation are only for items of the form "... (built-in...)"
724
if (index($iterm, $text) != -1)
725
{ push @deferred_index_entries, $iterm;
726
# print STDERR "Guessing index term `$iterm' for text `$orig_text'\n";
730
{ # print STDERR "No guess in `", join("'; `", @this_index_entries_broken), "' for texts:\n `", join("'\n `", @candidate_texts), "'\n";
736
# Need to add calls to this at various places.
737
# Perhaps add HTML::Element argument and do the check for appropriateness
738
# here (ie, no action if inside <H1>, etc.).
739
# Prints all queued index entries to TEXI, one per line, and empties the queue,
# but only when there is something queued and @index_deferrers is empty.
# Takes no arguments.
sub do_deferred_index_entries ()
741
if ((scalar(@deferred_index_entries) > 0)
742
&& (scalar(@index_deferrers) == 0))
743
{ print TEXI "\n", join("\n", @deferred_index_entries), "\n";
744
@deferred_index_entries = (); }
747
my $table_columns; # undefined if not in a table
748
my $table_first_column; # boolean
750
# Traversal callback that prints the Texinfo rendering of the document body to
# TEXI. Plain text is printed through texi_quote; elements are dispatched on
# their tag name in the long if/elsif chain below.
# Arguments: $he -- text string or element being visited; $startflag --
# presumably distinguishes entering from leaving an element (the tests on it
# are not visible in this listing -- confirm); the third (depth) argument is
# ignored.
# Dies on constructs it cannot handle (nested tables, unexpected tags while
# index entries are deferred, malformed headings).
# NOTE(review): many `if ($startflag)`, `else`, `elsif` and brace-only lines
# are not visible here, so the pairing of the branches should be confirmed.
sub output_body ( $$$ )
751
{ my ($he, $startflag) = (check_args(3, @_))[0,1]; # ignore depth argument
754
# Text: flush deferred index entries after the first space, if there is one.
{ my $space_index = index($he, " ");
755
if ($space_index != -1)
757
# print TEXI texi_quote(substr($he, 0, $space_index+1));
758
# give: Can't locate object method "TEXI" via package "texi_quote"
759
# (Because the definition texi_quote hasn't been seen yet.)
760
print TEXI &texi_quote(substr($he, 0, $space_index+1));
761
do_deferred_index_entries();
762
print TEXI &texi_quote(substr($he, $space_index+1)); }
764
{ print TEXI &texi_quote($he); }
767
my $tag = $he->tag();
769
# Ordinary text markup first
770
if (exists $inline_markup{$tag})
772
{ print TEXI "\@$inline_markup{$tag}\{"; }
774
{ print TEXI "\}"; } }
776
# Anchors: index labels, external URLs, footnotes, internal references.
{ my ($name, $href, @content) = anchor_info($he);
778
{ # This anchor is only here for indexing/cross referencing purposes.
780
{ label_add_index_entries($name, $he); }
782
elsif ($href =~ "^(ftp|http|news):")
784
{ # Should avoid second argument if it's identical to the URL.
785
print TEXI "\@uref\{$href, "; }
789
elsif ($href =~ /^\#(foot[0-9]+)$/)
792
{ # Could double-check name and content, but I'm not
793
# currently storing that information.
794
print TEXI "\@footnote\{";
795
# The footnote body comes from %footnotes, keyed by the anchor name.
$footnotes{$1}->traverse(\&output_body);
800
{ # cross-references are not active Info links, but no text is lost
801
print STDERR "Can't deal with internal HREF anchors yet:\n";
806
{ print TEXI "\@\n"; }
807
elsif ($tag eq "body")
809
elsif ($tag eq "center")
810
# A <CENTER> whose only content is a blank string is tested for separately.
{ if (has_single_content_string($he)
811
&& ($ {$he->content}[0] =~ /^ *$/))
814
{ print TEXI "\n\@center\n"; }
816
{ print TEXI "\n\@end center\n"; }
818
elsif ($tag eq "div")
819
# <DIV ALIGN="center"> is treated like <CENTER>.
{ my $align = $he->attr('align');
820
if (defined($align) && ($align eq "center"))
821
{ if (has_single_content_string($he)
822
&& ($ {$he->content}[0] =~ /^ *$/))
825
{ print TEXI "\n\@center\n"; }
827
{ print TEXI "\n\@end center\n"; } }
830
{ # Recognize "<dl><dd><pre> ... </pre></dl>" paradigm for "@example"
831
if (has_single_content_with_tag($he, "dd"))
832
{ my $he_dd = $ {$he->content}[0];
833
if (has_single_content_with_tag($he_dd, "pre"))
834
{ my $he_pre = $ {$he_dd->content}[0];
838
{ # Could examine the elements, to be cleverer about formatting.
839
# (Also to use ftable, vtable...)
840
print TEXI "\n\@table \@asis\n"; }
842
{ print TEXI "\n\@end table\n"; }
845
{ push_or_pop_index_deferrers($tag, $startflag);
847
{ print TEXI "\n\@item "; }
855
if (scalar(@index_deferrers) != 0)
857
die "Unexpected <$tag> while inside: (" . join(" ", @index_deferrers) . "); bad HTML?"; }
858
do_deferred_index_entries();
860
elsif ($tag =~ /^(font|big|small)$/)
861
{ # Do nothing for now.
863
# Headings: build the section name and labels from the heading's children,
# then print the node and sectioning commands.
elsif ($tag =~ /^h[1-6]$/)
864
{ # We don't need this because we never recursively enter the heading content.
865
# push_or_pop_index_deferrers($tag, $startflag);
868
for my $elt (@{$he->content})
870
{ $secname .= $elt; }
871
elsif ($elt->tag eq "br")
873
elsif ($elt->tag eq "a")
874
{ my ($name, $href, @acontent) = anchor_info($elt);
878
die "Nonsimple anchor in <$tag>"; }
880
{ die "No NAME for anchor in $tag"; }
881
push @seclabels, $name;
882
for my $subelt (@acontent)
883
{ $secname .= html_to_texi($subelt); } }
885
{ $secname .= html_to_texi($elt); } }
887
{ die "No section name in <$tag>"; }
888
# A stack of exactly one title is the top node, which must be named "Top".
if (scalar(@section_stack) == 1)
889
{ if ($section_stack[-1] ne "Top")
890
{ die "Not top? $section_stack[-1]"; }
891
print TEXI "\@settitle $secname\n";
892
print TEXI "\@c %**end of header\n";
894
print TEXI "\@node Top\n";
897
{ print TEXI "\n\@node $section_stack[-1]\n";
898
print TEXI "\@$sectionmarker[scalar(@section_stack)-1] ", texi_remove_punctuation($secname), "\n"; }
899
for my $seclabel (@seclabels)
900
{ label_add_index_entries($seclabel); }
901
# This should only happen once per file.
902
label_add_index_entries("");
903
if (scalar(@index_deferrers) != 0)
905
die "Unexpected <$tag> while inside: (" . join(" ", @index_deferrers) . "); bad HTML?"; }
906
do_deferred_index_entries();
911
elsif ($tag eq "ignore")
912
{ # Hack for ignored elements
917
{ print TEXI "\n\n\@item\n";
918
do_deferred_index_entries(); } }
921
{ print TEXI "\n\@enumerate \@bullet\n"; }
923
{ print TEXI "\n\@end enumerate\n"; } }
926
{ print TEXI "\n\n"; }
927
if (scalar(@index_deferrers) != 0)
929
die "Unexpected <$tag> while inside: (" . join(" ", @index_deferrers) . "); bad HTML?"; }
930
do_deferred_index_entries(); }
931
elsif ($tag eq "pre")
934
# Tables: two columns become an @table, more become an @multitable with
# equal column fractions; nested tables are rejected.
elsif ($tag eq "table")
935
{ # Could also indicate common formatting for first column, or
936
# determine relative widths for columns (or determine a prototype row)
938
{ if (defined $table_columns)
940
die "Can't deal with table nested inside $table_columns-column table"; }
941
$table_columns = table_columns($he);
942
if ($table_columns < 2)
944
die "Column with $table_columns columns?"; }
945
elsif ($table_columns == 2)
946
{ print TEXI "\n\@table \@asis\n"; }
948
{ print TEXI "\n\@multitable \@columnfractions";
949
for (my $i=0; $i<$table_columns; $i++)
950
{ print TEXI " ", 1.0/$table_columns; }
953
{ if ($table_columns == 2)
954
{ print TEXI "\n\@end table\n"; }
956
{ print TEXI "\n\@end multitable\n"; }
957
undef $table_columns; } }
958
# Cells: the first cell of a row starts an @item; later cells of wide
# tables are separated with @tab.
elsif (($tag eq "td") || ($tag eq "th"))
960
{ if ($table_first_column)
961
{ print TEXI "\n\@item ";
962
$table_first_column = 0; }
963
elsif ($table_columns > 2)
964
{ print TEXI "\n\@tab "; } }
966
{ print TEXI "\n"; } }
969
{ $table_first_column = 1; } }
972
{ print TEXI "\n\@itemize \@bullet\n"; }
974
{ print TEXI "\n\@end itemize\n"; } }
976
{ # I used to have a newline before "output_body" here.
977
print STDERR "output_body: ignoring <$tag> tag\n";
985
# NOTE(review): the `sub` header line of this function is not visible in this
# listing. The body prints the single string content of the <PRE> element
# $he_pre to TEXI, quoted by texi_quote and wrapped in "@example" and
# "@end example"; it dies unless the element has exactly one string child.
{ my ($he_pre) = check_args(1, @_);
986
if (!has_single_content_string($he_pre))
987
{ die "Multiple or non-string content for <PRE>: ", @{$he_pre->content}; }
988
my $pre_content = $ {$he_pre->content}[0];
989
print TEXI "\n\@example";
990
print TEXI &texi_quote($pre_content);
991
print TEXI "\@end example\n";
994
# Computes the column count of the <TABLE> element $table as the largest
# number of children found in any of its rows.
# Dies with "Expected <TR> as table row." for a child that is not a <TR>.
# NOTE(review): the declaration of $result and the return statement are not
# visible in this listing.
sub table_columns ( $ )
995
{ my ($table) = check_args(1, @_);
997
for my $row (@{$table->content})
998
{ if ($row->tag ne "tr")
1001
die "Expected <TR> as table row."; }
1002
$result = max($result, scalar(@{$row->content})); }
1007
###########################################################################
1012
# Returns the smaller of the two numeric arguments.
# NOTE(review): the `sub` header is not visible here; presumably this is `min`.
{ my ($x, $y) = check_args(2, @_);
1013
return ($x < $y) ? $x : $y;
1017
# Returns the larger of the two numeric arguments.
# NOTE(review): the `sub` header is not visible here; presumably this is `max`.
{ my ($x, $y) = check_args(2, @_);
1018
return ($x > $y) ? $x : $y;
1021
# Parses the HTML file $file into an HTML::TreeBuilder tree and runs
# cleanup_parse_tree over it.
# Argument: $file -- path of the HTML file to parse.
# NOTE(review): the return statement is not visible in this listing; callers
# use the result as the parsed tree.
sub file_to_tree ( $ )
1022
{ my ($file) = check_args(1, @_);
1024
my $tree = new HTML::TreeBuilder;
1025
$tree->ignore_unknown(1);
1027
$tree->parse_file($file);
1028
cleanup_parse_tree($tree);
1033
# Predicate on the element $he; judging by its name and the checks below it
# reports whether the element has exactly one child. Dies with
# "Non-reference argument" (the guarding test is not visible here).
# NOTE(review): the return statements are not visible in this listing.
sub has_single_content ( $ )
1034
{ my ($he) = check_args(1, @_);
1037
die "Non-reference argument: $he"; }
1038
my $ref_content = $he->content;
1039
# An element without a content list is tested for separately.
if (!defined $ref_content)
1041
my @content = @{$ref_content};
1042
if (scalar(@content) != 1)
1048
# Return true if the content of the element contains only one element itself,
1049
# and that inner element has the specified tag.
1050
# Arguments: $he -- element to inspect; $tag -- tag name to compare against.
# The final result is the string comparison of the only child's tag with $tag.
# NOTE(review): the early-return lines after the guards are not visible here.
sub has_single_content_with_tag ( $$ )
1051
{ my ($he, $tag) = check_args(2, @_);
1052
if (!has_single_content($he))
1054
my $content = $ {$he->content}[0];
1057
my $content_tag = $content->tag;
1058
if (!defined $content_tag)
1060
return $content_tag eq $tag;
1063
# Predicate on the element $he; callers use it to test whether the element's
# only child is a plain string.
# NOTE(review): only the single-content guard and the fetch of the first child
# are visible in this listing; the string test and returns are not.
sub has_single_content_string ( $ )
1064
{ my ($he) = check_args(1, @_);
1065
if (!has_single_content($he))
1067
my $content = $ {$he->content}[0];
1074
# Return name, href, content. First two may be undefined; third is an array.
1075
# I don't see how to determine if there are more attributes.
1076
# Argument: $he -- an <A> element.
# Returns the flat list (name attribute, href attribute, child1, child2, ...);
# the children are included only when the element has a content list.
# Dies with "passed non-anchor to anchor_info".
# NOTE(review): the declaration of @content is not visible in this listing.
sub anchor_info ( $ )
1077
{ my ($he) = check_args(1, @_);
1078
if ($he->tag ne "a")
1080
die "passed non-anchor to anchor_info"; }
1081
my $name = $he->attr('name');
1082
my $href = $he->attr('href');
1084
{ my $ref_content = $he->content;
1085
if (defined $ref_content)
1086
{ @content = @{$ref_content}; } }
1087
return ($name, $href, @content);
1091
# Escapes $text for Texinfo: every "@", "{" and "}" gets a leading "@", and
# " -- " is rewritten as " --- ".
# NOTE(review): the return statement is not visible in this listing; callers
# print the result.
sub texi_quote ( $ )
1092
{ my ($text) = check_args(1, @_);
1093
$text =~ s/([\@\{\}])/\@$1/g;
1094
$text =~ s/ -- / --- /g;
1098
# Eliminate bad punctuation (that confuses Makeinfo or Info) for section titles.
1099
# Cleans up the title $text: strips trailing spaces and colons, then strips a
# leading section number (a digit 1-9 followed by digits or dots) together
# with the spaces after it.
# NOTE(review): further lines, including the return, are not visible here.
sub texi_remove_punctuation ( $ )
1100
{ my ($text) = check_args(1, @_);
1103
$text =~ s/[ :]+$//g;
1104
$text =~ s/^[1-9][0-9.]* +//g;
1106
# Both embedded colons and " -- " confuse makeinfo. (Perhaps " -- "
1107
# gets converted into " - ", just as "---" would be converted into " -- ",
1108
# so the names end up differing.)
1109
# $text =~ s/:/ -- /g;
1115
## Do not use this inside `traverse': it throws off the traversal. Use
1116
## html_replace_by_ignore or html_replace_by_meta instead.
1117
# Returns 1 if success, 0 if failure.
1118
# Removes the element $he from its parent's content list.
# Arguments: $he -- element to remove; $parent -- optional, defaults to
# $he->parent.
# Dies with "Didn't find ..." if $he is not among the parent's children.
sub html_remove ( $;$ )
1119
{ my ($he, $parent) = check_args_range(1, 2, @_);
1120
if (!defined $parent)
1121
{ $parent = $he->parent; }
1122
my $ref_pcontent = $parent->content;
1123
# Search a copy of the list, but splice the list itself through the reference.
my @pcontent = @{$ref_pcontent};
1124
for (my $i=0; $i<scalar(@pcontent); $i++)
1125
{ if ($pcontent[$i] eq $he)
1126
{ splice @{$ref_pcontent}, $i, 1;
1129
die "Didn't find $he in $parent";
1133
# Replaces the element $orig by $new in the parent's content list, then sets
# $new's parent and clears $orig's parent.
# Arguments: $orig -- element to replace; $new -- replacement element;
# $parent -- optional, defaults to $orig->parent.
# Dies with "Didn't find ..." if $orig is not among the parent's children.
sub html_replace ( $$;$ )
1134
{ my ($orig, $new, $parent) = check_args_range(2, 3, @_);
1135
if (!defined $parent)
1136
{ $parent = $orig->parent; }
1137
my $ref_pcontent = $parent->content;
1138
# Search a copy of the list, but assign into the list itself via the reference.
my @pcontent = @{$ref_pcontent};
1139
for (my $i=0; $i<scalar(@pcontent); $i++)
1140
{ if ($pcontent[$i] eq $orig)
1141
{ $ {$ref_pcontent}[$i] = $new;
1142
$new->parent($parent);
1143
$orig->parent(undef);
1145
die "Didn't find $orig in $parent";
1148
# Replaces the element $orig by a newly created "meta" element and returns
# whatever html_replace returns.
# Arguments: $orig -- element to replace; $parent -- optional, defaults to
# $orig->parent.
sub html_replace_by_meta ( $;$ )
1149
{ my ($orig, $parent) = check_args_range(1, 2, @_);
1150
my $meta = new HTML::Element "meta";
1151
if (!defined $parent)
1152
{ $parent = $orig->parent; }
1153
return html_replace($orig, $meta, $parent);
1156
# Replaces the element $orig by a newly created "ignore" element (a tag that
# output_body has a dedicated branch for) and returns whatever html_replace
# returns.
# Arguments: $orig -- element to replace; $parent -- optional, defaults to
# $orig->parent.
sub html_replace_by_ignore ( $;$ )
1157
{ my ($orig, $parent) = check_args_range(1, 2, @_);
1158
my $ignore = new HTML::Element "ignore";
1159
if (!defined $parent)
1160
{ $parent = $orig->parent; }
1161
return html_replace($orig, $ignore, $parent);
1167
### Collect text elements
1170
# State shared between collect_texts and its traversal callback
# collect_if_text: the accumulated texts, the optional node at which to stop,
# and a flag set once that node has been reached.
my @collected_texts;
1171
my $collect_texts_stoppoint;
1172
my $done_collecting;
1174
# Returns the list of texts gathered by traversing $root with collect_if_text.
# Arguments: $root -- element to traverse; $stop -- optional node; reaching it
# sets $done_collecting.
# Not reentrant: it resets and uses the file-level variables above.
sub collect_texts ( $;$ )
1175
{ my ($root, $stop) = check_args_range(1, 2, @_);
1176
# print STDERR "collect_texts: $root $stop\n";
1177
$collect_texts_stoppoint = $stop;
1178
$done_collecting = 0;
1179
@collected_texts = ();
1180
$root->traverse(\&collect_if_text); # process texts
1181
# print STDERR "collect_texts => ", join(";;;", @collected_texts), "\n";
1182
return @collected_texts;
1185
# Traversal callback for collect_texts.  Appends text nodes (non-reference
# items) to @collected_texts until the stop point has been reached.
# Returns 0 to tell `traverse' not to descend further, 1 to continue.
# NOTE(review): the guard bodies and return statements below were restored
# from the surrounding structure; confirm against a pristine copy of the file.
sub collect_if_text ( $$$ )
{ my $he = (check_args(3, @_))[0]; # ignore depth and startflag arguments
  # Once the stop point has been seen, nothing more is collected.
  if ($done_collecting)
    { return 0; }
  if (!defined $he)
    { return 0; }
  # A plain (non-reference) item is a text node: collect it.
  if (!ref $he)
    { push @collected_texts, $he;
      return 0; }
  if ((defined $collect_texts_stoppoint) && ($he eq $collect_texts_stoppoint))
    { $done_collecting = 1;
      return 0; }
  return 1;
}
1201
###########################################################################
1202
### Clean up parse tree
1205
# Normalize the parse tree rooted at $he by running four clean-up passes over
# it, in this order: delete_if_navigation, delete_extra_spaces, merge_dl,
# reorder_dt_and_dl.  Every pass is a `traverse' callback that is invoked for
# elements only ('ignore text').
# Returns $he.
sub cleanup_parse_tree ( $ )
{ my ($he) = check_args(1, @_);
  for my $pass (\&delete_if_navigation, \&delete_extra_spaces,
                \&merge_dl, \&reorder_dt_and_dl)
    { $he->traverse($pass, 'ignore text'); }
  return $he;
}
1215
## Simpler version that deletes contents but not the element itself.
1216
# sub delete_if_navigation ( $$$ )
1217
# { my $he = (check_args(3, @_))[0]; # ignore startflag and depth
1218
# if (($he->tag() eq "div") && ($he->attr('class') eq 'navigation'))
1225
# Traversal callback (cleanup_parse_tree passes it to `traverse' with
# 'ignore text').  When $he is a <div> whose class attribute is "navigation",
# splice it out of its parent's content list.
# Arguments: the element, the start flag, and a third (depth) argument that
# is ignored.
# NOTE(review): in this copy of the file no use of $startflag, no return
# statement and no closing braces are visible for this sub, and bare numeric
# lines are interleaved with the code.  Lines appear to have been lost --
# confirm against a pristine copy.
sub delete_if_navigation ( $$$ )
1226
{ my ($he, $startflag) = (check_args(3, @_))[0,1]; # ignore depth argument
1230
# The attribute is tested for definedness first so that a <div> without a
# class attribute is not compared against 'navigation'.
if (($he->tag() eq "div") && (defined $he->attr('class')) && ($he->attr('class') eq 'navigation'))
1231
{ my $ref_pcontent = $he->parent()->content();
1232
# Don't try to modify @pcontent, which appears to be a COPY.
1233
# my @pcontent = @{$ref_pcontent};
1234
# Scan the parent's live content array for $he itself.
for (my $i = 0; $i<scalar(@{$ref_pcontent}); $i++)
1235
{ if (${$ref_pcontent}[$i] eq $he)
1236
{ splice(@{$ref_pcontent}, $i, 1);
1244
# Traversal callback (cleanup_parse_tree passes it to `traverse' with
# 'ignore text').  Strips blank text nodes from $he's content:
#  * for <head>, <html>, <table>, <tr> and <ul>, every blank child is
#    removed (delete_child_spaces);
#  * for every element, the blanks selected by delete_trailing_spaces are
#    removed.
# Arguments: the element, the start flag, and a third (depth) argument that
# is ignored.
# NOTE(review): the start-flag guard, the declaration of $tag and the final
# return were restored from the surrounding structure; confirm against a
# pristine copy of the file.
sub delete_extra_spaces ( $$$ )
{ my ($he, $startflag) = (check_args(3, @_))[0,1]; # ignore depth argument
  # Act only on the opening visit of each element.
  if (!$startflag)
    { return; }
  my $tag = $he->tag;
  if ($tag =~ /^(head|html|table|tr|ul)$/)
    { delete_child_spaces($he); }
  delete_trailing_spaces($he);
  return 1;                     # keep descending into the children
}
1257
# Remove from $he's content list every text child that is empty or consists
# only of spaces.  Child elements are left untouched.
sub delete_child_spaces ( $ )
{ my ($he) = check_args(1, @_);
  my $ref_content = $he->content();
  # An element without children may report undef content (see the same guard
  # in delete_trailing_spaces).
  if (! defined $ref_content)
    { return; }
  # Walk backwards so that removing an entry never shifts the entries that
  # are still to be examined; consecutive blank strings are all removed.
  for (my $i = $#{$ref_content}; $i >= 0; $i--)
    { my $child = $ {$ref_content}[$i];
      if (!ref($child) && ($child =~ /^ *$/))
        { splice(@{$ref_content}, $i, 1); } }
}
1266
# Remove blank text nodes from $he's content in two situations:
#  1. a blank string immediately followed by a <br>, <dd>, <dl>, <dt>, <hr>,
#     <p> or <ul> child element;
#  2. a blank string that is the last child of a <dd>, <dt>, <h1>..<h6>,
#     <li> or <p> element.
# Does nothing when $he has no content.
sub delete_trailing_spaces ( $ )
{ my ($he) = check_args(1, @_);
  my $ref_content = $he->content();
  if (! defined $ref_content)
    { return; }
  # Could also check for previous element = /^h[1-6]$/.
  for (my $i = 0; $i<scalar(@{$ref_content})-1; $i++)
    { my $elt = $ {$ref_content}[$i];
      if (!ref($elt) && ($elt =~ /^ *$/))
        { my $next_elt = $ {$ref_content}[$i+1];
          if ((ref $next_elt) && ($next_elt->tag =~ /^(br|dd|dl|dt|hr|p|ul)$/))
            { # After the splice, slot $i holds $next_elt, which is an
              # element and can never be blank text, so the index needs no
              # adjustment.
              splice(@{$ref_content}, $i, 1); } } }
  # (The original alternation read `^h[1-6]' inside the already-anchored
  # group; the inner `^' was redundant.)
  if ($he->tag =~ /^(dd|dt|h[1-6]|li|p)$/)
    { my $last_elt = $ {$ref_content}[$#{$ref_content}];
      if ((defined $last_elt) && !ref($last_elt) && ($last_elt =~ /^ *$/))
        { pop @{$ref_content}; } }
}
1285
# LaTeX2HTML sometimes creates
1287
# <DL COMPACT><DD>text
1288
# which should actually be:
1292
# Since a <DL> gets added, this ends up looking like
1302
# which should become
1312
# Traversal callback (cleanup_parse_tree passes it to `traverse' with
# 'ignore text').  Looks for this exact nesting below a <p> element $he:
#   <p>  whose first child is an implicit <dl>   (the "outer" list)
#          whose first child is a <dt>
#            whose last child is a <dl>          (the "inner" list)
#              whose first child is a <dd>
# When it is found, the inner <dl> is replaced (inside the <dt>) by an
# "ignore" element and the inner list's children are inserted into the outer
# <dl>'s content at index 1, i.e. directly after the <dt>.  The callback then
# returns 0 so that `traverse' does not descend into the rewritten subtree.
# Arguments: the element, the start flag, and a third (depth) argument that
# is ignored.
# NOTE(review): in this copy of the file no use of $startflag, no closing
# braces and no return for the non-matching case are visible, and bare
# numeric lines are interleaved with the code.  Lines appear to have been
# lost -- confirm against a pristine copy.
sub reorder_dt_and_dl ( $$$ )
1313
{ my ($he, $startflag) = (check_args(3, @_))[0,1]; # ignore depth argument
1317
if ($he->tag() eq "p")
1318
{ my $ref_pcontent = $he->content();
1319
if (defined $ref_pcontent)
1320
{ my @pcontent = @{$ref_pcontent};
1321
# print "reorder_dt_and_dl found a <p>\n"; $he->dump();
1322
# First child of the <p> must be an element, a <dl>, and implicit.
if ((scalar(@pcontent) >= 1)
1323
&& (ref $pcontent[0]) && ($pcontent[0]->tag() eq "dl")
1324
&& $pcontent[0]->implicit())
1325
{ my $ref_dlcontent = $pcontent[0]->content();
1326
# print "reorder_dt_and_dl found a <p> and implicit <dl>\n";
1327
if (defined $ref_dlcontent)
1328
{ my @dlcontent = @{$ref_dlcontent};
1329
# First child of the outer <dl> must be a <dt>.
if ((scalar(@dlcontent) >= 1)
1330
&& (ref $dlcontent[0]) && ($dlcontent[0]->tag() eq "dt"))
1331
{ my $ref_dtcontent = $dlcontent[0]->content();
1332
# print "reorder_dt_and_dl found a <p>, implicit <dl>, and <dt>\n";
1333
if (defined $ref_dtcontent)
1334
{ my @dtcontent = @{$ref_dtcontent};
1335
# Last child of the <dt> must be the inner <dl>.
if ((scalar(@dtcontent) > 0)
1336
&& (ref $dtcontent[$#dtcontent])
1337
&& ($dtcontent[$#dtcontent]->tag() eq "dl"))
1338
{ my $ref_dl2content = $dtcontent[$#dtcontent]->content();
1339
# print "reorder_dt_and_dl found a <p>, implicit <dl>, <dt>, and <dl>\n";
1340
if (defined $ref_dl2content)
1341
{ my @dl2content = @{$ref_dl2content};
1342
# First child of the inner <dl> must be a <dd>.
if ((scalar(@dl2content) > 0)
1343
&& (ref ($dl2content[0]))
1344
&& ($dl2content[0]->tag() eq "dd"))
1346
# print "reorder_dt_and_dl found a <p>, implicit <dl>, <dt>, <dl>, and <dd>\n";
1347
# print STDERR "CHANGING\n"; $he->dump();
1348
# Swap the inner <dl> for an "ignore" placeholder inside the <dt> ...
html_replace_by_ignore($dtcontent[$#dtcontent]);
1349
# ... and insert its former children into the outer <dl> after the <dt>.
# @dl2content is a copy taken above; the splice goes through the reference
# and therefore edits the outer list's real content array.
splice(@{$ref_dlcontent}, 1, 0, @dl2content);
1350
# print STDERR "CHANGED TO:\n"; $he->dump();
1351
return 0; # don't traverse children
1357
# If we find a paragraph that looks like
1361
# then accumulate its links into a contents_list and delete the paragraph.
1362
# Traversal callback.  When $he is a <p> whose content consists of exactly
# two child elements -- an <hr> followed by a <ul> -- hand it to
# process_child_links.
# Arguments: the element, the start flag, and a third (depth) argument that
# is ignored.
# NOTE(review): in this copy of the file no use of $startflag, no return
# statement and no closing braces are visible for this sub, and bare numeric
# lines are interleaved with the code.  Lines appear to have been lost --
# confirm against a pristine copy.
sub process_if_child_links ( $$$ )
1363
{ my ($he, $startflag) = (check_args(3, @_))[0,1]; # ignore depth argument
1367
if ($he->tag() eq "p")
1368
{ my $ref_content = $he->content();
1369
if (defined $ref_content)
1370
{ my @content = @{$ref_content};
1371
# `ref' is tested before `->tag' so that plain text children are never
# used as objects.
if ((scalar(@content) == 2)
1372
&& (ref $content[0]) && $content[0]->tag() eq "hr"
1373
&& (ref $content[1]) && $content[1]->tag() eq "ul")
1374
{ process_child_links($he);
1386
# <A NAME="foot560">
1388
# <A HREF="refcountsInPython.html#tex2html2" NAME="foot560">
1391
# "The metaphor of ``borrowing'' a reference is not completely correct: the owner still has a copy of the reference. "
1393
# then record the footnote information and delete the section and list.
1395
# True between seeing the <H4>Footnotes</H4> heading and the <DL> that must
# follow it.
my $process_if_footnotes_expect_dl_next = 0;

# Traversal callback that harvests a footnote section.
#  * An <h4> whose only content is the string "Footnotes" is replaced by an
#    "ignore" element, and the next element visited is then required to be
#    a <dl>.
#  * That <dl> must consist of <dt>/<dd> pairs; each <dt> must hold exactly
#    two anchors carrying the same name.  Each <dd> is replaced by an
#    "ignore" element in the tree and stored in %footnotes under the anchor
#    name; finally the <dl> itself is replaced by an "ignore" element.
# Arguments: the element, the start flag, and a third (depth) argument that
# is ignored.
# Returns 0 when the element was consumed (do not descend), 1 otherwise.
# Dies, after dumping the offending element, when the structure is not the
# expected one.
# NOTE(review): the start-flag guard, the `dump' calls and the return
# statements were restored from the surrounding structure; confirm against a
# pristine copy of the file.
sub process_if_footnotes ( $$$ )
{ my ($he, $startflag) = (check_args(3, @_))[0,1]; # ignore depth argument
  if (!$startflag)
    { return; }

  if (($he->tag() eq "h4")
      && has_single_content_string($he)
      && ($ {$he->content}[0] eq "Footnotes"))
    { html_replace_by_ignore($he);
      $process_if_footnotes_expect_dl_next = 1;
      return 0; }

  if ($process_if_footnotes_expect_dl_next && ($he->tag() eq "dl"))
    { my $ref_content = $he->content();
      if (defined $ref_content)
        { $process_if_footnotes_expect_dl_next = 0;
          my @content = @{$ref_content};
          # Children come in (<dt>, <dd>) pairs.
          for (my $i=0; $i<$#content; $i+=2)
            { my $he_dt = $content[$i];
              my $he_dd = $content[$i+1];
              if (($he_dt->tag ne "dt") || ($he_dd->tag ne "dd"))
                { $he->dump;
                  die "expected <DT> and <DD> at positions $i and ", $i+1; }
              my @dt_content = @{$he_dt->content()};
              if ((scalar(@dt_content) != 2)
                  || ($dt_content[0]->tag ne "a")
                  || ($dt_content[1]->tag ne "a"))
                { $he_dt->dump;
                  die "Expected 2 anchors as content of <DT>"; }
              # Only the anchor names are needed.  The second lookup must
              # inspect the SECOND anchor: the code used to read
              # $dt_content[0] twice, which made the comparison below
              # trivially true.
              my ($dt1_name) = anchor_info($dt_content[0]);
              my ($dt2_name) = anchor_info($dt_content[1]);
              if ($dt1_name ne $dt2_name)
                { $he_dt->dump;
                  die "Expected identical names for anchors"; }
              html_replace_by_ignore($he_dd);
              $he_dd->tag("div"); # has no effect
              $footnotes{$dt1_name} = $he_dd; }
          html_replace_by_ignore($he);
          return 0; } }

  # The heading was seen, but the element that followed was not a usable <dl>.
  if ($process_if_footnotes_expect_dl_next)
    { $he->dump;
      die "Expected <DL> for footnotes next"; }

  return 1;
}
1447
## Merge two adjacent paragraphs containing <DL> items, such as:
1461
# Traversal callback (cleanup_parse_tree passes it to `traverse' with
# 'ignore text').  Scans the children of $he for a <p> whose single content
# item is a <dl> (as judged by has_single_content_with_tag) and, for as long
# as the child right after it has that same shape, removes that following <p>
# from $he's content and appends its <dl>'s children to the first <dl>.
# Arguments: the element, the start flag, and a third (depth) argument that
# is ignored.
# NOTE(review): in this copy of the file the declaration and the increments
# of $i, the loop exits, every closing brace and any use of $startflag are
# missing, and bare numeric lines are interleaved with the code.  Lines
# appear to have been lost -- confirm against a pristine copy.
sub merge_dl ( $$$ )
1462
{ my ($he, $startflag) = (check_args(3, @_))[0,1]; # ignore depth argument
1466
my $ref_content = $he->content;
1467
if (!defined $ref_content)
1470
# Stop one short of the end: a candidate needs a successor to merge with.
while ($i < scalar(@{$ref_content})-1)
1471
{ my $p1 = $ {$ref_content}[$i];
1472
if ((ref $p1) && ($p1->tag eq "p")
1473
&& has_single_content_with_tag($p1, "dl"))
1474
{ my $dl1 = $ {$p1->content}[0];
1475
# In this loop, rhs, not lhs, of < comparison changes,
1476
# because we are removing elements from the content of $he.
1477
while ($i < scalar(@{$ref_content})-1)
1478
{ my $p2 = $ {$ref_content}[$i+1];
1479
# This condition is true when the successor is NOT a <p> holding a single <dl>.
if (!((ref $p2) && ($p2->tag eq "p")
1480
&& has_single_content_with_tag($p2, "dl")))
1482
# Merge these two elements.
1483
splice(@{$ref_content}, $i+1, 1); # remove $p2
1484
my $dl2 = $ {$p2->content}[0];
1485
$dl1->push_content(@{$dl2->content}); # put $dl2's content in $dl1
1487
# extra increment because next element isn't a candidate for $p1
1495
###########################################################################
1500
{ my ($action, $file) = check_args(2, @_);
1503
if (($action eq "view") || ($action eq ""))
1504
{ # # $file = "/homes/gws/mernst/www/links.html";
1505
# # $file = "/homes/gws/mernst/www/index.html";
1506
# # $file = "/homes/fish/mernst/java/gud/doc/manual.html";
1507
# # $file = "/projects/cecil/cecil/doc/manuals/stdlib-man/stdlib/stdlib.html";
1508
# # $file = "/homes/fish/mernst/tmp/python-doc/html/index.html";
1509
# $file = "/homes/fish/mernst/tmp/python-doc/html/api/complexObjects.html";
1510
my $tree = file_to_tree($file);
1513
# print STDERR $tree->as_HTML;
1516
# print STDERR $tree->tag(), "\n";
1517
# print STDERR @{$tree->content()}, "\n";
1519
# for (@{ $tree->extract_links(qw(a img)) }) {
1520
# my ($link, $linkelem) = @$_;
1521
# print STDERR "$link ", $linkelem->as_HTML;
1524
# print STDERR @{$tree->extract_links()}, "\n";
1526
# my @top_level_elts = @{$tree->content()};
1528
# if scalar(@{$tree->content()})
1532
elsif ($action eq "raw")
1533
{ my $tree = new HTML::TreeBuilder;
1534
$tree->ignore_unknown(1);
1536
$tree->parse_file($file);
1540
# cleanup_parse_tree($tree);
1545
# Test dealing with a section.
1546
elsif ($action eq "section")
1548
# $file = "/homes/fish/mernst/tmp/python-doc/html/api/intro.html";
1549
# $file = "/homes/fish/mernst/tmp/python-doc/html/api/includes.html";
1550
# $file = "/homes/fish/mernst/tmp/python-doc/html/api/complexObjects.html";
1551
process_section_file($file, 0, "Title");
1554
# Test dealing with many sections
1556
{ my @files = ("/homes/fish/mernst/tmp/python-doc/html/api/about.html",
1557
"/homes/fish/mernst/tmp/python-doc/html/api/abstract.html",
1558
"/homes/fish/mernst/tmp/python-doc/html/api/api.html",
1559
"/homes/fish/mernst/tmp/python-doc/html/api/cObjects.html",
1560
"/homes/fish/mernst/tmp/python-doc/html/api/complexObjects.html",
1561
"/homes/fish/mernst/tmp/python-doc/html/api/concrete.html",
1562
# "/homes/fish/mernst/tmp/python-doc/html/api/contents.html",
1563
"/homes/fish/mernst/tmp/python-doc/html/api/countingRefs.html",
1564
"/homes/fish/mernst/tmp/python-doc/html/api/debugging.html",
1565
"/homes/fish/mernst/tmp/python-doc/html/api/dictObjects.html",
1566
"/homes/fish/mernst/tmp/python-doc/html/api/embedding.html",
1567
"/homes/fish/mernst/tmp/python-doc/html/api/exceptionHandling.html",
1568
"/homes/fish/mernst/tmp/python-doc/html/api/exceptions.html",
1569
"/homes/fish/mernst/tmp/python-doc/html/api/fileObjects.html",
1570
"/homes/fish/mernst/tmp/python-doc/html/api/floatObjects.html",
1571
"/homes/fish/mernst/tmp/python-doc/html/api/front.html",
1572
"/homes/fish/mernst/tmp/python-doc/html/api/fundamental.html",
1573
# "/homes/fish/mernst/tmp/python-doc/html/api/genindex.html",
1574
"/homes/fish/mernst/tmp/python-doc/html/api/importing.html",
1575
"/homes/fish/mernst/tmp/python-doc/html/api/includes.html",
1576
"/homes/fish/mernst/tmp/python-doc/html/api/index.html",
1577
"/homes/fish/mernst/tmp/python-doc/html/api/initialization.html",
1578
"/homes/fish/mernst/tmp/python-doc/html/api/intObjects.html",
1579
"/homes/fish/mernst/tmp/python-doc/html/api/intro.html",
1580
"/homes/fish/mernst/tmp/python-doc/html/api/listObjects.html",
1581
"/homes/fish/mernst/tmp/python-doc/html/api/longObjects.html",
1582
"/homes/fish/mernst/tmp/python-doc/html/api/mapObjects.html",
1583
"/homes/fish/mernst/tmp/python-doc/html/api/mapping.html",
1584
"/homes/fish/mernst/tmp/python-doc/html/api/newTypes.html",
1585
"/homes/fish/mernst/tmp/python-doc/html/api/node24.html",
1586
"/homes/fish/mernst/tmp/python-doc/html/api/noneObject.html",
1587
"/homes/fish/mernst/tmp/python-doc/html/api/number.html",
1588
"/homes/fish/mernst/tmp/python-doc/html/api/numericObjects.html",
1589
"/homes/fish/mernst/tmp/python-doc/html/api/object.html",
1590
"/homes/fish/mernst/tmp/python-doc/html/api/objects.html",
1591
"/homes/fish/mernst/tmp/python-doc/html/api/os.html",
1592
"/homes/fish/mernst/tmp/python-doc/html/api/otherObjects.html",
1593
"/homes/fish/mernst/tmp/python-doc/html/api/processControl.html",
1594
"/homes/fish/mernst/tmp/python-doc/html/api/refcountDetails.html",
1595
"/homes/fish/mernst/tmp/python-doc/html/api/refcounts.html",
1596
"/homes/fish/mernst/tmp/python-doc/html/api/sequence.html",
1597
"/homes/fish/mernst/tmp/python-doc/html/api/sequenceObjects.html",
1598
"/homes/fish/mernst/tmp/python-doc/html/api/standardExceptions.html",
1599
"/homes/fish/mernst/tmp/python-doc/html/api/stringObjects.html",
1600
"/homes/fish/mernst/tmp/python-doc/html/api/threads.html",
1601
"/homes/fish/mernst/tmp/python-doc/html/api/tupleObjects.html",
1602
"/homes/fish/mernst/tmp/python-doc/html/api/typeObjects.html",
1603
"/homes/fish/mernst/tmp/python-doc/html/api/types.html",
1604
"/homes/fish/mernst/tmp/python-doc/html/api/utilities.html",
1605
"/homes/fish/mernst/tmp/python-doc/html/api/veryhigh.html");
1606
for my $file (@files)
1607
{ print STDERR "\n", "=" x 75, "\n", "$file:\n";
1608
process_section_file($file, 0, "Title");
1612
# Test dealing with index.
1613
elsif ($action eq "index")
1615
# $file = "/homes/fish/mernst/tmp/python-doc/html/api/genindex.html";
1617
process_index_file($file, "\@cindex");
1622
{ die "Unrecognized action `$action'"; }
1626
###########################################################################
1630
# Convert the HTML document whose main/contents page is $file into a Texinfo
# file.
#  * The output name is derived from $file: a trailing "index.html" (or just
#    ".html") and all leading directories are stripped and ".texi" is
#    appended; if nothing remains, the name of the current directory is used.
#  * $html_directory is set to the directory part of $file.
#  * The Texinfo header is written, $file is processed as the "Top" section,
#    and then every entry queued in @contents_list is processed (the list is
#    re-examined after each section).
#  * One node per entry of @index_titles is emitted, followed by @contents
#    and @bye.
# Output goes to the global filehandle TEXI.
# Dies if the output file cannot be opened or closed.
# NOTE(review): this sub was rebuilt from a copy of the file in which short
# lines had been lost; confirm against a pristine copy that no further output
# statements belong here.
sub process_contents_file ( $ )
{ my ($file) = check_args(1, @_);

  # could also use File::Basename
  my $info_file = $file;
  $info_file =~ s/(\/?index)?\.html$//;
  if ($info_file eq "")
    { chomp($info_file = `pwd`); }
  $info_file =~ s/^.*\///; # not the most efficient way to remove dirs

  $html_directory = $file;
  $html_directory =~ s/(\/|^)[^\/]+$/$1/;

  my $texi_file = "$info_file.texi";
  # Three-argument open, so that odd characters in the file name cannot be
  # taken for an open mode; failure is fatal instead of being silently
  # ignored (every `print TEXI' below would otherwise go nowhere).
  open(TEXI, '>', $texi_file)
    or die "Cannot open $texi_file for writing: $!";

  print TEXI "\\input texinfo \@c -*-texinfo-*-\n";
  print TEXI "\@c %**start of header\n";
  print TEXI "\@setfilename $info_file\n";

  # 2. Summary Description and Copyright
  # The "Summary Description and Copyright" segment describes the
  # document and contains the copyright notice and copying permissions
  # for the Info file. The segment must be enclosed between `@ifinfo'
  # and `@end ifinfo' commands so that the formatters place it only in
  # The summary description and copyright segment does not appear in the
  # This is a short example of a complete Texinfo file.
  # Copyright @copyright{} 1990 Free Software Foundation, Inc.
  # 3. Title and Copyright
  # The "Title and Copyright" segment contains the title and copyright
  # pages and copying permissions for the printed manual. The segment
  # must be enclosed between `@titlepage' and `@end titlepage'
  # commands. The title and copyright page appear only in the printed
  # The titlepage segment does not appear in the Info file.
  # @comment The title is printed in a large font.
  # @center @titlefont{Sample Title}
  # @c The following two commands start the copyright page.
  # @vskip 0pt plus 1filll
  # Copyright @copyright{} 1990 Free Software Foundation, Inc.
  # 4. `Top' Node and Master Menu
  # The "Master Menu" contains a complete menu of all the nodes in the
  # whole Info file. It appears only in the Info file, in the `Top'
  # The `Top' node contains the master menu for the Info file. Since a
  # printed manual uses a table of contents rather than a menu, the master
  # menu appears only in the Info file.
  # @node Top, First Chapter, , (dir)
  # @comment node-name, next, previous, up
  # * First Chapter:: The first chapter is the
  # only chapter in this sample.
  # * Concept Index:: This index has two entries.

  # Each queue entry is a reference to [ title, depth, file ].  Use this
  # sub's own argument for the Top entry rather than reaching back to
  # $ARGV[0].
  $current_ref_tdf = [ "Top", 0, $file ];
  process_section_file($file, 0, "Top");
  while (scalar(@contents_list))
    { $current_ref_tdf = shift @contents_list;
      process_section_file($ {$current_ref_tdf}[2], $ {$current_ref_tdf}[1], $ {$current_ref_tdf}[0]);
    }

  for my $indextitle (@index_titles)
    { print TEXI "\@node $indextitle\n";
      print TEXI "\@unnumbered $indextitle\n";
      print TEXI "\@printindex $ {$index_info{$indextitle}}[1]\n";
    }

  print TEXI "\@contents\n";
  print TEXI "\@bye\n";

  # Buffered write errors only surface when the handle is closed.
  close(TEXI)
    or die "Cannot close $texi_file: $!";
}
1727
# This needs to be last so global variable initializations are reached.
1729
# Command-line driver.
#   html2texi.pl FILE                  convert, FILE being the main/contents page
#   html2texi.pl -test [[ACTION] FILE] run test() and exit; ACTION defaults
#                                      to "" and FILE to "index.html"
if (scalar(@ARGV) == 0)
{ die "No arguments supplied to html2texi.pl"; }

if ($ARGV[0] eq "-test")
{ my @test_args = @ARGV[1..$#ARGV];
  if (scalar(@test_args) == 0)
    { test("", "index.html"); }
  elsif (scalar(@test_args) == 1)
    { test("", $test_args[0]); }
  elsif (scalar(@test_args) == 2)
    { test($test_args[0], $test_args[1]); }
  else
    { die "Too many test arguments passed to html2texi: ", join(" ", @ARGV); }
  # Test mode is finished; do not fall through to the normal conversion,
  # whose argument-count check would reject the -test command line.
  exit(0);
}

if (scalar(@ARGV) != 1)
{ die "Pass one argument, the main/contents page"; }

process_contents_file($ARGV[0]);
1750
# end of html2texi.pl