~exarkun/pyopenssl/trunk

# (progn (goto-char (point-min)) (replace-regexp "\\(@setfilename \\)\\([-a-z]*\\)$" "\\1python-\\2.info") (replace-string "@node Front Matter\n@chapter Abstract\n" "@node Abstract\n@section Abstract\n") (progn (mark-whole-buffer) (texinfo-master-menu 'update-all-nodes)) (save-buffer))

# makeinfo api.texi

# makeinfo ext.texi

# makeinfo lib.texi

# makeinfo mac.texi

# makeinfo ref.texi

# makeinfo tut.texi

###

### Structure of the code

###

# To be written...

###

### Design decisions

###

# Source and destination languages

# --------------------------------

# The goal is Info files; I create Texinfo, so I don't have to worry about

# the finer details of Info file creation. (I'm not even sure of its exact

# format.)

# Why not start from LaTeX rather than HTML?

# I could hack latex2html itself to produce Texinfo instead, or fix up

# partparse.py (which already translates LaTeX to Texinfo).

# Pros:

# * has high-level information such as index entries, original formatting

# Cons:

100

# * those programs are complicated to read and understand

101

# * those programs try to handle arbitrary LaTeX input, track catcodes,

102

# and more: I don't want to go to that effort. HTML isn't as powerful

103

# as LaTeX, so there are fewer subtleties.

104

# * the result wouldn't work for arbitrary HTML documents; it would be

105

# nice to eventually extend this program to HTML produced from Docbook,

106

# Frame, and more.

107

108

# Parsing

109

# -------

110

111

# I don't want to view the text as a linear stream; I'd rather parse the

112

# whole thing and then do pattern matching over the parsed representation (to

113

# find idioms such as indices, lists of child nodes, etc.).

114

# * Perl provides HTML::TreeBuilder, which does just what I want.

115

# * libwww-perl: http://www.linpro.no/lwp/

116

# * TreeBuilder: HTML-Tree-0.51.tar.gz

117

# * Python Parsers, Formatters, and Writers don't really provide the right

118

# interface (and the version in Grail doesn't correspond to another

119

# distributed version, so I'm confused about which to be using). I could

120

# write something in Python that creates a parse tree, but why bother?

121

122

# Other implementation language issues:

123

# * Python lacks variable declarations, reasonable scoping, and static

124

# checking tools. I've written some of the latter for myself that make

125

# my Perl programming a lot safer than my Python programming will be until

126

# I have a similar suite for that language.

127

128

129

###########################################################################

130

### To do

131

###

132

133

# Section names:

134

# Fix the problem with multiple sections in a single file (eg, Abstract in

135

# Front Matter section).

136

# Deal with cross-references, as in /homes/fish/mernst/tmp/python-doc/html/ref/types.html:310

137

# Index:

138

# Perhaps double-check that every tag mentioned in the index is found

139

# in the text.

140

# Python: email to python-docs@python.org, to get their feedback.

141

# Compare to existing lib/ Info manual

142

# Write the hooks into info-look; replace pyliblookup1-1.tar.gz.

143

# Postpass to remove extra quotation marks around typography already in

144

# a different font (to avoid double delimiters as in "`code'"); or

145

# perhaps consider using only font-based markup so that we don't get

146

# the extra *bold* and `code' markup in Info.

147

148

## Perhaps don't rely on automatic means for adding up, next, prev; I have

149

## all that info available to me already, so it's not so much trouble to

150

## add it. (Right?) But it is *so* easy to use Emacs instead...

151

152

153

###########################################################################

154

### Strictures

155

###

156

157

# man HTML::TreeBuilder

158

# man HTML::Parser

159

# man HTML::Element

160

161

# require HTML::ParserWComment;

162

require HTML::Parser;

163

require HTML::TreeBuilder;

164

require HTML::Element;

165

166

use File::Basename;

167

168

use strict;

169

# use Carp;

170

171

use checkargs;

172

173

174

###########################################################################

175

### Variables

176

###

177

178

# Node titles of the sections enclosing the one being processed, outermost
# first.  process_section_file truncates this to the new depth and then
# pushes the new node title.
my @section_stack;

# Entry for the file currently being processed; element 1 is read as its
# depth and element 2 as its file name when building error messages.
my $current_ref_tdf;

# Prefix prepended to relative HTML file names before they are parsed.
my $html_directory;

# Footnote elements keyed by anchor name ("foot" followed by digits);
# emptied at the start of every section file.
my %footnotes;

# Texinfo sectioning command for each nesting level.  Index 0 is a
# placeholder and should never be used.
my @sectionmarker = qw(manual chapter section subsection subsubsection);

# HTML inline tag => Texinfo command (without the leading "@").
my %inline_markup = (
    b      => "strong",
    code   => "code",
    i      => "emph",
    kbd    => "kbd",
    samp   => "samp",
    strong => "strong",
    tt     => "code",
    var    => "var",
);

# Index lines waiting to be printed once no deferring element is open.
my @deferred_index_entries;

# Titles of the index sections found among the child links.
my @index_titles;

# Index section title => [ Texinfo indexing command, index suffix ].
my %index_info = (
    "Index"         => [ '@blindex', "bl" ],
    "Concept Index" => [ '@cindex',  "cp" ],
    "Module Index"  => [ '@mdindex', "md" ],
);

202

203

204

###########################################################################

205

### Main/contents page

206

###

207

208

# Process first-level page on its own, or just a contents page? Well, I do

209

# want the title, author, etc., and the front matter... For now, just add

210

# that by hand at the end.

211

212

213

# data structure possibilities:

214

# * tree-like (need some kind of stack when processing (or parent pointers))

215

# * list of name and depth; remember old and new depths.

216

217

# Master table of contents.  Every element is a reference to a
# three-element list; merge_contents_lists unpacks it as
# (title, depth, file).
my @contents_list;

# Fixing up duplicate node names on the fly is unreliable: some sections
# may already have been processed (and be gone) by the time a second
# section with the same name shows up.  Building the complete contents list
# up front, with every file of interest read in, would be better and would
# also help with cross-references.
#
# %contents_hash holds every title seen so far; %contents_fixups holds the
# titles seen more than once.
my %contents_hash;
my %contents_fixups;

# Child links collected from the file being processed, awaiting a merge
# into @contents_list.
my @current_contents_list;

230

231

# Fold @current_contents_list into the master @contents_list, then empty
# @current_contents_list.  Dies if @current_contents_list is empty.
#
# The two lists are walked in parallel.  At each index:
#  * @contents_list has no entry there: the current entry is appended.
#  * title, depth and file all agree: nothing to do.
#  * title and file both differ and the master entry is shallower than the
#    current one: the current entry is inserted before the master entry.
#  * any other disagreement is fatal.
# A title is recorded in %contents_hash the first time it is added and in
# %contents_fixups when it is added again.
sub merge_contents_lists ( )
{
    check_args(0, @_);

    die "empty current_contents_list" unless scalar(@current_contents_list);

    # Remember a newly added title; a repeat marks it as needing a fixup.
    my $note_title = sub {
        my ($added_title) = @_;
        if (defined $contents_hash{$added_title}) {
            $contents_fixups{$added_title} = 1;
        }
        else {
            $contents_hash{$added_title} = 1;
        }
    };

    # @current_contents_list is not modified inside the loop, so a plain
    # index range is safe even though @contents_list grows.
    for my $i (0 .. $#current_contents_list) {
        my $incoming = $current_contents_list[$i];

        if ($i >= scalar(@contents_list)) {
            push @contents_list, $incoming;
            $note_title->($ {$incoming}[0]);
            next;
        }

        my ($title, $depth, $file) = @{$contents_list[$i]};
        my ($c_title, $c_depth, $c_file) = @{$incoming};

        my $same_title = ($title eq $c_title);
        my $same_file  = ($file eq $c_file);

        # Identical entries: the lists share this part of their prefix.
        next if $same_title && ($depth == $c_depth) && $same_file;

        if (!$same_title && ($depth < $c_depth) && !$same_file) {
            splice @contents_list, $i, 0, $incoming;
            $note_title->($c_title);
            next;
        }

        die ("while processing $ {$current_ref_tdf}[2] at depth $ {$current_ref_tdf}[1], mismatch at index $i:",
             "\n main: <<<$title>>> $depth $file",
             "\n curr: <<<$c_title>>> $c_depth $c_file");
    }

    @current_contents_list = ();
}

288

289

290

291

# Collect the child links found under $he into @current_contents_list as
# [ title, depth, href ] entries, renumber their depths, drop the
# uninteresting ones, process any index pages among them, and finally merge
# the result into @contents_list.
#
# Argument: $he, the HTML::Element containing the child-link list.
# Dies if @current_contents_list is not empty on entry, or if the depth
# implied by @section_stack disagrees with the depth in $current_ref_tdf.
#
# Maybe this function should also produce a map
# from title (or href) to sectionlevel (eg "chapter"?).
sub process_child_links ( $ )
{
    my ($he) = check_args(1, @_);

    # $he->dump();
    if (scalar(@current_contents_list) != 0) {
        die "current_contents_list nonempty: @current_contents_list";
    }
    $he->traverse(\&increment_current_contents_list, 'ignore text');

    # Normalize the depths so that they continue from the current section
    # depth; for instance, tree depths 1,3,5 become consecutive levels.
    my %depths = ();
    for my $ref_tdf (@current_contents_list) {
        $depths{$ {$ref_tdf}[1]} = 1;
    }
    # Tree depths are integers, so they must be ordered numerically; the
    # default string sort would place 11 before 9.
    my @sorted_depths = sort { $a <=> $b } keys %depths;
    my $current_depth = scalar(@section_stack)-1;
    my $current_depth_2 = $ {$current_ref_tdf}[1];
    if ($current_depth != $current_depth_2) {
        die "mismatch in current depths: $current_depth $current_depth_2; ", join(", ", @section_stack);
    }
    for (my $i=0; $i<scalar(@sorted_depths); $i++) {
        $depths{$sorted_depths[$i]} = $i + $current_depth+1;
    }
    for my $ref_tdf (@current_contents_list) {
        $ {$ref_tdf}[1] = $depths{$ {$ref_tdf}[1]};
    }

    # Eliminate uninteresting sections.  Hard-coded hack for now.
    # (The emptiness guard lets merge_contents_lists report an empty list
    # with its own message instead of failing on the dereference here.)
    if ((scalar(@current_contents_list) > 0)
        && ($ {$current_contents_list[-1]}[0] eq "About this document ...")) {
        pop @current_contents_list;
    }
    if ((scalar(@current_contents_list) > 1)
        && ($ {$current_contents_list[1]}[0] eq "Contents")) {
        # Drop the "Contents" entry by overwriting it with the first entry.
        my $ref_first_tdf = shift @current_contents_list;
        $current_contents_list[0] = $ref_first_tdf;
    }

    for (my $i=0; $i<scalar(@current_contents_list); $i++) {
        my $ref_tdf = $current_contents_list[$i];
        my $title = $ {$ref_tdf}[0];
        if (exists $index_info{$title}) {
            my $index_file = $ {$ref_tdf}[2];
            my ($indexing_command, $suffix) = @{$index_info{$title}};
            process_index_file($index_file, $indexing_command);
            print TEXI "\n\@defindex $suffix\n";
            push @index_titles, $title;
            # Index pages are not ordinary sections: remove the entry and
            # revisit this index, which now holds the following entry.
            splice @current_contents_list, $i, 1;
            $i--;
        }
        elsif ($title =~ /\bIndex$/) {
            print STDERR "Warning: \"$title\" might be an index; if so, edit \%index_info.\n";
        }
    }

    merge_contents_lists();

    # print_contents_list();
    # print_index_info();
}

344

345

346

# HTML::Element traverse callback: for every <LI> start tag, append a
# [ title, depth, href ] entry to @current_contents_list.
#
# Arguments: the element, the start flag and the tree depth.
# Returns nothing on end tags and 1 on start tags.
# Dies if the first child of an <LI> is not an <A>.
sub increment_current_contents_list ( $$$ )
{
    my ($he, $startflag, $depth) = check_args(3, @_);
    return unless $startflag;

    return 1 if $he->tag ne "li";

    my @li_content = @{$he->content};
    my $anchor = $li_content[0];
    die "first element of <LI> should be <A>" if $anchor->tag ne "a";

    # Only the HREF is needed; the anchor's NAME and content are unused.
    my (undef, $href) = anchor_info($anchor);

    my $title = texi_remove_punctuation(join("", collect_texts($anchor)));
    # The problem with these is that they are formatted differently in
    # @menu and @node!
    for ($title) {
        s/``/\"/g;
        s/''/\"/g;
        s/ -- / /g;
    }

    push @current_contents_list, [ $title, $depth, $href ];
    return 1;
}

367

368

# Simple HTML-to-Texinfo conversion, used for section titles.
#
# Argument: a plain string or an HTML::Element.
# Returns a string: plain strings come back unchanged; an element whose tag
# is a key of %inline_markup becomes "@command{...}" wrapped around its
# recursively converted content.
# Dies (after dumping the element) on any other tag.
sub html_to_texi ( $ )
{
    my ($he) = check_args(1, @_);
    return $he unless ref $he;

    my $tag = $he->tag;
    unless (exists $inline_markup{$tag}) {
        $he->dump();
        die "html_to_texi confused by <$tag>";
    }

    my $texi = "\@$inline_markup{$tag}\{";
    for my $child (@{$he->content}) {
        $texi .= html_to_texi($child);
    }
    return $texi . "\}";
}

385

386

387

388

# Debugging aid: write every entry of @contents_list to STDERR, one
# "title depth file" line per entry, under a "Contents list:" heading.
sub print_contents_list ()
{
    check_args(0, @_);

    print STDERR "Contents list:\n";
    for my $entry (@contents_list) {
        my ($title, $depth, $file) = @{$entry};
        print STDERR "$title $depth $file\n";
    }
}

395

396

397

398

###########################################################################

399

### Index

400

###

401

402

# Anchor name that marks an index link as broken; index entries whose
# target has exactly this name are kept in the "broken" tables below.
my $l2h_broken_link_name = 'l2h-';

# Map from file to (map from anchor name to (list of index texts)).
# (The list is needed when a single LaTeX command like \envvar
# expands to multiple \index commands.)
my %file_index_entries;

# The anchor-name map for the file currently being processed.
my %this_index_entries;

# Map from file to (list of index texts) for entries with a broken anchor.
my %file_index_entries_broken;

# The broken-entry list for the file currently being processed.
my @this_index_entries_broken;

# Text prepended to index entries while inside nested index lists, and the
# stack of enclosing prefixes saved around each nested list.
my $index_prefix = '';
my @index_prefixes;

# Indexing command for the index file being read; set only while
# process_index_file is running.
my $this_indexing_command;

418

419

# Debugging aid: dump %file_index_entries and then
# %file_index_entries_broken to STDERR.
#
# For each file (in sorted order) a "file: NAME" line is printed.  The
# first table then lists every anchor name with its index texts, one text
# per line, continuation lines being indented to line up under the first
# text.  The second table lists the broken entries one per line.
sub print_index_info ()
{
    check_args(0, @_);

    for my $file (sort keys %file_index_entries) {
        my %index_entries = %{$file_index_entries{$file}};
        print STDERR "file: $file\n";
        for my $aname (sort keys %index_entries) {
            my @entries = @{$index_entries{$aname}};
            # join() yields the lone text unchanged when there is only one
            # entry, so no special case is needed.
            print STDERR " $aname : ",
                join("\n " . (" " x length($aname)), @entries), "\n";
        }
    }

    for my $file (sort keys %file_index_entries_broken) {
        my @entries = @{$file_index_entries_broken{$file}};
        print STDERR "file: $file\n";
        for my $entry (@entries) {
            print STDERR " $entry\n";
        }
    }
}

438

439

440

# Parse one index page and record its entries.
#
# Arguments: $file, the index page's name relative to $html_directory, and
# $indexing_command, the command stored in front of every index text.
# $this_indexing_command holds the command for the duration of the tree
# walk and is undefined again afterwards.
sub process_index_file ( $$ )
{
    my ($file, $indexing_command) = check_args(2, @_);
    # print "process_index_file $file $indexing_command\n";

    my $tree = file_to_tree($html_directory . $file);
    # $tree->dump();

    $this_indexing_command = $indexing_command;
    $tree->traverse(\&process_if_index_dl_compact, 'ignore text');
    $this_indexing_command = undef;
    # print "process_index_file done\n";
}

452

453

454

# HTML::Element traverse callback: hand every <DL COMPACT> element to
# process_index_dl_compact.
#
# Returns nothing on end tags, 0 after processing a <DL COMPACT> and 1 for
# every other start tag.
sub process_if_index_dl_compact ( $$$ )
{
    # The third (depth) argument is checked for but not used.
    my ($he, $startflag) = (check_args(3, @_))[0,1];
    return unless $startflag;

    my $is_compact_dl = ($he->tag() eq "dl") && (defined $he->attr('compact'));
    return 1 unless $is_compact_dl;

    process_index_dl_compact($he);
    return 0;
}

465

466

467

# Process the content of one <DL COMPACT> list from a LaTeX2HTML index.
#
# The elements of such a list:
#  * a single space: text to be ignored
#  * <DT> elements with an optional <DD> element following each one
# Two types of <DT> elements:
#  * Followed by a <DD> element: the <DT> contains a single
#    string, and the <DD> contains a whitespace string to be ignored, a
#    <DL COMPACT> to be recursively processed (with the <DT> string as a
#    prefix), and a whitespace string to be ignored.
#  * Not followed by a <DD> element: contains a list of anchors
#    and texts (ignore the texts, which are only whitespace and commas).
#    Optionally contains a <DL COMPACT> to be recursively processed (with
#    the <DT> string as a prefix)
#
# Argument: the <DL> element.  Dies if an element in <DT> position is not a
# <DT>.
# NOTE(review): every content item is asked for its tag, so plain strings
# are presumably already removed from the tree -- confirm in file_to_tree.
sub process_index_dl_compact ( $ )
{
    my ($h) = check_args(1, @_);

    my @pending = @{$h->content()};
    while (scalar(@pending) > 0) {
        my $dt = shift @pending;
        if ($dt->tag ne "dt") {
            $dt->dump();
            die "Expected <DT> tag: " . $dt->tag;
        }

        # A <DD> directly after the <DT> belongs to it: consume both.
        if ((scalar(@pending) > 0) && ($pending[0]->tag eq "dd")) {
            my $dd = shift @pending;
            process_index_dt_and_dd($dt, $dd);
        }
        else {
            process_index_lone_dt($dt);
        }
    }
}

492

493

494

495

# Record the index entries of a <DT> element that has no following <DD>.
#
# Argument is a <DT> element.  If it contains more than one anchor, then
# the texts of all subsequent ones are "[Link]".  Example:
#    <DT>
#      <A HREF="embedding.html#l2h-201">
#        "$PATH"
#      ", "
#      <A HREF="embedding.html#l2h-205">
#        "[Link]"
# Optionally contains a <DL COMPACT> as well.  Example:
#    <DT>
#      <A HREF="types.html#l2h-616">
#        "attribute"
#      <DL COMPACT>
#        <DT>
#          <A HREF="assignment.html#l2h-3074">
#            "assignment"
#          ", "
#          <A HREF="assignment.html#l2h-3099">
#            "[Link]"
#        <DT>
#          <A HREF="types.html#l2h-">
#            "assignment, class"
#
# Each anchor yields the string "$this_indexing_command $entry", where
# $entry is $index_prefix followed by the text of the first anchor.  It is
# stored in %file_index_entries_broken when the anchor name is the broken
# link name, and otherwise in %file_index_entries under the target file and
# anchor name.  A nested <DL> is processed recursively with the first
# anchor's text added to the prefix, and ends the processing of this <DT>.
#
# Dies on unexpected strings or tags, on an anchor without exactly one
# string content, on anchors whose texts disagree, and on anchors without
# an HREF.
sub process_index_lone_dt ( $ )
{
    my ($dt) = check_args(1, @_);
    my @dtcontent = @{$dt->content()};
    my $acontent;          # full index text: prefix plus first anchor text
    my $acontent_suffix;   # first anchor text alone

    # The loop variable is deliberately not called $a, which is reserved
    # for sort comparisons.
    for my $item (@dtcontent) {
        if ($item eq ", ") {
            next;
        }
        if (!ref $item) {
            $dt->dump;
            die "Unexpected <DT> string element: $item";
        }

        if ($item->tag eq "dl") {
            push @index_prefixes, $index_prefix;
            if (!defined $acontent_suffix) {
                die "acontent_suffix not yet defined";
            }
            $index_prefix .= $acontent_suffix . ", ";
            process_index_dl_compact($item);
            $index_prefix = pop(@index_prefixes);
            return;
        }

        if ($item->tag ne "a") {
            $dt->dump;
            $item->dump;
            die "Expected anchor in lone <DT>";
        }

        # The anchor's NAME (first value) is not used.
        my (undef, $ahref, @acontent) = anchor_info($item);
        if (scalar(@acontent) != 1) {
            die "Expected just one content of <A> in <DT>: @acontent";
        }
        if (ref $acontent[0]) {
            $acontent[0]->dump;
            die "Expected string content of <A> in <DT>: $acontent[0]";
        }
        if (!defined($acontent)) {
            $acontent = $index_prefix . $acontent[0];
            $acontent_suffix = $acontent[0];
        }
        elsif (($acontent[0] ne "[Link]") && ($acontent ne ($index_prefix . $acontent[0]))) {
            die "Differing content: <<<$acontent>>>, <<<$acontent[0]>>>";
        }

        if (!defined $ahref) {
            $dt->dump;
            die "no HREF in anchor in <DT>";
        }
        my ($ahref_file, $ahref_name) = split(/\#/, $ahref);
        if (!defined $ahref_name) {
            # Reference to entire file
            $ahref_name = "";
        }

        if ($ahref_name eq $l2h_broken_link_name) {
            # push autovivifies the per-file list on first use.
            push @{$file_index_entries_broken{$ahref_file}}, "$this_indexing_command $acontent";
            next;
        }

        # Push straight through the nested references: copying the inner
        # hash into a local variable first would update only the copy.
        # Both levels are autovivified on first use.
        push @{$file_index_entries{$ahref_file}{$ahref_name}}, "$this_indexing_command $acontent";
        # print STDERR "keys: ", keys %{$file_index_entries{$ahref_file}}, "\n";
    }
}

586

587

# Process a <DT> together with the <DD> that follows it in an index list.
#
# The <DT> must contain exactly one plain string; with trailing spaces
# removed, it is appended to $index_prefix (followed by ", ") while the
# <DL> inside the <DD> is processed.  The previous prefix is restored
# afterwards.
#
# Arguments: the <DT> element and the <DD> element.
# Dies if the <DT> is not a single string, if the <DD> does not have
# exactly one content item, or if that item is not a <DL>.
sub process_index_dt_and_dd ( $$ )
{
    my ($dt, $dd) = check_args(2, @_);

    my @dt_items = @{$dt->content()};
    if ((scalar(@dt_items) != 1) || (ref $dt_items[0])) {
        $dd->dump;
        $dt->dump;
        die "Expected single string (actual size = " . scalar(@dt_items) . ") in content of <DT>: @dt_items";
    }
    my $heading = $dt_items[0];
    $heading =~ s/ +$//;

    my @dd_items = @{$dd->content()};
    if (scalar(@dd_items) != 1) {
        die "Expected single <DD> content, got ", scalar(@dd_items), " elements:\n", join("\n", @dd_items), "\n ";
    }
    my $sublist = $dd_items[0];
    if ($sublist->tag ne "dl") {
        die "Expected <DL> as content of <DD>, but saw: $sublist";
    }

    # Extend the prefix for the nested list, then put the old one back.
    push @index_prefixes, $index_prefix;
    $index_prefix .= $heading . ", ";
    process_index_dl_compact($sublist);
    $index_prefix = pop(@index_prefixes);
}

610

611

612

###########################################################################

613

### Ordinary sections

614

###

615

616

# Convert one section's HTML file to Texinfo.
#
# Arguments:
#   $file      - HTML file; a name starting with "/" is used as is, any
#                other name is taken relative to $html_directory
#   $depth     - nesting depth of the section
#   $nodetitle - node title for the section
#
# Steps: bring @section_stack to the section's depth and push its node
# title, collect child links and footnotes from the parsed tree, load the
# index entries recorded for $file, then emit the <BODY> via output_body.
# Dies unless the tree is <HTML> with <HEAD> and <BODY> as its first two
# children.
sub process_section_file ( $$$ )
{
    my ($file, $depth, $nodetitle) = check_args(3, @_);

    my $path = ($file =~ /^\//) ? $file : $html_directory . $file;
    my $tree = file_to_tree($path);

    # print STDERR "process_section_file: $file $depth $nodetitle\n";

    # Keep only the first $depth entries of the stack.
    @section_stack = @section_stack[0..$depth-1];

    # Not a great nodename fixup scheme; need a more global view
    my $needs_fixup = (defined $contents_fixups{$nodetitle})
        && (scalar(@section_stack) > 0);
    if ($needs_fixup) {
        # Prefix a duplicated title with the first word of its parent's
        # title to make the node name unique.
        my $up_title = $section_stack[-1];
        # hack for Python Standard Library
        $up_title =~ s/^(Built-in|Standard) Module //g;
        my ($up_first_word) = split(/ /, $up_title);
        $nodetitle = "$up_first_word $nodetitle";
    }

    push @section_stack, $nodetitle;
    # print STDERR "new section_stack: ", join(", ", @section_stack), "\n";

    $tree->traverse(\&process_if_child_links, 'ignore text');
    %footnotes = ();
    # $tree->dump;
    $tree->traverse(\&process_if_footnotes, 'ignore text');

    # $tree->dump;

    # Load this file's index entries; a file without any gets empty tables.
    %this_index_entries = (exists $file_index_entries{$file})
        ? %{$file_index_entries{$file}}
        : ();
    # print STDERR "this_index_entries:\n ", join("\n ", keys %this_index_entries), "\n";

    @this_index_entries_broken = (exists $file_index_entries_broken{$file})
        ? @{$file_index_entries_broken{$file}}
        : ();

    die "Expected <HTML> at top level" if $tree->tag() ne "html";

    my @content = @{$tree->content()};
    my ($head, $body) = @content[0, 1];
    unless ((ref $head) and ($head->tag eq "head")) {
        $tree->dump;
        die "<HEAD> not first element of <HTML>";
    }
    unless ((ref $body) and ($body->tag eq "body")) {
        $tree->dump;
        die "<BODY> not second element of <HTML>";
    }

    $body->traverse(\&output_body);
}

673

674

# Stack of the tags of the open elements that currently prevent index
# entries from being printed.
# These are "h1", "h2", "h3", "h4", "h5", "h6", "dt" (and possibly others?)
my @index_deferrers;

677

678

# Maintain @index_deferrers as a deferring element is entered or left.
#
# Arguments: $tag, the element's tag, and $startflag, true when the element
# is being entered.  On entry the tag is pushed.  On exit the top of the
# stack is popped and must equal $tag (otherwise die); any index entries
# that can now be printed are then flushed.
sub push_or_pop_index_deferrers ( $$ )
{
    my ($tag, $startflag) = check_args(2, @_);

    if (!$startflag) {
        my $closed = pop @index_deferrers;
        if ($closed ne $tag) {
            die "Expected $tag at top of index_deferrers but saw $closed; remainder = ", join(" ", @index_deferrers);
        }
        do_deferred_index_entries();
    }
    else {
        push @index_deferrers, $tag;
    }
}

688

689

690

# Queue, in @deferred_index_entries, the index entries that belong at the
# anchor named $label.
#
# Arguments:
#   $label - the anchor name
#   $he    - (optional) the anchor element; needed only for guessing the
#            entries of a broken link
#
# If %this_index_entries has entries for $label they are queued and that is
# all.  Otherwise, if $label is the broken link name, the texts of the
# anchor and up to four of the texts preceding it in its parent are matched
# against @this_index_entries_broken, and every broken entry that contains
# one of those texts is queued.
sub label_add_index_entries ( $;$ )
{
    my ($label, $he) = check_args_range(1, 2, @_);
    # print ((exists $this_index_entries{$label}) ? "*" : " "), " label_add_index_entries $label\n";

    if (exists $this_index_entries{$label}) {
        push @deferred_index_entries, @{$this_index_entries{$label}};
        return;
    }

    return if $label ne $l2h_broken_link_name;

    # Headings call this without an element; with no element there is no
    # text to guess from, and $he->parent below would be fatal.
    return if !defined $he;

    # Try to find some text to use in guessing which links should point here
    # I should probably only look at the previous element, or if that is
    # all punctuation, the one before it; collecting all the previous texts
    # is a bit of overkill.
    my @anchor_texts = collect_texts($he);
    my @previous_texts = collect_texts($he->parent, $he);
    # 4 elements is arbitrary; ought to filter out punctuation and small words
    # first, then perhaps keep fewer.  Perhaps also filter out formatting so
    # that we can see a larger chunk of text?  (Probably not.)
    # Also perhaps should do further chunking into words, in case the
    # index term isn't a chunk of its own (eg, was in <tt>...</tt>).
    my @candidate_texts = (@anchor_texts, (reverse(@previous_texts))[0..min(3,$#previous_texts)]);

    for my $text (@candidate_texts) {
        # Skip texts that are only punctuation/spaces or are very short.
        if ($text =~ /^[\"\`\'().?! ]*$/) {
            next;
        }
        if (length($text) <= 2) {
            next;
        }
        # hack for Python manual; maybe defer until failure first time around?
        $text =~ s/^sys\.//g;
        for my $iterm (@this_index_entries_broken) {
            # I could test for zero: LaTeX2HTML's failures in the Python
            # documentation are only for items of the form "... (built-in...)"
            if (index($iterm, $text) != -1) {
                push @deferred_index_entries, $iterm;
                # print STDERR "Guessing index term `$iterm' for text `$text'\n";
            }
        }
    }
    return;
}

734

735

736

# Print the queued index entries to TEXI, one per line, and empty the
# queue -- but only when there is something queued and no deferring
# element is open (@index_deferrers is empty).
#
# Need to add calls to this at various places.
# Perhaps add HTML::Element argument and do the check for appropriateness
# here (ie, no action if inside <H1>, etc.).
sub do_deferred_index_entries ()
{
    check_args(0, @_);

    my $nothing_queued = (scalar(@deferred_index_entries) == 0);
    my $still_deferred = (scalar(@index_deferrers) != 0);
    return if $nothing_queued || $still_deferred;

    print TEXI "\n", join("\n", @deferred_index_entries), "\n";
    @deferred_index_entries = ();
}

746

747

# Column count of the table being output; undefined when not in a table.
my $table_columns;

# Boolean: true while the next cell to be output is the first of its row.
my $table_first_column;

749

750

# HTML::Element traverse callback that writes the Texinfo for one node of a
# section's <BODY> to the TEXI filehandle.
#
# Arguments: the node (a plain string or an element), the start flag, and
# the depth (checked for but not used).
# Returns nothing for strings, 0 when the element has been dealt with
# completely here (or is ignored), and 1 otherwise.
# Dies on constructs it cannot handle: block elements inside a deferring
# element, nested tables, tables with fewer than two columns, headings with
# no title or with a linking anchor, and footnote links with no recorded
# footnote.
sub output_body ( $$$ )
{
    my ($he, $startflag) = (check_args(3, @_))[0,1]; # ignore depth argument

    if (!ref $he) {
        # Plain text.  Pending index entries are emitted after the first
        # space of the text.
        #
        # texi_quote is called with a leading "&" because, per the original
        # author's note, "print TEXI texi_quote(...)" failed with
        #   Can't locate object method "TEXI" via package "texi_quote"
        # as texi_quote's definition has not been seen at this point.
        my $space_index = index($he, " ");
        if ($space_index != -1) {
            print TEXI &texi_quote(substr($he, 0, $space_index+1));
            do_deferred_index_entries();
            print TEXI &texi_quote(substr($he, $space_index+1));
        }
        else {
            print TEXI &texi_quote($he);
        }
        return;
    }

    my $tag = $he->tag();

    # Ordinary text markup first
    if (exists $inline_markup{$tag}) {
        if ($startflag) {
            print TEXI "\@$inline_markup{$tag}\{";
        }
        else {
            print TEXI "\}";
        }
    }
    elsif ($tag eq "a") {
        my ($name, $href) = anchor_info($he);
        if (!$href) {
            # This anchor is only here for indexing/cross referencing purposes.
            if ($startflag) {
                label_add_index_entries($name, $he);
            }
        }
        elsif ($href =~ /^(ftp|https?|news):/) {
            # External link; "https" is accepted alongside "http".
            if ($startflag) {
                # Should avoid second argument if it's identical to the URL.
                print TEXI "\@uref\{$href, ";
            }
            else {
                print TEXI "\}";
            }
        }
        elsif ($href =~ /^\#(foot[0-9]+)$/) {
            # Footnote
            if ($startflag) {
                # Could double-check name and content, but I'm not
                # currently storing that information.
                my $footnote = $footnotes{$1};
                if (!defined $footnote) {
                    $he->dump;
                    die "No footnote text recorded for $href";
                }
                print TEXI "\@footnote\{";
                $footnote->traverse(\&output_body);
                print TEXI "\}";
                return 0;
            }
        }
        else {
            if ($startflag) {
                # cross-references are not active Info links, but no text is lost
                print STDERR "Can't deal with internal HREF anchors yet:\n";
                $he->dump;
            }
        }
    }
    elsif ($tag eq "br") {
        print TEXI "\@\n";
    }
    elsif ($tag eq "body") {
    }
    elsif ($tag eq "center") {
        # A <CENTER> holding only spaces is dropped entirely.
        if (has_single_content_string($he)
            && ($ {$he->content}[0] =~ /^ *$/)) {
            return 0;
        }
        # NOTE(review): Texinfo documents @center as a one-line command
        # with no "@end center" -- confirm makeinfo accepts this output.
        if ($startflag) {
            print TEXI "\n\@center\n";
        }
        else {
            print TEXI "\n\@end center\n";
        }
    }
    elsif ($tag eq "div") {
        # Only <DIV ALIGN="center"> produces output of its own.
        my $align = $he->attr('align');
        if (defined($align) && ($align eq "center")) {
            if (has_single_content_string($he)
                && ($ {$he->content}[0] =~ /^ *$/)) {
                return 0;
            }
            if ($startflag) {
                print TEXI "\n\@center\n";
            }
            else {
                print TEXI "\n\@end center\n";
            }
        }
    }
    elsif ($tag eq "dl") {
        # Recognize "<dl><dd><pre> ... </pre></dl>" paradigm for "@example"
        if (has_single_content_with_tag($he, "dd")) {
            my $he_dd = $ {$he->content}[0];
            if (has_single_content_with_tag($he_dd, "pre")) {
                my $he_pre = $ {$he_dd->content}[0];
                print_pre($he_pre);
                return 0;
            }
        }
        if ($startflag) {
            # Could examine the elements, to be cleverer about formatting.
            # (Also to use ftable, vtable...)
            print TEXI "\n\@table \@asis\n";
        }
        else {
            print TEXI "\n\@end table\n";
        }
    }
    elsif ($tag eq "dt") {
        # Index entries must not be printed inside the @item line.
        push_or_pop_index_deferrers($tag, $startflag);
        if ($startflag) {
            print TEXI "\n\@item ";
        }
    }
    elsif ($tag eq "dd") {
        if ($startflag) {
            print TEXI "\n";
        }
        if (scalar(@index_deferrers) != 0) {
            $he->dump;
            die "Unexpected <$tag> while inside: (" . join(" ", @index_deferrers) . "); bad HTML?";
        }
        do_deferred_index_entries();
    }
    elsif ($tag =~ /^(font|big|small)$/) {
        # Do nothing for now.
    }
    elsif ($tag =~ /^h[1-6]$/) {
        # The heading's content is consumed here and 0 is returned, so no
        # index deferrer needs to be pushed for it.
        my $secname = "";
        my @seclabels = ();
        for my $elt (@{$he->content}) {
            if (!ref $elt) {
                $secname .= $elt;
            }
            elsif ($elt->tag eq "br") {
            }
            elsif ($elt->tag eq "a") {
                my ($name, $href, @acontent) = anchor_info($elt);
                if ($href) {
                    $he->dump;
                    $elt->dump;
                    die "Nonsimple anchor in <$tag>";
                }
                if (!defined $name) {
                    die "No NAME for anchor in $tag";
                }
                push @seclabels, $name;
                for my $subelt (@acontent) {
                    $secname .= html_to_texi($subelt);
                }
            }
            else {
                $secname .= html_to_texi($elt);
            }
        }
        if ($secname eq "") {
            die "No section name in <$tag>";
        }
        if (scalar(@section_stack) == 1) {
            # Top-level heading: it becomes the document title.
            if ($section_stack[-1] ne "Top") {
                die "Not top? $section_stack[-1]";
            }
            print TEXI "\@settitle $secname\n";
            print TEXI "\@c %**end of header\n";
            print TEXI "\n";
            print TEXI "\@node Top\n";
            print TEXI "\n";
        }
        else {
            print TEXI "\n\@node $section_stack[-1]\n";
            print TEXI "\@$sectionmarker[scalar(@section_stack)-1] ", texi_remove_punctuation($secname), "\n";
        }
        for my $seclabel (@seclabels) {
            label_add_index_entries($seclabel);
        }
        # This should only happen once per file.
        label_add_index_entries("");
        if (scalar(@index_deferrers) != 0) {
            $he->dump;
            die "Unexpected <$tag> while inside: (" . join(" ", @index_deferrers) . "); bad HTML?";
        }
        do_deferred_index_entries();
        return 0;
    }
    elsif ($tag eq "hr") {
    }
    elsif ($tag eq "ignore") {
        # Hack for ignored elements
        return 0;
    }
    elsif ($tag eq "li") {
        if ($startflag) {
            print TEXI "\n\n\@item\n";
            do_deferred_index_entries();
        }
    }
    elsif ($tag eq "ol") {
        if ($startflag) {
            # @enumerate accepts only a starting number or letter as its
            # argument, so none is given ("@bullet" belongs to @itemize).
            print TEXI "\n\@enumerate\n";
        }
        else {
            print TEXI "\n\@end enumerate\n";
        }
    }
    elsif ($tag eq "p") {
        if ($startflag) {
            print TEXI "\n\n";
        }
        if (scalar(@index_deferrers) != 0) {
            $he->dump;
            die "Unexpected <$tag> while inside: (" . join(" ", @index_deferrers) . "); bad HTML?";
        }
        do_deferred_index_entries();
    }
    elsif ($tag eq "pre") {
        print_pre($he);
        return 0;
    }
    elsif ($tag eq "table") {
        # Could also indicate common formatting for first column, or
        # determine relative widths for columns (or determine a prototype row)
        if ($startflag) {
            if (defined $table_columns) {
                $he->dump;
                die "Can't deal with table nested inside $table_columns-column table";
            }
            $table_columns = table_columns($he);
            if ($table_columns < 2) {
                $he->dump;
                die "Column with $table_columns columns?";
            }
            elsif ($table_columns == 2) {
                # Two columns map onto @table: item plus description.
                print TEXI "\n\@table \@asis\n";
            }
            else {
                # Wider tables become a @multitable with equal columns.
                print TEXI "\n\@multitable \@columnfractions";
                for (my $i=0; $i<$table_columns; $i++) {
                    print TEXI " ", 1.0/$table_columns;
                }
                print TEXI "\n";
            }
        }
        else {
            if ($table_columns == 2) {
                print TEXI "\n\@end table\n";
            }
            else {
                print TEXI "\n\@end multitable\n";
            }
            undef $table_columns;
        }
    }
    elsif (($tag eq "td") || ($tag eq "th")) {
        if ($startflag) {
            if ($table_first_column) {
                print TEXI "\n\@item ";
                $table_first_column = 0;
            }
            elsif ($table_columns > 2) {
                print TEXI "\n\@tab ";
            }
        }
        else {
            print TEXI "\n";
        }
    }
    elsif ($tag eq "tr") {
        if ($startflag) {
            $table_first_column = 1;
        }
    }
    elsif ($tag eq "ul") {
        if ($startflag) {
            print TEXI "\n\@itemize \@bullet\n";
        }
        else {
            print TEXI "\n\@end itemize\n";
        }
    }
    else {
        # I used to have a newline before "output_body" here.
        print STDERR "output_body: ignoring <$tag> tag\n";
        $he->dump;
        return 0;
    }

    return 1;
}

983

984

# Write the text of a <PRE> element to TEXI as a Texinfo @example block.
# The element must contain exactly one child, and that child must be a plain
# string (as judged by has_single_content_string); anything else is fatal.
sub print_pre ( $ )
{
    my ($he_pre) = check_args(1, @_);
    if (!has_single_content_string($he_pre)) {
        # content() may be undefined for an element without children; use an
        # empty list in that case so the diagnostic itself cannot fail while
        # dereferencing it.
        my $ref_content = $he_pre->content;
        my @content = (defined $ref_content) ? @{$ref_content} : ();
        die "Multiple or non-string content for <PRE>: ", @content;
    }
    my $pre_content = $ {$he_pre->content}[0];
    # No newline is added after "@example" or before "@end example": the
    # quoted text is emitted exactly as it appears in the element.
    print TEXI "\n\@example";
    # Called with `&' because texi_quote is defined further down the file.
    print TEXI &texi_quote($pre_content);
    print TEXI "\@end example\n";
}

993

994

# Return the column count of a table, i.e. the largest number of children
# found in any one of its rows (0 if there are no rows).  Every child of the
# table must be a <TR>; otherwise both elements are dumped and we die.
sub table_columns ( $ )
{
    my ($table) = check_args(1, @_);
    my $widest = 0;
    foreach my $row (@{$table->content}) {
        unless ($row->tag eq "tr") {
            $table->dump;
            $row->dump;
            die "Expected <TR> as table row.";
        }
        my $cells_in_row = scalar(@{$row->content});
        $widest = max($widest, $cells_in_row);
    }
    return $widest;
}

1005

1006

1007

###########################################################################

1008

### Utilities

1009

###

1010

1011

# Return the numerically smaller of the two arguments (the second one when
# the first is not strictly smaller).
sub min ( $$ )
{
    my ($first, $second) = check_args(2, @_);
    if ($first < $second) {
        return $first;
    }
    return $second;
}

1015

1016

# Return the numerically larger of the two arguments (the second one when
# the first is not strictly larger).
sub max ( $$ )
{
    my ($first, $second) = check_args(2, @_);
    if ($first > $second) {
        return $first;
    }
    return $second;
}

1020

1021

# Parse the HTML file FILE with HTML::TreeBuilder (unknown tags are ignored),
# run cleanup_parse_tree over the result, and return the tree.
# Dies if the file cannot be parsed, instead of silently handing back an
# empty tree.
sub file_to_tree ( $ )
{
    my ($file) = check_args(1, @_);

    my $tree = HTML::TreeBuilder->new;
    $tree->ignore_unknown(1);
    # $tree->warn(1);
    # parse_file returns an undefined value (and sets $!) when FILE names a
    # file that cannot be opened.
    if (!defined $tree->parse_file($file)) {
        die "Cannot parse HTML file `$file': $!";
    }
    cleanup_parse_tree($tree);
    return $tree;
}

1031

1032

1033

# Return 1 if the element has exactly one child (of any kind), 0 otherwise
# (including when it has no content at all).  Passing something that is not
# a reference is a fatal error.
sub has_single_content ( $ )
{
    my ($he) = check_args(1, @_);
    die "Non-reference argument: $he" unless ref $he;

    my $children = $he->content;
    return 0 unless defined $children;
    return (scalar(@{$children}) == 1) ? 1 : 0;
}

1046

1047

1048

# Return true if the element has exactly one child, that child is itself an
# element (not a text string), and the child's tag equals TAG.
sub has_single_content_with_tag ( $$ )
{
    my ($he, $tag) = check_args(2, @_);
    return 0 unless has_single_content($he);

    my ($only_child) = @{$he->content};
    return 0 unless ref $only_child;

    my $child_tag = $only_child->tag;
    return 0 unless defined $child_tag;

    return $child_tag eq $tag;
}

1062

1063

# Return 1 if the element has exactly one child and that child is a plain
# text string rather than a nested element; return 0 otherwise.
sub has_single_content_string ( $ )
{
    my ($he) = check_args(1, @_);
    if (has_single_content($he)) {
        my $only_child = $he->content->[0];
        return (ref $only_child) ? 0 : 1;
    }
    return 0;
}

1072

1073

1074

# Take an <A> element and return the list (name, href, content...): its NAME
# attribute, its HREF attribute, and then its children flattened onto the
# end of the list.  Either attribute may be undefined, and there may be no
# children.  Other attributes are not reported.
# Dies (after dumping the element) if the argument is not an anchor.
sub anchor_info ( $ )
{
    my ($he) = check_args(1, @_);
    unless ($he->tag eq "a") {
        $he->dump;
        die "passed non-anchor to anchor_info";
    }

    my $name = $he->attr('name');
    my $href = $he->attr('href');
    my $children = $he->content;
    my @content = (defined $children) ? @{$children} : ();
    return ($name, $href, @content);
}

1089

1090

1091

# Return a copy of TEXT made safe for Texinfo: each of the characters
# `@', `{' and `}' gets an `@' in front of it, and " -- " is turned into
# " --- ".
sub texi_quote ( $ )
{
    my ($raw) = check_args(1, @_);
    (my $quoted = $raw) =~ s/([\@\{\}])/\@$1/g;
    $quoted =~ s/ -- / --- /g;
    return $quoted;
}

1097

1098

# Clean up a section title so that it does not confuse Makeinfo or Info.
# In order: strip leading spaces, strip trailing spaces and colons, strip a
# leading section number (such as "1.2 "), and delete every comma and every
# remaining colon.
sub texi_remove_punctuation ( $ )
{
    my ($title) = check_args(1, @_);

    $title =~ s/^ +//g;              # leading spaces
    $title =~ s/[ :]+$//g;           # trailing spaces and colons
    $title =~ s/^[1-9][0-9.]* +//g;  # leading section number
    $title =~ s/,//g;                # commas anywhere

    # Both embedded colons and " -- " confuse makeinfo.  (Perhaps " -- "
    # gets converted into " - ", just as "---" would be converted into
    # " -- ", so the names end up differing.)  Hence colons are dropped
    # rather than being replaced by " -- ":
    # $title =~ s/:/ -- /g;
    $title =~ s/://g;

    return $title;
}

1113

1114

1115

## Do not use this inside `traverse': it throws off the traversal.  Use
## html_replace_by_ignore or html_replace_by_meta instead.
# Remove element HE from the content list of PARENT (default: HE's own
# parent) and clear HE's parent link.
# Returns 1 on success; dies if HE is not among PARENT's children.
sub html_remove ( $;$ )
{
    my ($he, $parent) = check_args_range(1, 2, @_);
    $parent = $he->parent unless defined $parent;

    my $siblings = $parent->content;
    # Search a snapshot, but splice the real content array.
    my @snapshot = @{$siblings};
    my $position = 0;
    foreach my $sibling (@snapshot) {
        if ($sibling eq $he) {
            splice @{$siblings}, $position, 1;
            $he->parent(undef);
            return 1;
        }
        $position++;
    }
    die "Didn't find $he in $parent";
}

1131

1132

1133

# Put element NEW in the place of ORIG within the content list of PARENT
# (default: ORIG's own parent).  NEW's parent link is set to PARENT and
# ORIG's parent link is cleared.
# Returns 1 on success; dies if ORIG is not among PARENT's children.
sub html_replace ( $$;$ )
{
    my ($orig, $new, $parent) = check_args_range(2, 3, @_);
    $parent = $orig->parent unless defined $parent;

    my $siblings = $parent->content;
    # Search a snapshot, but assign into the real content array.
    my @snapshot = @{$siblings};
    my $position = 0;
    foreach my $sibling (@snapshot) {
        if ($sibling eq $orig) {
            $siblings->[$position] = $new;
            $new->parent($parent);
            $orig->parent(undef);
            return 1;
        }
        $position++;
    }
    die "Didn't find $orig in $parent";
}

1147

1148

# Replace ORIG, within PARENT (default: ORIG's own parent), by a freshly
# created <meta> element.  Returns whatever html_replace returns.
sub html_replace_by_meta ( $;$ )
{
    my ($orig, $parent) = check_args_range(1, 2, @_);
    my $placeholder = HTML::Element->new("meta");
    $parent = $orig->parent unless defined $parent;
    return html_replace($orig, $placeholder, $parent);
}

1155

1156

# Replace ORIG, within PARENT (default: ORIG's own parent), by a freshly
# created <ignore> element.  Returns whatever html_replace returns.
sub html_replace_by_ignore ( $;$ )
{
    my ($orig, $parent) = check_args_range(1, 2, @_);
    my $placeholder = HTML::Element->new("ignore");
    $parent = $orig->parent unless defined $parent;
    return html_replace($orig, $placeholder, $parent);
}

1163

1164

1165

1166

###

1167

### Collect text elements

1168

###

1169

1170

# State shared between collect_texts and its traversal callback
# collect_if_text.
my @collected_texts;           # text strings gathered so far
my $collect_texts_stoppoint;   # element at which to stop, or undef
my $done_collecting;           # set once the stop point has been reached

# Return the list of text strings found while traversing ROOT.  If STOP is
# given, collection ends when the traversal reaches that element.
sub collect_texts ( $;$ )
{
    my ($root, $stop) = check_args_range(1, 2, @_);
    # print STDERR "collect_texts: $root $stop\n";

    # Reset the shared state before each traversal.
    @collected_texts = ();
    $done_collecting = 0;
    $collect_texts_stoppoint = $stop;

    $root->traverse(\&collect_if_text); # process texts
    # print STDERR "collect_texts => ", join(";;;", @collected_texts), "\n";
    return @collected_texts;
}

1184

1185

# Traversal callback for collect_texts.  A text string is appended to
# @collected_texts; reaching the stop point sets $done_collecting.
# Returns 1 only for an element that is not the stop point, and 0 in every
# other case (text, undefined node, stop point, or already done).
sub collect_if_text ( $$$ )
{
    my ($node) = check_args(3, @_); # the other two arguments are unused
    return 0 if $done_collecting;
    return 0 unless defined $node;

    if (ref $node) {
        my $stop = $collect_texts_stoppoint;
        return 1 unless (defined $stop) && ($node eq $stop);
        $done_collecting = 1;
        return 0;
    }

    # A plain string: record it.
    push @collected_texts, $node;
    return 0;
}

1199

1200

1201

###########################################################################

1202

### Clean up parse tree

1203

###

1204

1205

# Run the cleanup passes over the parse tree, each as a separate traversal
# that skips text nodes, in this order: delete_if_navigation,
# delete_extra_spaces, merge_dl, reorder_dt_and_dl.  Returns the tree.
sub cleanup_parse_tree ( $ )
{
    my ($he) = check_args(1, @_);
    my @passes = (\&delete_if_navigation,
                  \&delete_extra_spaces,
                  \&merge_dl,
                  \&reorder_dt_and_dl);
    foreach my $pass (@passes) {
        $he->traverse($pass, 'ignore text');
    }
    return $he;
}

1213

1214

1215

## Simpler version that deletes contents but not the element itself.

1216

# sub delete_if_navigation ( $$$ )

1217

# { my $he = (check_args(3, @_))[0]; # ignore startflag and depth

1218

# if (($he->tag() eq "div") && ($he->attr('class') eq 'navigation'))

1219

# { $he->delete();

1220

# return 0; }

1221

# else

1222

# { return 1; }

1223

# }

1224

1225

# Traversal callback: when entering a <div class="navigation">, splice it
# out of its parent's content list, delete it, and return 0; for any other
# element return 1.  Nothing is done (and nothing returned) on the closing
# visit of an element.
sub delete_if_navigation ( $$$ )
{
    my ($he, $startflag) = (check_args(3, @_))[0,1]; # depth is unused
    return unless $startflag;

    return 1 unless $he->tag() eq "div";
    my $class = $he->attr('class');
    return 1 unless (defined $class) && ($class eq 'navigation');

    # Work directly on the parent's content array; a copy of it would not
    # reflect the removal.
    my $siblings = $he->parent()->content();
    my $position = 0;
    while ($position < scalar(@{$siblings})) {
        if ($siblings->[$position] eq $he) {
            splice(@{$siblings}, $position, 1);
            last;
        }
        $position++;
    }
    $he->delete();
    return 0;
}

1243

1244

# Traversal callback: on entering an element, call delete_child_spaces on it
# if it is a <head>, <html>, <table>, <tr> or <ul>, and then call
# delete_trailing_spaces on it in every case.  Returns 1; nothing is done
# (and nothing returned) on the closing visit.
sub delete_extra_spaces ( $$$ )
{
    my ($he, $startflag) = (check_args(3, @_))[0,1]; # depth is unused
    return unless $startflag;

    my $tag = $he->tag;
    delete_child_spaces($he) if $tag =~ /^(head|html|table|tr|ul)$/;
    delete_trailing_spaces($he);
    return 1;
}

1255

1256

1257

# Remove from the element's content list every child that is empty or
# consists only of spaces.  (Child elements are never removed: their string
# form does not match the pattern.)  An element with no content at all is
# left alone, matching the guard in delete_trailing_spaces; without it an
# empty <ul>, <tr>, <table>, ... would die on the array dereference.
sub delete_child_spaces ( $ )
{
    my ($he) = check_args(1, @_);
    my $ref_content = $he->content();
    if (!defined $ref_content)
      { return; }
    # Assign through the reference so the element's own array is modified.
    @{$ref_content} = grep { !/^ *$/ } @{$ref_content};
    return;
}

1265

1266

# Tidy blank text (empty or spaces only) in the element's content list:
#  * a blank child immediately followed by a <br>, <dd>, <dl>, <dt>, <hr>,
#    <p> or <ul> element is removed;
#  * if the element itself is a <dd>, <dt>, <h1>..<h6>, <li> or <p>, a blank
#    last child is removed.
# An element without content is left alone.
sub delete_trailing_spaces ( $ )
{
    my ($he) = check_args(1, @_);
    my $children = $he->content();
    return unless defined $children;

    # Could also check for previous element = /^h[1-6]$/.
    my $idx = 0;
    while ($idx < scalar(@{$children}) - 1) {
        if ($children->[$idx] =~ /^ *$/) {
            my $follower = $children->[$idx + 1];
            if ((ref $follower) && ($follower->tag =~ /^(br|dd|dl|dt|hr|p|ul)$/)) {
                splice(@{$children}, $idx, 1);
                next;   # re-examine the same index, now holding the follower
            }
        }
        $idx++;
    }

    if ($he->tag =~ /^(dd|dt|^h[1-6]|li|p)$/) {
        my $last_child = $children->[-1];
        if ((defined $last_child) && ($last_child =~ /^ *$/)) {
            pop @{$children};
        }
    }
}

1283

1284

1285

# LaTeX2HTML sometimes creates
#   <DT>text
#   <DL COMPACT><DD>text
# which should actually be:
#   <DL COMPACT>
#   <DT>text
#   <DD>text
# Since a <DL> gets added by the parser, this ends up looking like
#   <P>
#     <DL>              (implicit)
#       <DT>
#         text1...
#         <DL COMPACT>
#           <DD>
#             text2...
#           dt_or_dd1...
#           dt_or_dd2...
# which should become
#   <DL COMPACT>
#     <DT>
#       text1...
#     <DD>
#       text2...
#     dt_or_dd1...
#     dt_or_dd2...
#
# Traversal callback.  When the element is a <p> matching the shape above
# (first child an implicit <dl>, whose first child is a <dt>, whose last
# child is a <dl>, whose first child is a <dd>), the inner <dl> is replaced
# by an <ignore> element, its children are inserted into the outer <dl>
# right after the <dt>, and 0 is returned so the children are not traversed.
# In every other case 1 is returned.  Nothing is done (and nothing returned)
# on the closing visit.
sub reorder_dt_and_dl ( $$$ )
{
    my ($he, $startflag) = (check_args(3, @_))[0,1]; # depth is unused
    return unless $startflag;

    # <p> ...
    return 1 unless $he->tag() eq "p";
    my $p_children = $he->content();
    return 1 unless defined $p_children;
    return 1 unless scalar(@{$p_children}) >= 1;

    # ... whose first child is an implicit <dl> ...
    my $outer_dl = $p_children->[0];
    return 1 unless (ref $outer_dl)
                    && ($outer_dl->tag() eq "dl")
                    && $outer_dl->implicit();
    my $outer_children = $outer_dl->content();
    return 1 unless defined $outer_children;
    return 1 unless scalar(@{$outer_children}) >= 1;

    # ... whose first child is a <dt> ...
    my $dt = $outer_children->[0];
    return 1 unless (ref $dt) && ($dt->tag() eq "dt");
    my $dt_children = $dt->content();
    return 1 unless defined $dt_children;
    return 1 unless scalar(@{$dt_children}) > 0;

    # ... whose last child is a <dl> ...
    my $inner_dl = $dt_children->[-1];
    return 1 unless (ref $inner_dl) && ($inner_dl->tag() eq "dl");
    my $inner_children = $inner_dl->content();
    return 1 unless defined $inner_children;

    # ... whose first child is a <dd>.
    my @moved = @{$inner_children};
    return 1 unless (scalar(@moved) > 0)
                    && (ref $moved[0])
                    && ($moved[0]->tag() eq "dd");

    # print STDERR "CHANGING\n"; $he->dump();
    html_replace_by_ignore($inner_dl);
    splice(@{$outer_children}, 1, 0, @moved);
    # print STDERR "CHANGED TO:\n"; $he->dump();
    return 0; # don't traverse children
}

1355

1356

1357

# Traversal callback.  If the element is a <p> with exactly two children,
# an <hr> element followed by a <ul> element, hand the paragraph to
# process_child_links, delete the paragraph, and return 0.  Otherwise
# return 1.  Nothing is done (and nothing returned) on the closing visit.
sub process_if_child_links ( $$$ )
{
    my ($he, $startflag) = (check_args(3, @_))[0,1]; # depth is unused
    return unless $startflag;

    return 1 unless $he->tag() eq "p";
    my $children = $he->content();
    return 1 unless defined $children;

    my @parts = @{$children};
    return 1 unless scalar(@parts) == 2;
    my ($rule, $list) = @parts;
    return 1 unless (ref $rule) && ($rule->tag() eq "hr");
    return 1 unless (ref $list) && ($list->tag() eq "ul");

    process_child_links($he);
    $he->delete();
    return 0;
}

1379

1380

1381

# If we find
#   <H4>
#     "Footnotes"
#   <DL>
#     <DT>
#       <A NAME="foot560">
#         "...borrow"
#       <A HREF="refcountsInPython.html#tex2html2" NAME="foot560">
#         "1.2"
#     <DD>
#       "The metaphor of ``borrowing'' a reference is not completely correct: the owner still has a copy of the reference. "
#     ...
# then record the footnote information and delete the section and list.

# Set after the "Footnotes" heading has been seen; the next element visited
# must then be the <DL> holding the footnotes.
my $process_if_footnotes_expect_dl_next = 0;

# Traversal callback implementing the above.
#  * An <h4> whose only content is the string "Footnotes" is replaced by an
#    <ignore> element and the expect-<DL> flag is set; returns 0.
#  * While the flag is set, a <dl> with content is taken as the footnote
#    list: its children are read as <DT>/<DD> pairs, each <DT> must hold
#    exactly two anchors carrying the same NAME, and each <DD> is replaced
#    by an <ignore> element and stored in %footnotes under that name.  The
#    <dl> itself is then replaced by an <ignore> element; returns 0.
#  * While the flag is set, any other element is a fatal error.
#  * Otherwise returns 1.
# Nothing is done (and nothing returned) on the closing visit.
sub process_if_footnotes ( $$$ )
{
    my ($he, $startflag) = (check_args(3, @_))[0,1]; # ignore depth argument
    if (!$startflag)
      { return; }

    if (($he->tag() eq "h4")
        && has_single_content_string($he)
        && ($ {$he->content}[0] eq "Footnotes")) {
        html_replace_by_ignore($he);
        $process_if_footnotes_expect_dl_next = 1;
        return 0;
    }

    if ($process_if_footnotes_expect_dl_next && ($he->tag() eq "dl")) {
        my $ref_content = $he->content();
        if (defined $ref_content) {
            $process_if_footnotes_expect_dl_next = 0;
            my @content = @{$ref_content};
            for (my $i=0; $i<$#content; $i+=2) {
                my $he_dt = $content[$i];
                my $he_dd = $content[$i+1];
                # The ref checks make a stray text child produce this
                # diagnostic rather than a failed method call.
                if (!(ref $he_dt) || !(ref $he_dd)
                    || ($he_dt->tag ne "dt") || ($he_dd->tag ne "dd")) {
                    $he->dump;
                    die "expected <DT> and <DD> at positions $i and ", $i+1;
                }
                my $ref_dt_content = $he_dt->content();
                my @dt_content = (defined $ref_dt_content) ? @{$ref_dt_content} : ();
                if ((scalar(@dt_content) != 2)
                    || !(ref $dt_content[0]) || ($dt_content[0]->tag ne "a")
                    || !(ref $dt_content[1]) || ($dt_content[1]->tag ne "a")) {
                    $he_dt->dump;
                    die "Expected 2 anchors as content of <DT>";
                }
                # Only the names are needed; href and content are unused.
                my ($dt1_name) = anchor_info($dt_content[0]);
                # Inspect the SECOND anchor here.  Looking at the first one
                # twice would make the comparison below always succeed.
                my ($dt2_name) = anchor_info($dt_content[1]);
                if (!defined $dt1_name || !defined $dt2_name
                    || ($dt1_name ne $dt2_name)) {
                    $he_dt->dump;
                    die "Expected identical names for anchors";
                }
                html_replace_by_ignore($he_dd);
                $he_dd->tag("div"); # has no effect
                $footnotes{$dt1_name} = $he_dd;
            }
            html_replace_by_ignore($he);
            return 0;
        }
    }

    if ($process_if_footnotes_expect_dl_next) {
        $he->dump;
        die "Expected <DL> for footnotes next";
    }

    return 1;
}

1444

1445

1446

1447

## Merge adjacent paragraphs that each consist of a single <DL>, such as:
#   <P>
#     <DL>
#       <DT>
#         ...
#       <DD>
#         ...
#   <P>
#     <DL>
#       <DT>
#         ...
#       <DD>
#         ...
#
# Traversal callback.  Scans the element's children; whenever a <p> whose
# only child is a <dl> is followed by further such paragraphs, the
# followers are removed from the element and the children of their <dl>s
# are appended to the first paragraph's <dl>.  Returns 1, except that
# nothing is returned on the closing visit or when the element has no
# content.
sub merge_dl ( $$$ )
{
    my ($he, $startflag) = (check_args(3, @_))[0,1]; # depth is unused
    return unless $startflag;

    my $siblings = $he->content;
    return unless defined $siblings;

    # True for a <p> element whose single child is a <dl>.
    my $is_dl_paragraph = sub {
        my ($node) = @_;
        return (ref $node)
               && ($node->tag eq "p")
               && has_single_content_with_tag($node, "dl");
    };

    my $idx = 0;
    while ($idx < scalar(@{$siblings}) - 1) {
        my $first = $siblings->[$idx];
        if (!$is_dl_paragraph->($first)) {
            $idx++;
            next;
        }

        my $target_dl = $first->content->[0];
        # The upper bound shrinks as followers are spliced out; $idx itself
        # stays put while merging.
        while (($idx < scalar(@{$siblings}) - 1)
               && $is_dl_paragraph->($siblings->[$idx + 1])) {
            my $follower = splice(@{$siblings}, $idx + 1, 1);
            my $follower_dl = $follower->content->[0];
            $target_dl->push_content(@{$follower_dl->content});
        }

        # Step over both the merged paragraph and the element after it: the
        # latter has just been shown not to be a <dl> paragraph.
        $idx += 2;
    }
    return 1;
}

1492

1493

1494

1495

###########################################################################

1496

### Testing

1497

###

1498

1499

# Developer test driver.  ACTION selects what is exercised on FILE:
#   "view" or ""  build the cleaned-up tree with file_to_tree and dump it
#   "raw"         parse FILE with HTML::TreeBuilder and dump the tree
#                 without running the cleanup passes
#   "section"     run process_section_file on FILE
#   "index"       run process_index_file on FILE, then print_index_info
# Any other ACTION is a fatal error.
sub test ( $$ )
{
    my ($action, $file) = check_args(2, @_);

    # General testing
    if (($action eq "") || ($action eq "view")) {
        # Inputs used during development:
        # # $file = "/homes/gws/mernst/www/links.html";
        # # $file = "/homes/gws/mernst/www/index.html";
        # # $file = "/homes/fish/mernst/java/gud/doc/manual.html";
        # # $file = "/projects/cecil/cecil/doc/manuals/stdlib-man/stdlib/stdlib.html";
        # # $file = "/homes/fish/mernst/tmp/python-doc/html/index.html";
        # $file = "/homes/fish/mernst/tmp/python-doc/html/api/complexObjects.html";
        my $tree = file_to_tree($file);

        ## Testing
        # print STDERR $tree->as_HTML;
        $tree->dump();

        # Other things that were tried here:
        # print STDERR $tree->tag(), "\n";
        # print STDERR @{$tree->content()}, "\n";
        #
        # for (@{ $tree->extract_links(qw(a img)) }) {
        #   my ($link, $linkelem) = @$_;
        #   print STDERR "$link ", $linkelem->as_HTML;
        # }
        #
        # print STDERR @{$tree->extract_links()}, "\n";
        #
        # my @top_level_elts = @{$tree->content()};
        #
        # if scalar(@{$tree->content()})
        return;
    }

    elsif ($action eq "raw") {
        my $tree = HTML::TreeBuilder->new;
        $tree->ignore_unknown(1);
        # $tree->warn(1);
        $tree->parse_file($file);

        $tree->dump();

        # cleanup_parse_tree($tree);
        # $tree->dump();
        return;
    }

    # Test dealing with a section.
    elsif ($action eq "section") {
        # my $file;
        # $file = "/homes/fish/mernst/tmp/python-doc/html/api/intro.html";
        # $file = "/homes/fish/mernst/tmp/python-doc/html/api/includes.html";
        # $file = "/homes/fish/mernst/tmp/python-doc/html/api/complexObjects.html";
        process_section_file($file, 0, "Title");
    }

    # Test dealing with many sections (disabled).
    elsif (0) {
        my $api_dir = "/homes/fish/mernst/tmp/python-doc/html/api";
        # contents.html and genindex.html are deliberately left out.
        my @pages = qw(about abstract api cObjects complexObjects concrete
                       countingRefs debugging dictObjects embedding
                       exceptionHandling exceptions fileObjects floatObjects
                       front fundamental importing includes index
                       initialization intObjects intro listObjects
                       longObjects mapObjects mapping newTypes node24
                       noneObject number numericObjects object objects os
                       otherObjects processControl refcountDetails refcounts
                       sequence sequenceObjects standardExceptions
                       stringObjects threads tupleObjects typeObjects types
                       utilities veryhigh);
        for my $page (@pages) {
            my $page_file = "$api_dir/$page.html";
            print STDERR "\n", "=" x 75, "\n", "$page_file:\n";
            process_section_file($page_file, 0, "Title");
        }
    }

    # Test dealing with index.
    elsif ($action eq "index") {
        # my $file;
        # $file = "/homes/fish/mernst/tmp/python-doc/html/api/genindex.html";

        process_index_file($file, "\@cindex");
        print_index_info();
    }

    else {
        die "Unrecognized action `$action'";
    }
}

1624

1625

1626

###########################################################################

1627

### Main loop

1628

###

1629

1630

# Convert the document whose main/contents page is FILE into a Texinfo file.
#
# The output name is derived from FILE: a trailing ".html" (together with a
# preceding "index" or "/index", if present) is removed, the current
# directory from `pwd` is used if nothing is left, leading directories are
# stripped, and ".texi" is appended.  $html_directory is set to FILE with
# its last path component removed.
#
# The Texinfo header is written, FILE is processed as the "Top" section,
# every entry queued in @contents_list is processed in turn, and finally one
# node per entry of @index_titles, "@contents" and "@bye" are written.
# Dies if the output file cannot be opened or closed.
sub process_contents_file ( $ )
{
    my ($file) = check_args(1, @_);

    # could also use File::Basename
    my $info_file = $file;
    $info_file =~ s/(\/?index)?\.html$//;
    if ($info_file eq "")
      { chomp($info_file = `pwd`); }
    $info_file =~ s/^.*\///;    # not the most efficient way to remove dirs

    $html_directory = $file;
    $html_directory =~ s/(\/|^)[^\/]+$/$1/;

    my $texi_file = "$info_file.texi";
    # TEXI stays a bareword handle because the rest of the file prints to
    # it.  The three-argument form keeps the file name from being read as
    # part of the open mode.
    open(TEXI, '>', $texi_file)
      or die "Cannot open `$texi_file' for writing: $!";

    print TEXI "\\input texinfo   \@c -*-texinfo-*-\n";
    print TEXI "\@c %**start of header\n";
    print TEXI "\@setfilename $info_file\n";

    # 2. Summary Description and Copyright
    #      The "Summary Description and Copyright" segment describes the
    #      document and contains the copyright notice and copying permissions
    #      for the Info file.  The segment must be enclosed between `@ifinfo'
    #      and `@end ifinfo' commands so that the formatters place it only in
    #      the Info file.
    #
    # The summary description and copyright segment does not appear in the
    # printed document.
    #
    #      @ifinfo
    #      This is a short example of a complete Texinfo file.
    #
    #      @end ifinfo

    # 3. Title and Copyright
    #      The "Title and Copyright" segment contains the title and copyright
    #      pages and copying permissions for the printed manual.  The segment
    #      must be enclosed between `@titlepage' and `@end titlepage'
    #      commands.  The title and copyright page appear only in the printed
    #      manual.
    #
    # The titlepage segment does not appear in the Info file.
    #
    #      @titlepage
    #      @sp 10
    #      @comment The title is printed in a large font.
    #      @center @titlefont{Sample Title}
    #
    #      @c The following two commands start the copyright page.
    #      @page
    #      @vskip 0pt plus 1filll
    #
    #      @end titlepage

    # 4. `Top' Node and Master Menu
    #      The "Master Menu" contains a complete menu of all the nodes in the
    #      whole Info file.  It appears only in the Info file, in the `Top'
    #      node.
    #
    # The `Top' node contains the master menu for the Info file.  Since a
    # printed manual uses a table of contents rather than a menu, the master
    # menu appears only in the Info file.
    #
    #      @node    Top,       First Chapter, ,         (dir)
    #      @comment node-name, next,          previous, up
    #
    #      @menu
    #      * First Chapter::    The first chapter is the
    #                             only chapter in this sample.
    #      * Concept Index::    This index has two entries.
    #      @end menu

    # Each entry is [ title, depth, file ].  Use our own argument for the
    # file rather than reaching back into @ARGV.
    $current_ref_tdf = [ "Top", 0, $file ];
    process_section_file($file, 0, "Top");
    while (scalar(@contents_list))
      { $current_ref_tdf = shift @contents_list;
        my ($title, $depth, $section_file) = @{$current_ref_tdf};
        process_section_file($section_file, $depth, $title);
      }

    print TEXI "\n";
    for my $indextitle (@index_titles)
      { my $index_name = $index_info{$indextitle}[1];
        print TEXI "\@node $indextitle\n";
        print TEXI "\@unnumbered $indextitle\n";
        print TEXI "\@printindex $index_name\n";
        print TEXI "\n"; }

    print TEXI "\@contents\n";
    print TEXI "\@bye\n";
    # Buffered write errors only show up at close time.
    close(TEXI)
      or die "Cannot close `$texi_file': $!";
}

1726

1727

# This needs to be last so global variable initializations are reached.
#
# Usage:
#   html2texi.pl CONTENTS-PAGE
#   html2texi.pl -test [[ACTION] FILE]
# With -test, FILE defaults to "index.html" and ACTION to "".

if (!@ARGV) {
    die "No arguments supplied to html2texi.pl";
}

if ($ARGV[0] eq "-test") {
    my @test_args = @ARGV[1..$#ARGV];
    my $test_arg_count = scalar(@test_args);
    if ($test_arg_count > 2) {
        die "Too many test arguments passed to html2texi: ", join(" ", @ARGV);
    }
    # The file is always the last test argument; the action, if given,
    # precedes it.
    my $test_action = ($test_arg_count == 2) ? $test_args[0] : "";
    my $test_file   = ($test_arg_count == 0) ? "index.html" : $test_args[-1];
    test($test_action, $test_file);
    exit();
}

unless (scalar(@ARGV) == 1) {
    die "Pass one argument, the main/contents page";
}

process_contents_file($ARGV[0]);

1749

1750

# end of html2texi.pl

Older »