~ubuntu-branches/ubuntu/lucid/dblatex/lucid : contents of scripts/texclean at revision 1

~ubuntu-branches/ubuntu/lucid/dblatex/lucid : (revision 1)
#!/usr/bin/env perl
    eval 'exec /usr/bin/env perl -S $0 ${1+"$@"}'
        if $running_under_some_shell;

use File::Basename;
use Getopt::Std;

#
# Formattage des tableaux, pour gérer les cellules sur plusieurs lignes
# use table;
#
$row = "";
@mrows = ();
@sizes = ();
@aligs = ();
$nbcols = 0;

$force_hyphen = 1;
$istable = 0;
$tblparse = 0;

#
# Table header parsed to know the column sizes and alignments
#
sub prepare_table
{
  local($line) = $_[0];
  my $i = 0;
  my @a = ();
  
  @a = split(/(p{[^}]*})/, $line);
  foreach (@a) {
    if (/p{/) {
      ($sizes[$i] = $_) =~ s/p{(.*)}/$1/;
      $i++;
    } else {
      ($aligs[$i] = $_) =~ s/.*{(.[^}]*)}$/$1/;
    }
  }
  $nbcols = $i;
}

sub hline2cline
{
  local($row) = $_[0];
  local($ncols) = $_[1];
  my $i;
  my $first = 1;
  my $last = 1;
  my $pl = "";
  my $bord = 0;
  my $prow = "";
  my $erow = "";

  my @u;
  $row =~ s/\\hline/<hline>/;
  $row =~ s/\\tabularnewline/<tabularnewline>/;
  @u = split("<hline>", $row);
  if ($#u > 0) {
    $prow = $u[0];
    $prow .= "\\hline";
    $row = $u[1];
  }
  @u = split("<tabularnewline>", $row);
  if ($#u > 0) {
    $row = $u[0];
    $erow = "\\tabularnewline";
    $erow .= $u[1];
  }

  # dump_mrows($ncols);
  for($i = 0; $i < $ncols; $i++) {
    # print $to "mr($i)=".$mrows[$i];
    if ($mrows[$i] <= 0) {
      # on mémorise la ligne à tracer
      $bord = 1;
      $last = $i+1;
    } else {
      # on trace la ligne jusqu'à cette cellule
      if ($bord) {
        $pl = "$pl\\cline{$first-$last}";
        $bord = 0;
      }
      $first = $i+2;
    }
  }
  if ($bord && (($first != 1) || ($last != $ncols))) {
    $pl = "$pl\\cline{$first-$last}";
  }
  if ($pl ne "") {
    $prow =~ s/\\hline/$pl/;
  }
  return ($prow, $row, $erow);
}

#
# Main parsing table routine. The principle is to process each table row
# when rows are complete (a row can cover several lines).
#
sub table_parse
{
  local($line) = $_[0];
  local($to) = $_[1];
  my $i, $pos;
  my @columns = ();

  if (/%%% parse_table/) {
    # note that the following table will need to be parsed
    $tblparse = 1;
    # print "Parsing table\n";
  }
  if (/begin{longtable}/) {
    $istable = 1;
    if ($tblparse) {
      prepare_table($_);
    }
  }
  if (not($istable)) {
    print $to $line;
    return;
  }
  if (/end{longtable}/) {
    #
    # table ends, flush the current row
    #
    print $to "$row$line";
    $istable = 0;
    $tblparse = 0;
    $row = "";
    $nbcols = 0;
    return;
  }
  # if no parsing, just print the line
  if (not($tblparse)) {
    print $to $line;
    return;
  }
  
  #
  # when parsing, only process a complete row
  #
  $row = "$row$line";
  if (not(/\\tabularnewline/)) {
    return;
  }
  #
  # for the previous row, print borders (depends on pending multirow cells)
  #
  ($brow, $row, $erow) = hline2cline($row, $nbcols);
  @columns = split('&', $row);
  $nbcols = ($#columns > $nbcols) ? $#columns : $nbcols;

  #
  # for each column, for the current row:
  # - insert the empty cells due to pending mrow cells,
  # - update the pending mrow cells count,
  # - setup the new pending mrow cells,
  # - print the "multirow" cells
  #
  print $to $brow;

  for ($i=0, $pos=0; $pos<$nbcols; $pos++) {
    $c = $columns[$i];

    if ($mrows[$pos] > 0) {
      #
      # insert an empty cell, and update the pending mrow cells for this
      # column.
      #
      print $to " & ";
      $mrows[$pos] --;
    } elsif ($i > $#columns) {
      # print the missing ending cells
      print $to " & ";
    } else {
      # shift to the real position
      if ($c =~ /%<num=/) {
        $rpos = $c;
        $rpos =~ s/[^%]*%<num=([^>]*).*\n/$1/;
        $rpos -= 1;
        # skip the info
        $c =~ s/%<num=.*>%\n//;
        while ($rpos > $pos) {
          print $to " & ";
          $pos++;
          if ($mrows[$pos] > 0) {
            $mrows[$pos] --;
          }
        }
      }
      if ($c =~ /multirow/) {
        # set the pending mrows cells count
        $mrows[$pos] = $c;
        $mrows[$pos] =~ s/\n//g;
        $mrows[$pos] =~ s/.*multirow{([^}]*)}.*/$1/;
        $mrows[$pos] -= 1;
        #
        # set the column size and its alignment (which is lost by
        # multirow that aligns to left by default)
        #
        $s = $sizes[$pos];
        $a = $aligs[$pos];
        $c =~ s/\*{/{$s}{$a /;
        # macro to use instead of multirow, to be compatible with \tsize
        $c =~ s/multirow/mrow/;
      }
      # shift the multicolums
      if ($c =~ /multicolumn/) {
        $mcols = $c;
        $mcols =~ s/\n//g;
        $mcols =~ s/.*multicolumn{([^}]*)}.*/$1/g;
        $pos += ($mcols - 1);
      }
      print $to $c;
      if ($i < $#columns) {
        # print the separator
        print $to " & ";
      }
      $i ++;
    }
  }
  print $to $erow;
  $row = "";
}

$keyon = '\\\\xt';
$keyoff = '/xt';

# Known embedded commands
$do_hyphen = 0;

sub command_update
{
  local($line) = $_[0];
  my $cmd;

  if ($line =~ /%% texclean\(/) {
    ($cmd = $line) =~ s/.*texclean\(([^\)]*)\).*\n/$1/;
    if ($cmd eq "hyphenon") {
      $do_hyphen = 1;
    } elsif ($cmd eq "hyphenoff") {
      $do_hyphen = 0;
    }
  }
}

#
# Insert <key> between each letter of <line>, but prevents from
# cutting entities
#
sub line_cut
{
  local($line) = $_[0];
  local($key) = $_[1];
  my $l;
  my $nline = "";
  my @a;

  @a = split(/(&#[^;]*;)/, $line);
  foreach (@a) {
    if (/&#/) {
      # Entities are untouched
      $nline .= $_;
    } else {
      # Two passes, to cut every letter (not every two letters)
      $l = $_;
      $l =~ s/([^ \t])([^ \t\n])/$1$key$2/g;
      $l =~ s/$key([^ \t])([^ \t\n])/$key$1$key$2/g;
      $nline .= $l;
    }
  }
  return $nline;
}

sub translate
{
  local($line) = $_[0];
  my $hyphenize = 0;

  if (($istable && $force_hyphen) || $do_hyphen) {
    $hyphenize = 1;
  }

  if ($hyphenize) {
    $line = line_cut($line, "<cut>");
  }
  $line =~ s/^[\s\n]*$/ /g;
  $line =~ s/\\/\\textbackslash/g;
  $line =~ s/_/\\_/g;
  $line =~ s/{/\\{/g;
  $line =~ s/}/\\}/g;
  $line =~ s/%/\\%/g;
  $line =~ s/\^/\\\^{}/g;
  $line =~ s/&#732;/\\textasciitilde{}/g;
  $line =~ s/&#8211;/\\textendash{}/g;
  $line =~ s/&#8212;/\\textemdash{}/g;
  $line =~ s/&#8220;/{}``/g;
  $line =~ s/&#8221;/{}''/g;
  $line =~ s/&#x2DC;/\\textasciitilde{}/g;
  $line =~ s/&#x2013;/\\textendash{}/g;
  $line =~ s/&#x2014;/\\textemdash{}/g;
  $line =~ s/&#x201C;/{}``/g;
  $line =~ s/&#x201D;/{}''/g;
  $line =~ s/&/\\&/g;
  $line =~ s/#/\\#/g;
  $line =~ s/\$/\\\$/g;
  $line =~ s/\240/~/g;
  $line =~ s/\xb1/\\ensuremath{\\pm}/g;
  $line =~ s/\\textbackslash/\\textbackslash{}/g;
  $line =~ s/\327/\$\\times\$/g;
  $line =~ s/°/\\ensuremath{°}/g;
  $line =~ s/-/-{}/g;

  # force hyphenation if asked
  if ($hyphenize) {
    $line =~ s/<cut>/\\-/g;
  }
  return $line;
}

$figcount = 0;
%figdone = ();

sub eps2xxx
{
  local($in) = $_[0];
  local($out) = $_[1];
  local($format) = $_[2];

  my $action = "";
  for ($format) { 
    /pdf/ && do { $action = "epstopdf --outfile=$out $in"; last; };
    /png/ && do { $action = "convert $in $out"; last; };
  }
  return $action;
}

sub gif2xxx
{
  local($in) = $_[0];
  local($out) = $_[1];
  local($format) = $_[2];

  my $action = "convert $in $out";
  return $action;
}

sub scanformat
{
  my $f = $_[0];
  my @formats = ();
  my $ext = "";

  # Is there a suffix?
  ($file,$p,$ext) = fileparse($f, '\..*');

  # The prefered format depends on the expected output
  if ($figout eq "eps") {
    @formats = (".eps", ".fig", ".pdf", ".png", "gif", "");
  } else {
    @formats = (".pdf", ".png", ".eps", "gif", ".fig", "");
  }

  if ($ext ne "") {
    $fig = $f;
    if (not(-f $fig))  {
      $fig = "$path/$fig";
    }
  } else {
    print "Fig format scanning... ";
    # Look for the missing format
    LOOKUP: {
      foreach $e (@formats) {
        $ext = $e;
        if (-f "$f$ext") {
          $fig = "$f$ext";
          last LOOKUP;
        } elsif (-f "$path/$f$ext") {
          $fig = "$path/$f$ext";
          last LOOKUP;
        }
      }
    }
    print "found $ext\n";
  }
  $ext =~ s/\.//;
  return ($fig, $ext);
}

sub figconvert
{
  local($line) = $_[0];
  my $dofig = "";
  my $figcmd = "";

  # Is there a graphic included here
  if (/\\includegraphics[\[{]/) {
    $figcmd = "\\includegraphics" ;
  } elsif (/\\begin{overpic}/) {
    $figcmd = "\\begin{overpic}" ;
  } elsif (/\\imgexists{/) {
    $figcmd = "\\imgexists" ;
  } else {
    return $line;
  }

  ($f = $line) =~ s/.*$figcmd[^{]*{([^}]*)}.*\n/$1/;

  # Get the full filename and the suffix
  ($fig, $ext) = scanformat($f);

  # If no suffix, use the default one
  if ($ext eq "") {
    print "Use default figure format\n";
    $ext = $figin;
  }

  # Check if this figure has been already converted
  if (exists $figdone{"$fig"}) {
    $newfig = $figdone{"$fig"};
    print "already done with $newfig\n";
    $line =~ s/$f/$newfig/g;

  # Convert the figure
  } elsif (-f $fig) {
    $fig2dev = $ext."2".$figout;
    $newfig = "fig".$figcount++.".$figout";
    for ($fig2dev) {
      /fig2eps/ && do{
        $dofig = "fig2dev -L $figout $fig > $newfig"; last;
      };
      (/fig2pdf/ || /fig2png/) && do{
        $dofig = "fig2dev -L eps $fig > tmp_fig.eps";
        $dofig .= "; ".eps2xxx("tmp_fig.eps", $newfig, $figout);
        last;
      };
      /eps2pdf/ && do{
        $dofig = eps2xxx($fig, $newfig, $figout); last;
      };
      /gif2/ && do{
        $dofig = gif2xxx($fig, $newfig, $figout); last;
      };
    }
    if ($dofig ne "") {
      $figdone{"$fig"} = $newfig;
      print "$dofig\n";
      system("$dofig");
      $line =~ s/$f/$newfig/g;
    }
  }
  return $line
}

sub parse_inlined
{
  local($line) = $_[0];
  local($lmode) = $_[1];
  my @texts = split("$keyon ", $line);
  my $mode = 0;
  my $nline = "";
  my $l;

  for ($i = 0; $i <= $#texts; $i++) {
    $l = $texts[$i];
    @blks = split("$keyoff ", $l);

    # Special case of the last /xt to remove, that is not followed by a space
    if ($#blks == 0) { @blks = split($keyoff, $l); }
    
    if ($i > 0) { $mode = 1; } else { $mode = $lmode }

    # The first part to convert
    if ($mode == 1) {
      $blks[0] = translate($blks[0]);
    }
    # The second part to convert if global mode set
    if ($#blks > 0 and $lmode == 1) {
      $blks[1] = translate($blks[1]);
    }
    $nline = "$nline$blks[0]";
    if ($#blks > 0) {
      $mode = 0;
      $nline = "$nline$blks[1]";
    }
  }
  return $nline;
}

sub parse_sgml
{
  local($rawtex) = $_[0];
  local($cleantex) = $_[1];
  my $mode = 0;
  my $next_mode = 0;
  my $line = "";
  my $file = "";
  my $RTEX = "f$rawtex";
  my $CTEX = "f$cleantex";

  print "$rawtex -> $cleantex\n";

  if (-f $cleantex) {
#    print "***Warning: $cleantex already exists\n";
    system("mv $cleantex $cleantex~");
  }

  open($RTEX, "<$rawtex") || die "Cannot open $rawtex\n";
  open($CTEX, ">$cleantex") || die "Cannot open $cleantex\n";

  while (<$RTEX>) {
    $line = $_;

    # Convert the figures if needed
    $line = figconvert($line) if ($figout ne "");

    # Update the embedded commands
    command_update($line);

    # Update the translation mode for the next line
    if ($line =~ /$keyon\n/) {
      $line =~ s/$keyon\n//;
      $next_mode = 1;
      chomp $line;
    } elsif ($line =~ /$keyoff\n/) {
      $line =~ s/$keyoff\n/\n/;
      $next_mode = 0;
      chomp $line;
    } else {
      $next_mode = $mode;
    }

    # Translate the current line if necessary
    if ($line =~ /$keyon /) {
      # More tricky, 'xt' is in the line
      $line = parse_inlined($line, $mode);
    } elsif ($mode == 1) {
      # Translate the entire line
      $line = translate($line);
    }
    $mode = $next_mode;

    table_parse($line, $CTEX);
  }
  close($RTEX);
  close($CTEX);
}

$backend = "dvips";

getopts("f:p:b:");

if ($opt_b) {
  $backend = $opt_b;
}
# Default input figure format
if ($opt_f) {
  $figin = $opt_f;
}
if ($opt_p) {
  $path = $opt_p;
}

# The expected output figure format depends on the backend driver
for ($backend) {
  /dvips/ && do{ $figout = "eps"; last; };
  /pdftex/ && do{ $figout = "pdf"; last; };
  /none/ && do{ $figout = ""; last };
}

$rawtex = $ARGV[0];
$cleantex = basename($rawtex, '.tex');
$cleantex = dirname($rawtex). "/${cleantex}_c.tex";
shift;

if (@ARGV) {
  $cleantex = $ARGV[0];
}

parse_sgml($rawtex, $cleantex, 0);