~jtv/corpusfiltergraph/cross-python

« back to all changes in this revision

Viewing changes to trunk/lib/corpusfg/graphs/sa-champollion/en/etoken.pl

  • Committer: tahoar
  • Date: 2012-05-02 15:46:23 UTC
  • Revision ID: svn-v4:bc069b21-dff4-4e29-a776-06a4e04bad4e::266
new layout. need to update code to use the new layout

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
#!/usr/bin/perl -n
# English tokenizer filter.
#
# Because of the -n switch the body of this file runs once per input line,
# with the current line in $_.
#   * A line containing <seg id=N>...</seg> has only the text between the
#     tags tokenized and is printed back inside the same tag; anything on
#     the line outside the tags is not printed.
#   * Any other line is chomped, tokenized as a whole and printed followed
#     by a newline.

use strict;
use warnings;

# One-time stream setup.  It must run at compile time: under -n the first
# input line has already been read when the first statement of the body
# executes, so a run-time binmode(STDIN) would leave that first line
# undecoded (and it would be re-applied needlessly on every later line).
BEGIN {
    $| = 1;                       # disable Perl output buffering
    binmode(STDIN,  ":utf8");
    binmode(STDOUT, ":utf8");
}

my $segid;
my $seg;

if (/<seg id=(\d+)>(.*)<\/seg>/) {
    ($segid, $seg) = ($1, $2);
} else {
    chomp;
    $segid = "";
    $seg   = $_;
}

$seg = tokenize_segment($seg);

if ($segid ne "") {
    print "<seg id=$segid>$seg</seg>\n";
} else {
    print "$seg\n";
}

# tokenize_segment($text)
#
# Returns a copy of $text in which every non-word character is surrounded by
# spaces, with these exceptions for separators that sit between two digits:
#   '.' and ':' stay attached to the digits (3.14, 12:30, 10.0.0.1)
#   ','         is removed                  (1,000,000 -> 1000000)
# Finally an ASCII letter directly adjacent to a digit is separated from it
# by a space (x2y -> x 2 y).  Existing whitespace is not collapsed.
sub tokenize_segment {
    my ($text) = @_;

    # put space after any period that's followed by a non-number
    $text =~ s/\.(\D)/. $1/g;
    # put space before any period that's followed by a space
    # (the following space is introduced by the previous substitution)
    $text =~ s/\. / . /g;

    # Hide '.', ':' and ',' that are surrounded by digits behind word-only
    # placeholders so the generic splitting below leaves them alone.
    # Lookbehind/lookahead are used instead of capturing the digits so that
    # a digit shared by two separators (1.2.3, 10.0.0.1) is not consumed by
    # the first match, which would leave the second separator unprotected.
    $text =~ s/(?<=\d)\.(?=\d)/DOTTKN/g;
    $text =~ s/(?<=\d):(?=\d)/COLONTKN/g;
    $text =~ s/(?<=\d),(?=\d)/COMATKN/g;

    # put spaces around every remaining non-word character
    # (explicit capture rather than $&, which slows down every regex in the
    # program on older perls)
    $text =~ s/(\W)/ $1 /g;

    # Bring the protected separators back; commas between digits are
    # dropped rather than restored.
    $text =~ s/(?<=\d)DOTTKN(?=\d)/./g;
    $text =~ s/(?<=\d)COLONTKN(?=\d)/:/g;
    $text =~ s/(?<=\d)COMATKN(?=\d)//g;

    # split letters from adjacent digits
    $text =~ s/([a-zA-Z])(\d)/$1 $2/g;
    $text =~ s/(\d)([a-zA-Z])/$1 $2/g;

    return $text;
}
 
46