~jtv/corpusfiltergraph/cross-python

« back to all changes in this revision

Viewing changes to trunk/lib/corpusfg/graphs/sa-champollion/bin/gtoken.pl

  • Committer: tahoar
  • Date: 2012-05-02 15:46:23 UTC
  • Revision ID: svn-v4:bc069b21-dff4-4e29-a776-06a4e04bad4e::266
new layout. need to update code to use the new layout

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
#!/usr/bin/perl -n
 
2
$| = 1; # autoflush: flush the currently selected output handle after every print (re-executed for each input line under -n)
 
3
 
 
4
chomp; # strip the trailing newline from $_; the final print adds one back
 
5
 
 
6
# put a space after any period that is followed by a non-digit character
 
7
s/\.(\D)/\. $1/g; # \D also matches whitespace and another period; the matched character is consumed and re-emitted after the inserted space
 
8
# put space before any period that's followed by a space
 
9
# the following space is introduced in the previous command
 
10
s/\. / \. /g; # every ". " becomes " . ", so such a period ends up with a space on both sides
 
11
 
 
12
# put space around colons and commas, unless they're surrounded by digits
 
13
s/(\d)\.(\d)/$1DOTTKN$2/g; # hide a period that sits between two digits behind the DOTTKN placeholder
 
14
s/(\d)\:(\d)/$1COLONTKN$2/g; # hide a colon that sits between two digits behind the COLONTKN placeholder
 
15
s/(\d)\,(\d)/$1COMATKN$2/g; # hide a comma that sits between two digits behind the COMATKN placeholder
 
16
 
 
17
# because the encoding of the input text is not known,
 
18
# the tokenizer will not try to put spaces around punctuation
 
19
# s/\"|\'|\.|\*|\!|\t|\(|\)|\[|\]|\{|\}|\,|\\|\/|\#|\$|\^|\%|\&|\-|\+|\;|\=|\_|\~|\||\?/ $& /g;
 
20
 
 
21
s/(\d)DOTTKN(\d)/$1\.$2/g; # turn a DOTTKN placeholder between two digits back into a period
 
22
s/(\d)COLONTKN(\d)/$1\:$2/g; # turn a COLONTKN placeholder between two digits back into a colon
 
23
s/(\d)COMATKN(\d)/$1\,$2/g; # turn a COMATKN placeholder between two digits back into a comma
 
24
 
 
25
print "$_\n"; # emit the processed line, re-adding the newline that chomp removed