# Purpose: tokenize Hindi text by putting spaces around numbers,
# ASCII punctuation, and Hindi end-of-sentence marks.
# The output can be piped into another process that splits on spaces
# to get a token array.
# Author : Xiaoyi Ma, 6/5/2003
# Enable autoflush on the currently selected output handle (STDOUT unless
# changed with select) so output reaches a downstream pipe without delay.
$| = 1;

#$Usage = "Usage: $0 [filename]\n";
#die $Usage if ( @ARGV > 1 or ( @ARGV == 1 and ! -f $ARGV[0] ));

# Decode input and encode output as strict UTF-8.  Unlike the bare ":utf8"
# layer, ":encoding(UTF-8)" validates the byte stream, so malformed input is
# reported with a warning instead of silently producing corrupt strings.
binmode STDIN,  ":encoding(UTF-8)" or die "Cannot set UTF-8 layer on STDIN: $!";
binmode STDOUT, ":encoding(UTF-8)" or die "Cannot set UTF-8 layer on STDOUT: $!";
# put space after any period that's followed by a non-number
# put space before any period that's followed by a space
# (that following space is the one introduced by the previous command)
# put space around colons and commas, unless the mark sits directly between
# two digits (as in "10:30" or "1,000").  The look-arounds test the
# neighbouring digits without consuming them, so every separator in a chain
# such as "1:2:3" stays attached to its numbers, and no placeholder token
# that could collide with the input text is needed.
s/(?<![0-9]):|:(?![0-9])/ : /g;
s/(?<![0-9]),|,(?![0-9])/ , /g;
# put space around any other punctuation
# put spaces around special symbols
# put spaces around Hindi end-of-sentence marks: danda (U+0964) and
# double danda (U+0965).  A capture group is used rather than $&, which
# slows down every regex match in the program on perls older than 5.20.
s/([\x{0964}\x{0965}])/ $1 /g;