2
$| = 1; # turn on autoflush for the currently selected output handle, so each write is sent immediately instead of being buffered
6
# put space after any period that's followed by a non-number
8
# put space before any period that's followed by a space
9
# the following space is introduced in the previous command
12
# put space around colons and commas, unless they're surrounded by numbers
13
# Protect '.', ':' and ',' that sit directly between two digits (e.g. 3.14,
# 12:30, 1,000) by swapping them for the placeholder tokens DOTTKN, COLONTKN
# and COMATKN -- presumably so that punctuation-spacing steps do not split
# numbers apart. All three substitutions operate on $_.
# NOTE(review): each pattern consumes both neighbouring digits and /g matches
# cannot overlap, so in a run such as "1.2.3" only the first separator is
# replaced ("1DOTTKN2.3"). Confirm whether such input matters.
s/(\d)\.(\d)/$1DOTTKN$2/g;
13
s/(\d)\:(\d)/$1COLONTKN$2/g;
15
s/(\d)\,(\d)/$1COMATKN$2/g;
17
# lack of knowledge of the encoding of the subject text
18
# the tokenizer will not try to put spaces around punctuation
19
# s/\"|\'|\.|\*|\!|\t|\(|\)|\[|\]|\{|\}|\,|\\|\/|\#|\$|\^|\%|\&|\-|\+|\;|\=|\_|\~|\||\?/ $& /g;
21
# Turn the placeholder tokens DOTTKN, COLONTKN and COMATKN, when found
# directly between two digits, back into '.', ':' and ',' respectively.
# All three substitutions operate on $_.
# NOTE(review): each pattern consumes both neighbouring digits and /g matches
# cannot overlap, so in "1DOTTKN2DOTTKN3" only the first placeholder would be
# restored. Confirm that such input cannot reach this point.
s/(\d)DOTTKN(\d)/$1\.$2/g;
22
s/(\d)COLONTKN(\d)/$1\:$2/g;
23
s/(\d)COMATKN(\d)/$1\,$2/g;