3
# Purpose: A light English stemmer
4
# Author: Xiaoyi Ma, LDC
5
# Date: September 17, 2003
7
# Output: English text with words stemmed
9
$| = 1; # autoflush: flush the currently selected output handle (STDOUT by default) after every print/write
11
## Modified 2012-01-22:
12
# Changed the morph file location to a command-line parameter (-m) instead of an environment variable
13
# Added binmode for utf8 STDIN and STDOUT (both calls are currently commented out below)
14
## Modified 2012-01-22:
17
#binmode(STDIN, ":utf8");
18
#binmode(STDOUT, ":utf8");
21
use Fcntl qw(O_RDONLY O_RDWR O_CREAT);
23
# Default location of the plain-text morph table; the -m option below replaces it.
my $eng_morph_txt = "./morph.en";
27
# The option tests match against $_. The loop that sets $_ (and that `next`
# resumes) is not visible here -- presumably it walks the command-line arguments.
# -m: take the following argument (via shift) as the morph table path.
/^-m$/ && ($eng_morph_txt = shift, next);
28
# -h: record that help was requested.
/^-h$/ && ($HELP = 1, next);
32
# Usage message, written to the default output handle. NOTE(review): the guard
# around it is not visible here; presumably it is printed only when $HELP is set.
print "Usage ./english_stemmer.pl ( -m eng_morph_txt ) < textfilein > stemmedfileout\n";
36
# The DBM file name is the text table's path with ".db" appended.
$eng_morph = "$eng_morph_txt.db";
37
# Build the DBM file from the text table; the sub returns at once if the file already exists.
&make_eng_morph_db($eng_morph_txt, $eng_morph);
38
# Open the morph DBM read-only and bind it to %eng_morph for lookups.
# The call is parenthesized and uses low-precedence `or`: written as
# `..., 0444 || die ...` the `||` bound to the mode literal 0444 (always true),
# so the die clause could never run and a failed tie went unnoticed.
tie(%eng_morph, "DB_File", $eng_morph, O_RDONLY, 0444)
    or die "$0: Cannot open dbmfile $eng_morph!\n";
41
# Input of the form <seg id=N>text</seg>: keep the numeric id and the enclosed text.
if (/<seg id=(\d+)>(.*)<\/seg>/) {
42
$segid = $1; $seg = $2;
47
# Per-token lookup. The loop that produces the tokens is not visible here;
# presumably $_ is the current token of $seg.
# A token with an entry in the morph table is replaced by that entry ...
if (defined $eng_morph{$_}) {
48
$stemmed_seg .= "$eng_morph{$_} ";
50
# ... and any other token is copied unchanged (presumably the else branch).
# Both forms append a trailing space, so the finished string ends in a space.
$stemmed_seg .= "$_ ";
53
# Emit the stemmed text inside the same seg markup, with the same id.
print "<seg id=$segid>$stemmed_seg</seg>\n";
58
# Same lookup for input without seg markup (presumably the else branch of the seg test).
if (defined $eng_morph{$_}) {
59
$stemmed_seg .= "$eng_morph{$_} ";
61
$stemmed_seg .= "$_ ";
64
# Plain input is printed as the stemmed text alone.
print "$stemmed_seg\n";
69
sub make_eng_morph_db {
70
my ($eng_morph_txt, $eng_morph) = @_;
72
return if -f $eng_morph;
74
print STDERR "Making English Morph DBM file ...\n";
75
tie %eng_morph, "DB_File", $eng_morph, O_CREAT|O_RDWR, 0664|| die "Cannot open dbmfile $eng_morph";
76
open F,"<$eng_morph_txt" || die "English Morph file $eng_morph not found";
80
$eng_morph{$_[0]} = $_[1];