5
# This is Kareem Darwish's stem_cp1256.pl modified by
6
# Leah Larkey, Alexander Fraser and Xiaoyi Ma
8
## Modified 2010-07-31:
9
# Changed morph file to parameter not environment variable
10
# Added binmode for utf8 STDIN and STDOUT
12
binmode(STDIN, ":utf8");
13
binmode(STDOUT, ":utf8");
15
## Modified 2012-01-08:
16
# Added $| = 1 to disable Perl buffering
19
my $atb_stems = "morph.ar";
23
/^-m$/ && ($atb_stems = shift, next);
28
# Open the morph file named by $atb_stems for reading on handle S.
# Low-precedence "or" is required: with "||" the die attaches to the
# filename string (which is always true), so a failed open went unnoticed.
# $! is included so the message reports why the open failed.
open(S, '<', $atb_stems) or die "$0: Cannot open $atb_stems: $!\n";
31
if (/^(.+)\s+(.+)$/) {
39
# split on spaces and punctuation
43
# split on spaces since tokenization was done by atoken.pl
44
my @tokens = split ' ', $_;
45
for my $token (@tokens) {
46
# remove all non-letters (diacritics, punctuation)
48
while ($token =~ /\G.*?((ء|آ|أ|ؤ|إ|ئ|ا|ب|ة|ت|ث|ج|ح|خ|د|ذ|ر|ز|س|ش|ص|ض|ط|ظ|ع|غ|ف|ق|ك|ل|م|ن|ه|و|ي|ى|[\x21-\x7E])+)/g) {
53
# normalize ya and Alef Maqsoura
56
# normalizing different alef-maad, alef-hamza-top,
57
# alef-hamza-bottom, bare-alef you can choose between light and
58
# aggressive normalization. The default is aggressive.
61
# $token =~ s/(آ|أ|إ)/ا/g;
62
# aggressive normalization
63
$token =~ s/(ء|آ|أ|ؤ|إ|ئ)/ا/g;
65
if (exists $stem{$token}) {
66
print "$stem{$token} ";
69
# print STDERR "$token\n";
73
# this regexp matches only when the token starts with one of the listed
74
# prefixes followed by at least one more character, so $2 is never empty.
75
if ($token =~ /^(وال|فال|بال|بت|يت|لت|مت|تت|وت|ست|نت|بم|لم|وم|كم|فم|ال|لل|وي|لي|سي|في|وا|فا|لا|با)(.+)$/) {
78
while ($token =~ /^(.+)(ات|وا|تا|ون|وه|ان|تي|ته|تم|كم|هن|هم|ها|ية|تك|نا|ين|يه|ة|ه|ي|ا)$/) {
81
# if ($token =~ /^(وال|فال|بال|بت|يت|لت|مت|تت|وت|ست|نت|بم|لم|وم|كم|فم|ال|لل|وي|لي|سي|في|وا|فا|لا|با|)(.+?)(ات|وا|تا|ون|وه|ان|تي|ته|تم|كم|هن|هم|ها|ية|تك|نا|ين|يه|ة|ه|ي|ا)$/) {
91
## Saved for possible future use
92
## remove diacritics and kashida
93
#s/(ً|ٌ|ٍ|َ|ُ|ِ|ّ|ْ|ـ)//g;