2
##################################################
4
# Light Stemmer for Hindi
6
# Modified by Xiaoyi Ma 6/6/2003
7
# Created by Kareem Darwish
8
# Last Modified June 4, 2003
11
##################################################
13
# Based on the suffix list provided by A. Ramanathan
14
# and D. Rao in their paper entitled
15
# "A Lightweight Stemmer for Hindi"
17
# usage: Reads STDIN, outputs to STDOUT (an optional filename argument is also accepted)
20
##################################################
21
# Turn on autoflush for the currently selected output handle so that output
# is written as soon as it is produced.
$| = 1;

# At most one command-line argument is accepted, and when present it must name
# an existing plain file; any other invocation aborts with the usage message.
$Usage = "Usage: $0 [filename]\n";
if ( @ARGV > 1 ) {
    die $Usage;
}
elsif ( @ARGV == 1 and ! -f $ARGV[0] ) {
    die $Usage;
}

# Put the :utf8 layer on both standard streams so that text is handled as
# characters rather than raw bytes.
binmode( STDIN,  ":utf8" );
binmode( STDOUT, ":utf8" );
34
# sort by string length
35
# Comparator for sort(): orders strings from longest to shortest.
# Reads the package variables $a and $b that sort() sets for each comparison.
# Returns a negative, zero or positive number, as sort() expects.
sub lengthly {
    return length($b) <=> length($a);
}
38
# Whitespace-separated inventory of suffix strings, written with \x{...}
# code-point escapes (every code point used lies in the Devanagari block,
# U+0900-U+097F).  The literal runs over two physical lines, so it contains an
# embedded newline; that is harmless because the string is broken apart with
# split ' ', which treats any run of whitespace as a single separator.
$suffixes = "\x{0906} \x{0907} \x{0908} \x{0909} \x{090a} \x{090f} \x{0913} \x{090f}\x{0902} \x{0913}\x{0902} \x{0906}\x{0902} \x{0909}\x{0906}\x{0902} \x{0909}\x{090f}\x{0902} \x{0909}\x{0913}\x{0902} \x{0906}\x{090f}\x{0902} \x{0906}\x{0913}\x{0902} \x{0907}\x{092f}\x{093e}\x{0905}\x{0902} \x{0907}\x{092f}\x{0913}\x{0902} \x{0906}\x{0907}\x{092f}\x{093e}\x{0905}\x{0902} \x{0906}\x{0902}\x{0939} \x{0906}\x{0907}\x{092f}\x{0913}\x{0902} \x{0907}\x{092f}\x{093e}\x{0905}\x{0902}\x{0939} \x{0906}\x{0907}\x{092f}\x{093e}\x{0905}\x{0902}\x{0939} \x{0905}\x{0924}\x{093e}\x{0905}\x{090f}\x{0902} \x{0905}\x{0924}\x{093e}\x{0905}\x{0913}\x{0902} \x{0905}\x{0928}\x{093e}\x{0905}\x{090f}\x{0902} \x{0905}\x{0928}\x{093e}\x{0905}\x{0913}\x{0902} \x{0905}\x{0924}\x{093e}\x{0905} \x{0905}\x{0924}\x{0948}\x{0907} \x{0908}\x{0902} \x{0905}\x{0924}\x{0948}\x{0907}\x{0902} \x{0905}\x{0924}\x{090f} \x{0906}\x{0924}\x{093e}\x{0905} \x{0906}\x{0924}\x{0948}\x{0907} \x{0906}\x{0924}\x{0948}\x{0907}\x{0902} \x{0906}\x{0924}\x{090f} \x{0905}\x{0928}\x{093e}\x{0905} \x{0905}\x{0928}\x{0948}\x{0907} \x{0905}\x{0928}\x{090f} \x{0906}\x{0928}\x{093e}\x{0905} \x{0906}\x{0928}\x{090f} \x{090a}\x{0902}\x{0917}\x{093e}\x{0905} \x{090a}\x{0902}\x{0917}\x{0948}\x{0907} \x{0906}\x{090a}\x{0902}\x{0917}\x{093e}\x{0905} \x{0906}\x{090a}\x{0902}\x{0917}\x{0948}\x{0907} \x{090f}\x{0902}\x{0917}\x{090f} \x{090f}\x{0902}\x{0917}\x{0948}\x{0907} \x{0906}\x{090f}\x{0902}\x{0917}\x{090f} \x{0906}\x{090f}\x{0902}\x{0917}\x{0948}\x{0907} \x{0913}\x{0917}\x{090f} \x{0913}\x{0917}\x{0948}\x{0907} \x{0906}\x{0913}\x{0917}\x{090f} \x{0906}\x{0913}\x{0917}\x{0948}\x{0907} \x{090f}\x{0917}\x{093e}\x{0905} \x{090f}\x{0917}\x{0948}\x{0907} \x{0906}\x{090f}\x{0917}\x{093e}\x{0905} \x{0906}\x{090f}\x{0917}\x{0948}\x{0907} \x{0906}\x{092f}\x{093e}\x{0905} \x{0906}\x{090f} \x{0906}\x{0908} \x{0906}\x{0908}\x{0902} \x{0907}\x{090f} \x{0906}\x{0913} \x{0906}\x{0907}\x{090f} \x{0905}\x{0915}\x{093e}\x{0930}\x{093e} 
\x{0906}\x{0915}\x{093e}\x{0930}\x{093e}";
40
# Break the suffix string apart on whitespace and keep the pieces in @s,
# ordered from the longest suffix down to the shortest.
@s = sort { length($b) <=> length($a) } split( ' ', $suffixes );
43
# candidate stems -- the code eventually picks the shortest one at the end
47
if ($w =~ /(.+?)($stem)$/) {