1
#!/usr/local/ls6/bin/perl
4
# ITIID : $ITI$ $Header $__Header$
5
# Author : Ulrich Pfeifer
6
# Created On : Thu Feb 1 13:47:58 1996
7
# Last Modified By: Ulrich Pfeifer
8
# Last Modified On: Thu Feb 1 13:52:38 1996
11
# Status : Unknown, Use with caution!
13
# (C) Copyright 1996, Universität Dortmund, all rights reserved.
16
# $Log: English.pm,v $
17
# Revision 1.1.1.1 2003/06/18 17:12:09 upf
18
# perlindex-1.200.tar.gz
20
# Revision 1.1.1.1 1996/02/22 15:43:31 pfeifer
25
package Text::English;
27
$VERSION = $VERSION = '0.01';
34
# Step 0 - remove punctuation
35
s/'s$//; s/^[^a-z]+//; s/[^a-z]+$//;
36
next unless /^[a-z]+$/;
39
if( /[^s]s$/ ) { s/sses$/ss/ || s/ies$/i/ || s/s$// }
41
# step1b_rules. The business with rule==106 is embedded in the
42
# boolean expressions here.
43
(/[aeiouy][^aeiouy].*eed$/ && s/eed$/ee/ ) ||
44
( s/([aeiou].*)ed$/$1/ || s/([aeiouy].*)ing$/$1/ ) &&
46
s/at$/ate/ || s/bl$/ble/ || s/iz$/ize/ || s/bb$/b/ ||
47
s/dd$/d/ || s/ff$/f/ || s/gg$/g/ || s/mm$/m/ ||
48
s/nn$/n/ || s/pp$/p/ || s/rr$/r/ || s/tt$/t/ ||
49
s/ww$/w/ || s/xx$/x/ ||
50
# This is wordsize==1 && CVC...addanE...
51
s/^[^aeiouy]+[aeiouy][^aeiouy]$/$&e/
53
#DEBUG && warn "step1b1: $_\n"
56
#DEBUG warn "step1c: $_\n" if
57
s/([aeiouy].*)y$/$1i/;
61
if ( s/ational$/ate/ || s/tional$/tion/ || s/enci$/ence/ ||
62
s/anci$/ance/ || s/izer$/ize/ || s/iser$/ise/ ||
63
s/abli$/able/ || s/alli$/al/ || s/entli$/ent/ ||
64
s/eli$/e/ || s/ousli$/ous/ || s/ization$/ize/ ||
65
s/isation$/ise/ || s/ation$/ate/ || s/ator$/ate/ ||
66
s/alism$/al/ || s/iveness$/ive/ || s/fulnes$/ful/ ||
67
s/ousness$/ous/ || s/aliti$/al/ || s/iviti$/ive/ ||
71
#DEBUG warn "step 2: l=$l m=$m\n";
72
$_ = $l.$m unless $l =~ /[^aeiou][aeiouy]/;
75
if ( s/icate$/ic/ || s/ative$// || s/alize$/al/ ||
76
s/iciti$/ic/ || s/ical$/ic/ || s/ful$// ||
80
#DEBUG warn "step 3: l=$l m=$m\n";
81
$_ = $l.$m unless $l =~ /[^aeiou][aeiouy]/;
85
if ( s/al$// || s/ance$// || s/ence$// || s/er$// ||
86
s/ic$// || s/able$// || s/ible$// || s/ant$// ||
87
s/ement$// || s/ment$// || s/ent$// || s/sion$/s/ ||
88
s/tion$/t/ || s/ou$// || s/ism$// || s/ate$// ||
89
s/iti$// || s/ous$// || s/ive$// || s/ize$// ||
93
# Look for two consonant/vowel transitions
95
#DEBUG warn "step 4: l=$l m=$m\n";
96
$_ = $l.$m unless $l =~ /[^aeiou][aeiouy].*[^aeiou][aeiouy]/;
100
#DEBUG warn("step 5a: $_\n") &&
101
s/e$// if ( /[^aeiou][aeiouy].*[^aeiou][aeiouy].*e$/ ||
102
( /[aeiou][^aeiouy].*e/ && ! /[^aeiou][aeiouy][^aeiouwxy]e$/) );
105
#DEBUG warn("step 5b: $_\n") &&
106
s/ll$/l/ if /[^aeiou][aeiouy].*[^aeiou][aeiouy].*ll$/;
120
Text::English - Porter's stemming algorithm
125
@stems = Text::English::stem( @words );
129
This routine applies the Porter Stemming Algorithm to its parameters,
130
returning the stemmed words.
131
It is derived from the C program "stemmer.c"
132
as found in freewais and elsewhere, which contains these notes:
134
Purpose: Implementation of the Porter stemming algorithm documented
135
in: Porter, M.F., "An Algorithm For Suffix Stripping,"
136
Program 14 (3), July 1980, pp. 130-137.
137
Provenance: Written by B. Frakes and C. Cox, 1986.
139
I have re-interpreted areas that use Frakes and Cox's "WordSize"
140
function. My version may misbehave on short words starting with "y",
141
but I can't think of any examples.
143
The step numbers correspond to Frakes and Cox, and are probably in
144
Porter's article (which I've not seen).
145
Porter's algorithm still has rough spots (e.g. current/currency, -ings words),
146
which I've not attempted to cure, although I have added
147
support for the British -ise suffix.
151
This is version 0.1. I would welcome feedback, especially improvements
152
to the punctuation-stripping step.
156
Ian Phillipps <ian@unipalm.pipex.com>
160
Copyright Public IP Exchange Ltd (PIPEX).
161
Available for use under the same terms as perl.