3
"""Porter Stemming Algorithm
4
This is the Porter stemming algorithm, ported to Python from the
5
version coded up in ANSI C by the author. It may be be regarded
6
as canonical, in that it follows the algorithm presented in
8
Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
11
only differing from it at the points maked --DEPARTURE-- below.
13
See also http://www.tartarus.org/~martin/PorterStemmer
15
The algorithm as described in the paper could be exactly replicated
16
by adjusting the points of DEPARTURE, but this is barely necessary,
17
because (a) the points of DEPARTURE are definitely improvements, and
18
(b) no encoding of the Porter stemmer I have seen is anything like
19
as exact as this version, even with the points of DEPARTURE!
21
Vivake Gupta (v@nano.com)
23
Release 1: January 2001
25
Further adjustments by Santiago Bruno (bananabruno@gmail.com)
26
to allow word input not restricted to one word per line, leading
37
"""The main part of the stemming algorithm starts here.
38
b is a buffer holding a word to be stemmed. The letters are in b[k0],
39
b[k0+1] ... ending at b[k]. In fact k0 = 0 in this demo program. k is
40
readjusted downwards as the stemming progresses. Zero termination is
41
not in fact used in the algorithm.
43
Note that only lower case sequences are stemmed. Forcing to lower case
44
should be done before stem(...) is called.
47
self.b = "" # buffer for word to be stemmed
50
self.j = 0 # j is a general offset into the string
53
"""cons(i) is TRUE <=> b[i] is a consonant."""
54
if self.b[i] == 'a' or self.b[i] == 'e' or self.b[i] == 'i' or self.b[i] == 'o' or self.b[i] == 'u':
60
return (not self.cons(i - 1))
64
"""m() measures the number of consonant sequences between k0 and j.
65
if c is a consonant sequence and v a vowel sequence, and <..>
66
indicates arbitrary presence,
100
def vowelinstem(self):
101
"""vowelinstem() is TRUE <=> k0,...j contains a vowel"""
102
for i in range(self.k0, self.j + 1):
107
def doublec(self, j):
108
"""doublec(j) is TRUE <=> j,(j-1) contain a double consonant."""
109
if j < (self.k0 + 1):
111
if (self.b[j] != self.b[j-1]):
116
"""cvc(i) is TRUE <=> i-2,i-1,i has the form consonant - vowel - consonant
117
and also if the second c is not w,x or y. this is used when trying to
118
restore an e at the end of a short e.g.
120
cav(e), lov(e), hop(e), crim(e), but
123
if i < (self.k0 + 2) or not self.cons(i) or self.cons(i-1) or not self.cons(i-2):
126
if ch == 'w' or ch == 'x' or ch == 'y':
131
"""ends(s) is TRUE <=> k0,...k ends with the string s."""
133
if s[length - 1] != self.b[self.k]: # tiny speed-up
135
if length > (self.k - self.k0 + 1):
137
if self.b[self.k-length+1:self.k+1] != s:
139
self.j = self.k - length
143
"""setto(s) sets (j+1),...k to the characters in the string s, readjusting k."""
145
self.b = self.b[:self.j+1] + s + self.b[self.j+length+1:]
146
self.k = self.j + length
149
"""r(s) is used further down."""
154
"""step1ab() gets rid of plurals and -ed or -ing. e.g.
174
if self.b[self.k] == 's':
175
if self.ends("sses"):
177
elif self.ends("ies"):
179
elif self.b[self.k - 1] != 's':
184
elif (self.ends("ed") or self.ends("ing")) and self.vowelinstem():
186
if self.ends("at"): self.setto("ate")
187
elif self.ends("bl"): self.setto("ble")
188
elif self.ends("iz"): self.setto("ize")
189
elif self.doublec(self.k):
192
if ch == 'l' or ch == 's' or ch == 'z':
194
elif (self.m() == 1 and self.cvc(self.k)):
198
"""step1c() turns terminal y to i when there is another vowel in the stem."""
199
if (self.ends("y") and self.vowelinstem()):
200
self.b = self.b[:self.k] + 'i' + self.b[self.k+1:]
203
"""step2() maps double suffices to single ones.
204
so -ization ( = -ize plus -ation) maps to -ize etc. note that the
205
string before the suffix must give m() > 0.
207
if self.b[self.k - 1] == 'a':
208
if self.ends("ational"): self.r("ate")
209
elif self.ends("tional"): self.r("tion")
210
elif self.b[self.k - 1] == 'c':
211
if self.ends("enci"): self.r("ence")
212
elif self.ends("anci"): self.r("ance")
213
elif self.b[self.k - 1] == 'e':
214
if self.ends("izer"): self.r("ize")
215
elif self.b[self.k - 1] == 'l':
216
if self.ends("bli"): self.r("ble") # --DEPARTURE--
217
# To match the published algorithm, replace this phrase with
218
# if self.ends("abli"): self.r("able")
219
elif self.ends("alli"): self.r("al")
220
elif self.ends("entli"): self.r("ent")
221
elif self.ends("eli"): self.r("e")
222
elif self.ends("ousli"): self.r("ous")
223
elif self.b[self.k - 1] == 'o':
224
if self.ends("ization"): self.r("ize")
225
elif self.ends("ation"): self.r("ate")
226
elif self.ends("ator"): self.r("ate")
227
elif self.b[self.k - 1] == 's':
228
if self.ends("alism"): self.r("al")
229
elif self.ends("iveness"): self.r("ive")
230
elif self.ends("fulness"): self.r("ful")
231
elif self.ends("ousness"): self.r("ous")
232
elif self.b[self.k - 1] == 't':
233
if self.ends("aliti"): self.r("al")
234
elif self.ends("iviti"): self.r("ive")
235
elif self.ends("biliti"): self.r("ble")
236
elif self.b[self.k - 1] == 'g': # --DEPARTURE--
237
if self.ends("logi"): self.r("log")
238
# To match the published algorithm, delete this phrase
241
"""step3() dels with -ic-, -full, -ness etc. similar strategy to step2."""
242
if self.b[self.k] == 'e':
243
if self.ends("icate"): self.r("ic")
244
elif self.ends("ative"): self.r("")
245
elif self.ends("alize"): self.r("al")
246
elif self.b[self.k] == 'i':
247
if self.ends("iciti"): self.r("ic")
248
elif self.b[self.k] == 'l':
249
if self.ends("ical"): self.r("ic")
250
elif self.ends("ful"): self.r("")
251
elif self.b[self.k] == 's':
252
if self.ends("ness"): self.r("")
255
"""step4() takes off -ant, -ence etc., in context <c>vcvc<v>."""
256
if self.b[self.k - 1] == 'a':
257
if self.ends("al"): pass
259
elif self.b[self.k - 1] == 'c':
260
if self.ends("ance"): pass
261
elif self.ends("ence"): pass
263
elif self.b[self.k - 1] == 'e':
264
if self.ends("er"): pass
266
elif self.b[self.k - 1] == 'i':
267
if self.ends("ic"): pass
269
elif self.b[self.k - 1] == 'l':
270
if self.ends("able"): pass
271
elif self.ends("ible"): pass
273
elif self.b[self.k - 1] == 'n':
274
if self.ends("ant"): pass
275
elif self.ends("ement"): pass
276
elif self.ends("ment"): pass
277
elif self.ends("ent"): pass
279
elif self.b[self.k - 1] == 'o':
280
if self.ends("ion") and (self.b[self.j] == 's' or self.b[self.j] == 't'): pass
281
elif self.ends("ou"): pass
284
elif self.b[self.k - 1] == 's':
285
if self.ends("ism"): pass
287
elif self.b[self.k - 1] == 't':
288
if self.ends("ate"): pass
289
elif self.ends("iti"): pass
291
elif self.b[self.k - 1] == 'u':
292
if self.ends("ous"): pass
294
elif self.b[self.k - 1] == 'v':
295
if self.ends("ive"): pass
297
elif self.b[self.k - 1] == 'z':
298
if self.ends("ize"): pass
306
"""step5() removes a final -e if m() > 1, and changes -ll to -l if
310
if self.b[self.k] == 'e':
312
if a > 1 or (a == 1 and not self.cvc(self.k-1)):
314
if self.b[self.k] == 'l' and self.doublec(self.k) and self.m() > 1:
317
def stem(self, p, i, j):
318
"""In stem(p,i,j), p is a char pointer, and the string to be stemmed
319
is from p[i] to p[j] inclusive. Typically i is zero and j is the
320
offset to the last character of a string, (p[j+1] == '\0'). The
321
stemmer adjusts the characters p[i] ... p[j] and returns the new
322
end-point of the string, k. Stemming never increases word length, so
323
i <= k <= j. To turn the stemmer into a module, declare 'stem' as
324
extern, and delete the remainder of this file.
326
# copy the parameters into statics
330
if self.k <= self.k0 + 1:
331
return self.b # --DEPARTURE--
333
# With this line, strings of length 1 or 2 don't go through the
334
# stemming process, although no mention is made of this in the
335
# published algorithm. Remove the line to match the published
344
return self.b[self.k0:self.k+1]
347
if __name__ == '__main__':
349
if len(sys.argv) > 1:
350
for f in sys.argv[1:]:
351
infile = open(f, 'r')
355
line = infile.readline()
363
output += p.stem(word, 0,len(word)-1)