root/pyndexter/trunk/pyndexter/stemmers/_porter.py

Revision 452, 12.1 kB (checked in by athomas, 1 year ago)

pyndexter: All modules are now prefixed with _ to avoid import collisions. Updated unit tests.

  • Property svn:eol-style set to native
Line 
1 # -*- coding: utf-8 -*-
2 #
3 # This software is licensed as described in the file COPYING, which
4 # you should have received as part of this distribution.
5 #
6
7 """Porter Stemming Algorithm
8
9 This is the Porter stemming algorithm, ported to Python from the
10 version coded up in ANSI C by the author. It may be be regarded
11 as canonical, in that it follows the algorithm presented in
12
13 Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
14 no. 3, pp 130-137,
15
16 only differing from it at the points maked --DEPARTURE-- below.
17
18 See also http://www.tartarus.org/~martin/PorterStemmer
19
20 The algorithm as described in the paper could be exactly replicated
21 by adjusting the points of DEPARTURE, but this is barely necessary,
22 because (a) the points of DEPARTURE are definitely improvements, and
23 (b) no encoding of the Porter stemmer I have seen is anything like
24 as exact as this version, even with the points of DEPARTURE!
25
26 Vivake Gupta (v@nano.com)
27
28 Release 1: January 2001
29 """
30
31 import sys
32 from pyndexter import PluginFactory
33
34 class PorterStemmer(object):
35
36     def __init__(self):
37         """The main part of the stemming algorithm starts here.
38         b is a buffer holding a word to be stemmed. The letters are in b[k0],
39         b[k0+1] ... ending at b[k]. In fact k0 = 0 in this demo program. k is
40         readjusted downwards as the stemming progresses. Zero termination is
41         not in fact used in the algorithm.
42
43         Note that only lower case sequences are stemmed. Forcing to lower case
44         should be done before stem(...) is called.
45         """
46
47         self.b = ""  # buffer for word to be stemmed
48         self.k = 0
49         self.k0 = 0
50         self.j = 0   # j is a general offset into the string
51
52     def cons(self, i):
53         """cons(i) is TRUE <=> b[i] is a consonant."""
54         if self.b[i] == 'a' or self.b[i] == 'e' or self.b[i] == 'i' or self.b[i] == 'o' or self.b[i] == 'u':
55             return 0
56         if self.b[i] == 'y':
57             if i == self.k0:
58                 return 1
59             else:
60                 return (not self.cons(i - 1))
61         return 1
62
63     def m(self):
64         """m() measures the number of consonant sequences between k0 and j.
65         if c is a consonant sequence and v a vowel sequence, and <..>
66         indicates arbitrary presence,
67
68            <c><v>       gives 0
69            <c>vc<v>     gives 1
70            <c>vcvc<v>   gives 2
71            <c>vcvcvc<v> gives 3
72            ....
73         """
74         n = 0
75         i = self.k0
76         while 1:
77             if i > self.j:
78                 return n
79             if not self.cons(i):
80                 break
81             i = i + 1
82         i = i + 1
83         while 1:
84             while 1:
85                 if i > self.j:
86                     return n
87                 if self.cons(i):
88                     break
89                 i = i + 1
90             i = i + 1
91             n = n + 1
92             while 1:
93                 if i > self.j:
94                     return n
95                 if not self.cons(i):
96                     break
97                 i = i + 1
98             i = i + 1
99
100     def vowelinstem(self):
101         """vowelinstem() is TRUE <=> k0,...j contains a vowel"""
102         for i in range(self.k0, self.j + 1):
103             if not self.cons(i):
104                 return 1
105         return 0
106
107     def doublec(self, j):
108         """doublec(j) is TRUE <=> j,(j-1) contain a double consonant."""
109         if j < (self.k0 + 1):
110             return 0
111         if (self.b[j] != self.b[j-1]):
112             return 0
113         return self.cons(j)
114
115     def cvc(self, i):
116         """cvc(i) is TRUE <=> i-2,i-1,i has the form consonant - vowel - consonant
117         and also if the second c is not w,x or y. this is used when trying to
118         restore an e at the end of a short  e.g.
119
120            cav(e), lov(e), hop(e), crim(e), but
121            snow, box, tray.
122         """
123         if i < (self.k0 + 2) or not self.cons(i) or self.cons(i-1) or not self.cons(i-2):
124             return 0
125         ch = self.b[i]
126         if ch == 'w' or ch == 'x' or ch == 'y':
127             return 0
128         return 1
129
130     def ends(self, s):
131         """ends(s) is TRUE <=> k0,...k ends with the string s."""
132         length = len(s)
133         if s[length - 1] != self.b[self.k]: # tiny speed-up
134             return 0
135         if length > (self.k - self.k0 + 1):
136             return 0
137         if self.b[self.k-length+1:self.k+1] != s:
138             return 0
139         self.j = self.k - length
140         return 1
141
142     def setto(self, s):
143         """setto(s) sets (j+1),...k to the characters in the string s, readjusting k."""
144         length = len(s)
145         self.b = self.b[:self.j+1] + s + self.b[self.j+length+1:]
146         self.k = self.j + length
147
148     def r(self, s):
149         """r(s) is used further down."""
150         if self.m() > 0:
151             self.setto(s)
152
153     def step1ab(self):
154         """step1ab() gets rid of plurals and -ed or -ing. e.g.
155
156            caresses  ->  caress
157            ponies    ->  poni
158            ties      ->  ti
159            caress    ->  caress
160            cats      ->  cat
161
162            feed      ->  feed
163            agreed    ->  agree
164            disabled  ->  disable
165
166            matting   ->  mat
167            mating    ->  mate
168            meeting   ->  meet
169            milling   ->  mill
170            messing   ->  mess
171
172            meetings  ->  meet
173         """
174         if self.b[self.k] == 's':
175             if self.ends("sses"):
176                 self.k = self.k - 2
177             elif self.ends("ies"):
178                 self.setto("i")
179             elif self.b[self.k - 1] != 's':
180                 self.k = self.k - 1
181         if self.ends("eed"):
182             if self.m() > 0:
183                 self.k = self.k - 1
184         elif (self.ends("ed") or self.ends("ing")) and self.vowelinstem():
185             self.k = self.j
186             if self.ends("at"):   self.setto("ate")
187             elif self.ends("bl"): self.setto("ble")
188             elif self.ends("iz"): self.setto("ize")
189             elif self.doublec(self.k):
190                 self.k = self.k - 1
191                 ch = self.b[self.k]
192                 if ch == 'l' or ch == 's' or ch == 'z':
193                     self.k = self.k + 1
194             elif (self.m() == 1 and self.cvc(self.k)):
195                 self.setto("e")
196
197     def step1c(self):
198         """step1c() turns terminal y to i when there is another vowel in the stem."""
199         if (self.ends("y") and self.vowelinstem()):
200             self.b = self.b[:self.k] + 'i' + self.b[self.k+1:]
201
202     def step2(self):
203         """step2() maps double suffices to single ones.
204         so -ization ( = -ize plus -ation) maps to -ize etc. note that the
205         string before the suffix must give m() > 0.
206         """
207         if self.b[self.k - 1] == 'a':
208             if self.ends("ational"):   self.r("ate")
209             elif self.ends("tional"):  self.r("tion")
210         elif self.b[self.k - 1] == 'c':
211             if self.ends("enci"):      self.r("ence")
212             elif self.ends("anci"):    self.r("ance")
213         elif self.b[self.k - 1] == 'e':
214             if self.ends("izer"):      self.r("ize")
215         elif self.b[self.k - 1] == 'l':
216             if self.ends("bli"):       self.r("ble") # --DEPARTURE--
217             # To match the published algorithm, replace this phrase with
218             #   if self.ends("abli"):      self.r("able")
219             elif self.ends("alli"):    self.r("al")
220             elif self.ends("entli"):   self.r("ent")
221             elif self.ends("eli"):     self.r("e")
222             elif self.ends("ousli"):   self.r("ous")
223         elif self.b[self.k - 1] == 'o':
224             if self.ends("ization"):   self.r("ize")
225             elif self.ends("ation"):   self.r("ate")
226             elif self.ends("ator"):    self.r("ate")
227         elif self.b[self.k - 1] == 's':
228             if self.ends("alism"):     self.r("al")
229             elif self.ends("iveness"): self.r("ive")
230             elif self.ends("fulness"): self.r("ful")
231             elif self.ends("ousness"): self.r("ous")
232         elif self.b[self.k - 1] == 't':
233             if self.ends("aliti"):     self.r("al")
234             elif self.ends("iviti"):   self.r("ive")
235             elif self.ends("biliti"):  self.r("ble")
236         elif self.b[self.k - 1] == 'g': # --DEPARTURE--
237             if self.ends("logi"):      self.r("log")
238         # To match the published algorithm, delete this phrase
239
240     def step3(self):
241         """step3() dels with -ic-, -full, -ness etc. similar strategy to step2."""
242         if self.b[self.k] == 'e':
243             if self.ends("icate"):     self.r("ic")
244             elif self.ends("ative"):   self.r("")
245             elif self.ends("alize"):   self.r("al")
246         elif self.b[self.k] == 'i':
247             if self.ends("iciti"):     self.r("ic")
248         elif self.b[self.k] == 'l':
249             if self.ends("ical"):      self.r("ic")
250             elif self.ends("ful"):     self.r("")
251         elif self.b[self.k] == 's':
252             if self.ends("ness"):      self.r("")
253
254     def step4(self):
255         """step4() takes off -ant, -ence etc., in context <c>vcvc<v>."""
256         if self.b[self.k - 1] == 'a':
257             if self.ends("al"): pass
258             else: return
259         elif self.b[self.k - 1] == 'c':
260             if self.ends("ance"): pass
261             elif self.ends("ence"): pass
262             else: return
263         elif self.b[self.k - 1] == 'e':
264             if self.ends("er"): pass
265             else: return
266         elif self.b[self.k - 1] == 'i':
267             if self.ends("ic"): pass
268             else: return
269         elif self.b[self.k - 1] == 'l':
270             if self.ends("able"): pass
271             elif self.ends("ible"): pass
272             else: return
273         elif self.b[self.k - 1] == 'n':
274             if self.ends("ant"): pass
275             elif self.ends("ement"): pass
276             elif self.ends("ment"): pass
277             elif self.ends("ent"): pass
278             else: return
279         elif self.b[self.k - 1] == 'o':
280             if self.ends("ion") and (self.b[self.j] == 's' or self.b[self.j] == 't'): pass
281             elif self.ends("ou"): pass
282             # takes care of -ous
283             else: return
284         elif self.b[self.k - 1] == 's':
285             if self.ends("ism"): pass
286             else: return
287         elif self.b[self.k - 1] == 't':
288             if self.ends("ate"): pass
289             elif self.ends("iti"): pass
290             else: return
291         elif self.b[self.k - 1] == 'u':
292             if self.ends("ous"): pass
293             else: return
294         elif self.b[self.k - 1] == 'v':
295             if self.ends("ive"): pass
296             else: return
297         elif self.b[self.k - 1] == 'z':
298             if self.ends("ize"): pass
299             else: return
300         else:
301             return
302         if self.m() > 1:
303             self.k = self.j
304
305     def step5(self):
306         """step5() removes a final -e if m() > 1, and changes -ll to -l if
307         m() > 1.
308         """
309         self.j = self.k
310         if self.b[self.k] == 'e':
311             a = self.m()
312             if a > 1 or (a == 1 and not self.cvc(self.k-1)):
313                 self.k = self.k - 1
314         if self.b[self.k] == 'l' and self.doublec(self.k) and self.m() > 1:
315             self.k = self.k -1
316
317     def stem(self, p, i, j):
318         """In stem(p,i,j), p is a char pointer, and the string to be stemmed
319         is from p[i] to p[j] inclusive. Typically i is zero and j is the
320         offset to the last character of a string, (p[j+1] == '\0'). The
321         stemmer adjusts the characters p[i] ... p[j] and returns the new
322         end-point of the string, k. Stemming never increases word length, so
323         i <= k <= j. To turn the stemmer into a module, declare 'stem' as
324         extern, and delete the remainder of this file.
325         """
326         # copy the parameters into statics
327         self.b = p
328         self.k = j
329         self.k0 = i
330         if self.k <= self.k0 + 1:
331             return self.b # --DEPARTURE--
332
333         # With this line, strings of length 1 or 2 don't go through the
334         # stemming process, although no mention is made of this in the
335         # published algorithm. Remove the line to match the published
336         # algorithm.
337
338         self.step1ab()
339         self.step1c()
340         self.step2()
341         self.step3()
342         self.step4()
343         self.step5()
344         return self.b[self.k0:self.k+1]
345
346     def __call__(self, word):
347         return self.stem(word, 0, len(word) - 1)
348
349
350 stemmer_factory = PluginFactory(PorterStemmer)
Note: See TracBrowser for help on using the browser.