| 1 |
# -*- coding: utf-8 -*- |
|---|
| 2 |
# |
|---|
| 3 |
# This software is licensed as described in the file COPYING, which |
|---|
| 4 |
# you should have received as part of this distribution. |
|---|
| 5 |
# |
|---|
| 6 |
|
|---|
| 7 |
"""Porter Stemming Algorithm |
|---|
| 8 |
|
|---|
| 9 |
This is the Porter stemming algorithm, ported to Python from the |
|---|
| 10 |
version coded up in ANSI C by the author. It may be be regarded |
|---|
| 11 |
as canonical, in that it follows the algorithm presented in |
|---|
| 12 |
|
|---|
| 13 |
Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14, |
|---|
| 14 |
no. 3, pp 130-137, |
|---|
| 15 |
|
|---|
| 16 |
only differing from it at the points maked --DEPARTURE-- below. |
|---|
| 17 |
|
|---|
| 18 |
See also http://www.tartarus.org/~martin/PorterStemmer |
|---|
| 19 |
|
|---|
| 20 |
The algorithm as described in the paper could be exactly replicated |
|---|
| 21 |
by adjusting the points of DEPARTURE, but this is barely necessary, |
|---|
| 22 |
because (a) the points of DEPARTURE are definitely improvements, and |
|---|
| 23 |
(b) no encoding of the Porter stemmer I have seen is anything like |
|---|
| 24 |
as exact as this version, even with the points of DEPARTURE! |
|---|
| 25 |
|
|---|
| 26 |
Vivake Gupta (v@nano.com) |
|---|
| 27 |
|
|---|
| 28 |
Release 1: January 2001 |
|---|
| 29 |
""" |
|---|
| 30 |
|
|---|
| 31 |
import sys |
|---|
| 32 |
from pyndexter import PluginFactory |
|---|
| 33 |
|
|---|
| 34 |
class PorterStemmer(object): |
|---|
| 35 |
|
|---|
| 36 |
def __init__(self): |
|---|
| 37 |
"""The main part of the stemming algorithm starts here. |
|---|
| 38 |
b is a buffer holding a word to be stemmed. The letters are in b[k0], |
|---|
| 39 |
b[k0+1] ... ending at b[k]. In fact k0 = 0 in this demo program. k is |
|---|
| 40 |
readjusted downwards as the stemming progresses. Zero termination is |
|---|
| 41 |
not in fact used in the algorithm. |
|---|
| 42 |
|
|---|
| 43 |
Note that only lower case sequences are stemmed. Forcing to lower case |
|---|
| 44 |
should be done before stem(...) is called. |
|---|
| 45 |
""" |
|---|
| 46 |
|
|---|
| 47 |
self.b = "" # buffer for word to be stemmed |
|---|
| 48 |
self.k = 0 |
|---|
| 49 |
self.k0 = 0 |
|---|
| 50 |
self.j = 0 # j is a general offset into the string |
|---|
| 51 |
|
|---|
| 52 |
def cons(self, i): |
|---|
| 53 |
"""cons(i) is TRUE <=> b[i] is a consonant.""" |
|---|
| 54 |
if self.b[i] == 'a' or self.b[i] == 'e' or self.b[i] == 'i' or self.b[i] == 'o' or self.b[i] == 'u': |
|---|
| 55 |
return 0 |
|---|
| 56 |
if self.b[i] == 'y': |
|---|
| 57 |
if i == self.k0: |
|---|
| 58 |
return 1 |
|---|
| 59 |
else: |
|---|
| 60 |
return (not self.cons(i - 1)) |
|---|
| 61 |
return 1 |
|---|
| 62 |
|
|---|
| 63 |
def m(self): |
|---|
| 64 |
"""m() measures the number of consonant sequences between k0 and j. |
|---|
| 65 |
if c is a consonant sequence and v a vowel sequence, and <..> |
|---|
| 66 |
indicates arbitrary presence, |
|---|
| 67 |
|
|---|
| 68 |
<c><v> gives 0 |
|---|
| 69 |
<c>vc<v> gives 1 |
|---|
| 70 |
<c>vcvc<v> gives 2 |
|---|
| 71 |
<c>vcvcvc<v> gives 3 |
|---|
| 72 |
.... |
|---|
| 73 |
""" |
|---|
| 74 |
n = 0 |
|---|
| 75 |
i = self.k0 |
|---|
| 76 |
while 1: |
|---|
| 77 |
if i > self.j: |
|---|
| 78 |
return n |
|---|
| 79 |
if not self.cons(i): |
|---|
| 80 |
break |
|---|
| 81 |
i = i + 1 |
|---|
| 82 |
i = i + 1 |
|---|
| 83 |
while 1: |
|---|
| 84 |
while 1: |
|---|
| 85 |
if i > self.j: |
|---|
| 86 |
return n |
|---|
| 87 |
if self.cons(i): |
|---|
| 88 |
break |
|---|
| 89 |
i = i + 1 |
|---|
| 90 |
i = i + 1 |
|---|
| 91 |
n = n + 1 |
|---|
| 92 |
while 1: |
|---|
| 93 |
if i > self.j: |
|---|
| 94 |
return n |
|---|
| 95 |
if not self.cons(i): |
|---|
| 96 |
break |
|---|
| 97 |
i = i + 1 |
|---|
| 98 |
i = i + 1 |
|---|
| 99 |
|
|---|
| 100 |
def vowelinstem(self): |
|---|
| 101 |
"""vowelinstem() is TRUE <=> k0,...j contains a vowel""" |
|---|
| 102 |
for i in range(self.k0, self.j + 1): |
|---|
| 103 |
if not self.cons(i): |
|---|
| 104 |
return 1 |
|---|
| 105 |
return 0 |
|---|
| 106 |
|
|---|
| 107 |
def doublec(self, j): |
|---|
| 108 |
"""doublec(j) is TRUE <=> j,(j-1) contain a double consonant.""" |
|---|
| 109 |
if j < (self.k0 + 1): |
|---|
| 110 |
return 0 |
|---|
| 111 |
if (self.b[j] != self.b[j-1]): |
|---|
| 112 |
return 0 |
|---|
| 113 |
return self.cons(j) |
|---|
| 114 |
|
|---|
| 115 |
def cvc(self, i): |
|---|
| 116 |
"""cvc(i) is TRUE <=> i-2,i-1,i has the form consonant - vowel - consonant |
|---|
| 117 |
and also if the second c is not w,x or y. this is used when trying to |
|---|
| 118 |
restore an e at the end of a short e.g. |
|---|
| 119 |
|
|---|
| 120 |
cav(e), lov(e), hop(e), crim(e), but |
|---|
| 121 |
snow, box, tray. |
|---|
| 122 |
""" |
|---|
| 123 |
if i < (self.k0 + 2) or not self.cons(i) or self.cons(i-1) or not self.cons(i-2): |
|---|
| 124 |
return 0 |
|---|
| 125 |
ch = self.b[i] |
|---|
| 126 |
if ch == 'w' or ch == 'x' or ch == 'y': |
|---|
| 127 |
return 0 |
|---|
| 128 |
return 1 |
|---|
| 129 |
|
|---|
| 130 |
def ends(self, s): |
|---|
| 131 |
"""ends(s) is TRUE <=> k0,...k ends with the string s.""" |
|---|
| 132 |
length = len(s) |
|---|
| 133 |
if s[length - 1] != self.b[self.k]: # tiny speed-up |
|---|
| 134 |
return 0 |
|---|
| 135 |
if length > (self.k - self.k0 + 1): |
|---|
| 136 |
return 0 |
|---|
| 137 |
if self.b[self.k-length+1:self.k+1] != s: |
|---|
| 138 |
return 0 |
|---|
| 139 |
self.j = self.k - length |
|---|
| 140 |
return 1 |
|---|
| 141 |
|
|---|
| 142 |
def setto(self, s): |
|---|
| 143 |
"""setto(s) sets (j+1),...k to the characters in the string s, readjusting k.""" |
|---|
| 144 |
length = len(s) |
|---|
| 145 |
self.b = self.b[:self.j+1] + s + self.b[self.j+length+1:] |
|---|
| 146 |
self.k = self.j + length |
|---|
| 147 |
|
|---|
| 148 |
def r(self, s): |
|---|
| 149 |
"""r(s) is used further down.""" |
|---|
| 150 |
if self.m() > 0: |
|---|
| 151 |
self.setto(s) |
|---|
| 152 |
|
|---|
| 153 |
def step1ab(self): |
|---|
| 154 |
"""step1ab() gets rid of plurals and -ed or -ing. e.g. |
|---|
| 155 |
|
|---|
| 156 |
caresses -> caress |
|---|
| 157 |
ponies -> poni |
|---|
| 158 |
ties -> ti |
|---|
| 159 |
caress -> caress |
|---|
| 160 |
cats -> cat |
|---|
| 161 |
|
|---|
| 162 |
feed -> feed |
|---|
| 163 |
agreed -> agree |
|---|
| 164 |
disabled -> disable |
|---|
| 165 |
|
|---|
| 166 |
matting -> mat |
|---|
| 167 |
mating -> mate |
|---|
| 168 |
meeting -> meet |
|---|
| 169 |
milling -> mill |
|---|
| 170 |
messing -> mess |
|---|
| 171 |
|
|---|
| 172 |
meetings -> meet |
|---|
| 173 |
""" |
|---|
| 174 |
if self.b[self.k] == 's': |
|---|
| 175 |
if self.ends("sses"): |
|---|
| 176 |
self.k = self.k - 2 |
|---|
| 177 |
elif self.ends("ies"): |
|---|
| 178 |
self.setto("i") |
|---|
| 179 |
elif self.b[self.k - 1] != 's': |
|---|
| 180 |
self.k = self.k - 1 |
|---|
| 181 |
if self.ends("eed"): |
|---|
| 182 |
if self.m() > 0: |
|---|
| 183 |
self.k = self.k - 1 |
|---|
| 184 |
elif (self.ends("ed") or self.ends("ing")) and self.vowelinstem(): |
|---|
| 185 |
self.k = self.j |
|---|
| 186 |
if self.ends("at"): self.setto("ate") |
|---|
| 187 |
elif self.ends("bl"): self.setto("ble") |
|---|
| 188 |
elif self.ends("iz"): self.setto("ize") |
|---|
| 189 |
elif self.doublec(self.k): |
|---|
| 190 |
self.k = self.k - 1 |
|---|
| 191 |
ch = self.b[self.k] |
|---|
| 192 |
if ch == 'l' or ch == 's' or ch == 'z': |
|---|
| 193 |
self.k = self.k + 1 |
|---|
| 194 |
elif (self.m() == 1 and self.cvc(self.k)): |
|---|
| 195 |
self.setto("e") |
|---|
| 196 |
|
|---|
| 197 |
def step1c(self): |
|---|
| 198 |
"""step1c() turns terminal y to i when there is another vowel in the stem.""" |
|---|
| 199 |
if (self.ends("y") and self.vowelinstem()): |
|---|
| 200 |
self.b = self.b[:self.k] + 'i' + self.b[self.k+1:] |
|---|
| 201 |
|
|---|
| 202 |
def step2(self): |
|---|
| 203 |
"""step2() maps double suffices to single ones. |
|---|
| 204 |
so -ization ( = -ize plus -ation) maps to -ize etc. note that the |
|---|
| 205 |
string before the suffix must give m() > 0. |
|---|
| 206 |
""" |
|---|
| 207 |
if self.b[self.k - 1] == 'a': |
|---|
| 208 |
if self.ends("ational"): self.r("ate") |
|---|
| 209 |
elif self.ends("tional"): self.r("tion") |
|---|
| 210 |
elif self.b[self.k - 1] == 'c': |
|---|
| 211 |
if self.ends("enci"): self.r("ence") |
|---|
| 212 |
elif self.ends("anci"): self.r("ance") |
|---|
| 213 |
elif self.b[self.k - 1] == 'e': |
|---|
| 214 |
if self.ends("izer"): self.r("ize") |
|---|
| 215 |
elif self.b[self.k - 1] == 'l': |
|---|
| 216 |
if self.ends("bli"): self.r("ble") # --DEPARTURE-- |
|---|
| 217 |
# To match the published algorithm, replace this phrase with |
|---|
| 218 |
# if self.ends("abli"): self.r("able") |
|---|
| 219 |
elif self.ends("alli"): self.r("al") |
|---|
| 220 |
elif self.ends("entli"): self.r("ent") |
|---|
| 221 |
elif self.ends("eli"): self.r("e") |
|---|
| 222 |
elif self.ends("ousli"): self.r("ous") |
|---|
| 223 |
elif self.b[self.k - 1] == 'o': |
|---|
| 224 |
if self.ends("ization"): self.r("ize") |
|---|
| 225 |
elif self.ends("ation"): self.r("ate") |
|---|
| 226 |
elif self.ends("ator"): self.r("ate") |
|---|
| 227 |
elif self.b[self.k - 1] == 's': |
|---|
| 228 |
if self.ends("alism"): self.r("al") |
|---|
| 229 |
elif self.ends("iveness"): self.r("ive") |
|---|
| 230 |
elif self.ends("fulness"): self.r("ful") |
|---|
| 231 |
elif self.ends("ousness"): self.r("ous") |
|---|
| 232 |
elif self.b[self.k - 1] == 't': |
|---|
| 233 |
if self.ends("aliti"): self.r("al") |
|---|
| 234 |
elif self.ends("iviti"): self.r("ive") |
|---|
| 235 |
elif self.ends("biliti"): self.r("ble") |
|---|
| 236 |
elif self.b[self.k - 1] == 'g': # --DEPARTURE-- |
|---|
| 237 |
if self.ends("logi"): self.r("log") |
|---|
| 238 |
# To match the published algorithm, delete this phrase |
|---|
| 239 |
|
|---|
| 240 |
def step3(self): |
|---|
| 241 |
"""step3() dels with -ic-, -full, -ness etc. similar strategy to step2.""" |
|---|
| 242 |
if self.b[self.k] == 'e': |
|---|
| 243 |
if self.ends("icate"): self.r("ic") |
|---|
| 244 |
elif self.ends("ative"): self.r("") |
|---|
| 245 |
elif self.ends("alize"): self.r("al") |
|---|
| 246 |
elif self.b[self.k] == 'i': |
|---|
| 247 |
if self.ends("iciti"): self.r("ic") |
|---|
| 248 |
elif self.b[self.k] == 'l': |
|---|
| 249 |
if self.ends("ical"): self.r("ic") |
|---|
| 250 |
elif self.ends("ful"): self.r("") |
|---|
| 251 |
elif self.b[self.k] == 's': |
|---|
| 252 |
if self.ends("ness"): self.r("") |
|---|
| 253 |
|
|---|
| 254 |
def step4(self): |
|---|
| 255 |
"""step4() takes off -ant, -ence etc., in context <c>vcvc<v>.""" |
|---|
| 256 |
if self.b[self.k - 1] == 'a': |
|---|
| 257 |
if self.ends("al"): pass |
|---|
| 258 |
else: return |
|---|
| 259 |
elif self.b[self.k - 1] == 'c': |
|---|
| 260 |
if self.ends("ance"): pass |
|---|
| 261 |
elif self.ends("ence"): pass |
|---|
| 262 |
else: return |
|---|
| 263 |
elif self.b[self.k - 1] == 'e': |
|---|
| 264 |
if self.ends("er"): pass |
|---|
| 265 |
else: return |
|---|
| 266 |
elif self.b[self.k - 1] == 'i': |
|---|
| 267 |
if self.ends("ic"): pass |
|---|
| 268 |
else: return |
|---|
| 269 |
elif self.b[self.k - 1] == 'l': |
|---|
| 270 |
if self.ends("able"): pass |
|---|
| 271 |
elif self.ends("ible"): pass |
|---|
| 272 |
else: return |
|---|
| 273 |
elif self.b[self.k - 1] == 'n': |
|---|
| 274 |
if self.ends("ant"): pass |
|---|
| 275 |
elif self.ends("ement"): pass |
|---|
| 276 |
elif self.ends("ment"): pass |
|---|
| 277 |
elif self.ends("ent"): pass |
|---|
| 278 |
else: return |
|---|
| 279 |
elif self.b[self.k - 1] == 'o': |
|---|
| 280 |
if self.ends("ion") and (self.b[self.j] == 's' or self.b[self.j] == 't'): pass |
|---|
| 281 |
elif self.ends("ou"): pass |
|---|
| 282 |
# takes care of -ous |
|---|
| 283 |
else: return |
|---|
| 284 |
elif self.b[self.k - 1] == 's': |
|---|
| 285 |
if self.ends("ism"): pass |
|---|
| 286 |
else: return |
|---|
| 287 |
elif self.b[self.k - 1] == 't': |
|---|
| 288 |
if self.ends("ate"): pass |
|---|
| 289 |
elif self.ends("iti"): pass |
|---|
| 290 |
else: return |
|---|
| 291 |
elif self.b[self.k - 1] == 'u': |
|---|
| 292 |
if self.ends("ous"): pass |
|---|
| 293 |
else: return |
|---|
| 294 |
elif self.b[self.k - 1] == 'v': |
|---|
| 295 |
if self.ends("ive"): pass |
|---|
| 296 |
else: return |
|---|
| 297 |
elif self.b[self.k - 1] == 'z': |
|---|
| 298 |
if self.ends("ize"): pass |
|---|
| 299 |
else: return |
|---|
| 300 |
else: |
|---|
| 301 |
return |
|---|
| 302 |
if self.m() > 1: |
|---|
| 303 |
self.k = self.j |
|---|
| 304 |
|
|---|
| 305 |
def step5(self): |
|---|
| 306 |
"""step5() removes a final -e if m() > 1, and changes -ll to -l if |
|---|
| 307 |
m() > 1. |
|---|
| 308 |
""" |
|---|
| 309 |
self.j = self.k |
|---|
| 310 |
if self.b[self.k] == 'e': |
|---|
| 311 |
a = self.m() |
|---|
| 312 |
if a > 1 or (a == 1 and not self.cvc(self.k-1)): |
|---|
| 313 |
self.k = self.k - 1 |
|---|
| 314 |
if self.b[self.k] == 'l' and self.doublec(self.k) and self.m() > 1: |
|---|
| 315 |
self.k = self.k -1 |
|---|
| 316 |
|
|---|
| 317 |
def stem(self, p, i, j): |
|---|
| 318 |
"""In stem(p,i,j), p is a char pointer, and the string to be stemmed |
|---|
| 319 |
is from p[i] to p[j] inclusive. Typically i is zero and j is the |
|---|
| 320 |
offset to the last character of a string, (p[j+1] == '\0'). The |
|---|
| 321 |
stemmer adjusts the characters p[i] ... p[j] and returns the new |
|---|
| 322 |
end-point of the string, k. Stemming never increases word length, so |
|---|
| 323 |
i <= k <= j. To turn the stemmer into a module, declare 'stem' as |
|---|
| 324 |
extern, and delete the remainder of this file. |
|---|
| 325 |
""" |
|---|
| 326 |
# copy the parameters into statics |
|---|
| 327 |
self.b = p |
|---|
| 328 |
self.k = j |
|---|
| 329 |
self.k0 = i |
|---|
| 330 |
if self.k <= self.k0 + 1: |
|---|
| 331 |
return self.b # --DEPARTURE-- |
|---|
| 332 |
|
|---|
| 333 |
# With this line, strings of length 1 or 2 don't go through the |
|---|
| 334 |
# stemming process, although no mention is made of this in the |
|---|
| 335 |
# published algorithm. Remove the line to match the published |
|---|
| 336 |
# algorithm. |
|---|
| 337 |
|
|---|
| 338 |
self.step1ab() |
|---|
| 339 |
self.step1c() |
|---|
| 340 |
self.step2() |
|---|
| 341 |
self.step3() |
|---|
| 342 |
self.step4() |
|---|
| 343 |
self.step5() |
|---|
| 344 |
return self.b[self.k0:self.k+1] |
|---|
| 345 |
|
|---|
| 346 |
def __call__(self, word): |
|---|
| 347 |
return self.stem(word, 0, len(word) - 1) |
|---|
| 348 |
|
|---|
| 349 |
|
|---|
| 350 |
stemmer_factory = PluginFactory(PorterStemmer) |
|---|