~snowball-yiddish-dev/snowball-yiddish/trunk

« back to all changes in this revision

Viewing changes to pystemmer/src/Stemmer.pyx

  • Committer: richard
  • Date: 2011-08-09 15:48:33 UTC
  • Revision ID: svn-v4:633ccae0-01f4-0310-8c99-d3591da6f01f:trunk:547
* *: Patch from Peter Bouda, with some small tweaks, to support
  Python 3.x.  Tested with Python 3.2rc3.
* Incompatibility: in Python 2.x, algorithms() now returns Unicode
  strings instead of byte strings.
* Update to use latest libstemmer.
* Bump version number to 1.2.0

Show diffs side-by-side

added added

removed removed

Lines of Context:
27
27
# "epydoc" tool.  Invoke it by compiling this module and then running:
28
28
# "epydoc Stemmer.so".
29
29
 
 
30
cdef extern from *:
 
31
    ctypedef char** const_char_ptr_ptr "const char **"
 
32
    
30
33
cdef extern from "Python.h":
31
 
    object PyString_FromStringAndSize (char * s, int len)
 
34
    object PyUnicode_FromStringAndSize (char * s, int len)
32
35
 
33
36
cdef extern from "libstemmer.h":
34
37
    cdef struct sb_stemmer
35
38
    ctypedef unsigned char sb_symbol
36
39
 
37
 
    cdef char **      sb_stemmer_list()
 
40
    cdef const_char_ptr_ptr sb_stemmer_list()
38
41
    cdef sb_stemmer * sb_stemmer_new(char * algorithm, char * charenc)
39
42
    cdef void         sb_stemmer_delete(sb_stemmer * stemmer)
40
43
    cdef sb_symbol *  sb_stemmer_stem(sb_stemmer * stemmer, sb_symbol * word, int size)
57
60
    instead of the "porter" algorithm.
58
61
 
59
62
    """
60
 
    cdef char ** algs
 
63
    cdef const_char_ptr_ptr algs
61
64
    cdef int i
62
65
    py_algs = []
63
66
    algs = sb_stemmer_list()
64
67
    i = 0
65
68
    while algs[i] != NULL:
66
 
        py_algs.append(algs[i])
 
69
        alg = algs[i]
 
70
        alg = alg.decode(u"ascii")
 
71
        py_algs.append(alg)
67
72
        i = i + 1
68
73
    return py_algs
69
74
 
74
79
    individual stemming algorithm).
75
80
 
76
81
    """
77
 
    return '1.1.0'
 
82
    return '1.2.0'
78
83
 
79
84
cdef class Stemmer:
80
85
    """An instance of a stemming algorithm.
115
120
        See the class documentation for details.
116
121
 
117
122
        """
118
 
        self.cobj = sb_stemmer_new(algorithm, 'UTF_8')
 
123
        alg = algorithm.encode(u'ascii')
 
124
        self.cobj = sb_stemmer_new(alg, 'UTF_8')
119
125
        if self.cobj == NULL:
120
126
            raise KeyError("Stemming algorithm '%s' not found" % algorithm)
121
127
        self.max_cache_size = maxCacheSize
171
177
        was_unicode = 0
172
178
        if isinstance(word, unicode):
173
179
            was_unicode = 1
174
 
            word = word.encode('utf-8');
 
180
            word = word.encode(u'utf-8');
175
181
 
176
182
        if self.max_cache_size > 0:
177
183
            try:
183
189
                c_word = word
184
190
                c_word = <char*>sb_stemmer_stem(self.cobj, <sb_symbol*>c_word, len(word))
185
191
                length = sb_stemmer_length(self.cobj)
186
 
                result = PyString_FromStringAndSize (c_word, length)
 
192
                result = PyUnicode_FromStringAndSize (c_word, length)
187
193
                self.cache[word] = [result, self.counter]
188
194
                self.counter = self.counter + 1
189
195
                self.__purgeCache()
191
197
            c_word = word
192
198
            c_word = <char*>sb_stemmer_stem(self.cobj, <sb_symbol*>c_word, len(word))
193
199
            length = sb_stemmer_length(self.cobj)
194
 
            result = PyString_FromStringAndSize (c_word, length)
 
200
            result = PyUnicode_FromStringAndSize (c_word, length)
195
201
 
196
 
        if was_unicode:
197
 
            return result.decode('utf-8')
 
202
        if not was_unicode:
 
203
            return result.encode(u'utf-8')
198
204
        return result
199
205
 
200
206
    def stemWords (self, words):