import re, string
from csc.nl import NLTools, get_nl, get_wordlist, get_mapping


class lazy_property(object):
    def __init__(self, func):
        '''
        A lazy decorator. Runs a function only once to get a
        property's value; after that, the precomputed value is used.

        Replace expensive computations in __init__ with this.
        '''
        self.func = func
        self.__name__ = func.__name__
        self.__doc__ = func.__doc__
        self.__dict__.update(func.__dict__)

    def __get__(self, instance, cls):
        assert self.__name__ not in instance.__dict__
        result = instance.__dict__[self.__name__] = self.func(instance)
        return result
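
# Illustrative sketch (not part of the original module): how lazy_property is
# meant to be used. The class and attribute names below are made up.
def _lazy_property_example():
    class Corpus(object):
        @lazy_property
        def word_count(self):
            # Pretend this is expensive; it runs once, after which the result
            # is cached in the instance's __dict__ and reused.
            return sum(len(line.split()) for line in [u'a b', u'c d e'])

    corpus = Corpus()
    return corpus.word_count, corpus.word_count   # second access hits the cache
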

def preset(cls, name, val):
    cls.__dict__[name] = val


# For .all_concepts, only include concepts where we know more than this number of things.


class EuroNL(NLTools):
    '''
    A language that generally follows our assumptions about European languages:

    - Words are made of uppercase and lowercase letters, which are variant
      forms of each other, and apostrophes, which are kind of special.
    - Words are separated by spaces or punctuation.

    Only the subclasses of EuroNL -- :class:`StemmedEuroNL` and
    :class:`LemmatizedEuroNL` -- implement all of the NLTools operations.
    '''
    # TODO: Refactor this so that stemming languages and lemmatizing languages
    punctuation = ''.join(c for c in string.punctuation if c not in "'-")

    def __init__(self, lang, exceptions=None):
        if exceptions is None:
            exceptions = {}
        self.lang = lang
        self.exceptions = exceptions
        self.exceptions_rev = {}
        for key, value in exceptions.items():
            self.exceptions_rev[value] = key
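
    # Illustrative note: `exceptions` is assumed to map an irregular surface
    # form to the (lemma, residue) pair that word_split() should return for it,
    # e.g. {u'people': (u'person', u's')}; `exceptions_rev` is the reverse
    # lookup used by lemma_combine(). The example entry is indicative only.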

    @lazy_property
    def blacklist(self):
        return get_wordlist(self.lang, 'blacklist')

    @lazy_property
    def stopwords(self):
        return get_wordlist(self.lang, 'stop')

    @lazy_property
    def frequencies(self):
        from csc.nl.models import Frequency
        return set([x.text for x in
                    Frequency.objects.filter(language__id=self.lang)])

    @lazy_property
    def all_concepts(self):
        '''Set of all concept text strings (not model objects)'''
        from csc.conceptnet.models import Concept
        return set(Concept.objects.filter(language__id=self.lang, num_assertions__gt=CUTOFF).values_list('text', flat=True))

    @lazy_property
    def swapdict(self):
        return get_mapping(self.lang, 'swap4')

    @lazy_property
    def autocorrect(self):
        return get_mapping(self.lang, 'autocorrect')

    def tokenize(self, text):
        r"""
        Tokenizing a sentence inserts spaces in such a way that it separates
        punctuation from words, splits up contractions, and generally does what
        a lot of natural language tools (especially parsers) expect their
        input to look like.

        >>> en_nl.tokenize("Time is an illusion. Lunchtime, doubly so.")
        'Time is an illusion . Lunchtime , doubly so .'
        >>> untok = '''
        ... "Very deep," said Arthur, "you should send that in to the
        ... Reader's Digest. They've got a page for people like you."
        ... '''
        >>> tok = en_nl.tokenize(untok)
        >>> tok
        "`` Very deep , '' said Arthur , `` you should send that in to the Reader 's Digest . They 've got a page for people like you . ''"
        >>> en_nl.untokenize(tok)
        '"Very deep," said Arthur, "you should send that in to the Reader\'s Digest. They\'ve got a page for people like you."'
        >>> en_nl.untokenize(tok) == untok.replace('\n', ' ').strip()
        True
        """
        # Steps 0-1: flatten line breaks, split contractions ("don't" -> "do n't"),
        # and turn opening single quotes into backquotes.
        step0 = text.replace('\r', '').replace('\n', ' ')
        step1 = step0.replace(" '", " ` ").replace("'", " '").replace("n 't",
            " n't").replace("cannot", "can not")
        # Step 2: rewrite double-quoted spans in Penn Treebank style (`` ... '').
        step2 = re.sub('"([^"]*)"', r" `` \1 '' ", step1)
        # Steps 3-4: detach punctuation that trails a word, mid-sentence or final.
        step3 = re.sub(r'([.,:;?!%]+) ', r" \1 ", step2)
        step4 = re.sub(r'([.,:;?!%]+)$', r" \1", step3)
        # Step 5: detach parentheses, then collapse runs of spaces.
        step5 = re.sub(r'([()])', r" \1 ", step4)
        return re.sub(r' +', ' ', step5).strip()

    def untokenize(self, text):
        '''
        Untokenizing a text undoes the tokenizing operation, restoring
        punctuation and spaces to the places that people expect them to be.

        Ideally, `untokenize(tokenize(text))` should be identical to `text`,
        except for line breaks.
        '''
        # Undo the steps of tokenize() in roughly reverse order.
        step1 = text.replace("`` ", '"').replace(" ''", '"')
        step2 = step1.replace(" ( ", " (").replace(" ) ", ") ")
        step3 = re.sub(r' ([.,:;?!%]+)([ \'"`])', r"\1\2", step2)
        step4 = re.sub(r' ([.,:;?!%]+)$', r"\1", step3)
        step5 = step4.replace(" '", "'").replace(" n't", "n't").replace(
            "can not", "cannot")
        step6 = step5.replace(" ` ", " '")
        return step6.strip()

    def canonicalize(self, word):
        '''
        Reduce equivalent characters to a canonical form.

        In a EuroNL, by default, this puts those characters in lowercase.
        '''
        return word.lower()
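
    # Illustrative sketch (assumed behaviour, not a verified doctest): since
    # canonicalize() lowercases, e.g.
    #
    #     en_nl.canonicalize('The')   # -> 'the'
    #
    # is_stopword('THE') below gives the same answer as is_stopword('the').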

    def is_stopword(self, word):
        '''
        A *stopword* is a word that contributes little to the semantic meaning
        of a text and should be ignored. These tend to be short, common words
        such as "of", "the", and "you".

        Stopwords are often members of closed classes such as articles and
        prepositions.

        Whether a word is a stopword or not is a judgement call that depends on
        the application. In ConceptNet, we began with the stock lists of
        stopwords from NLTK, but we have refined and tweaked the lists
        (especially in English) over the years.

        >>> en_nl = get_nl('en')
        >>> en_nl.is_stopword('the')
        True
        >>> en_nl.is_stopword('THE')
        True
        >>> en_nl.is_stopword('defenestrate')
        False

        >>> pt_nl = get_nl('pt') # This time, in Portuguese
        >>> pt_nl.is_stopword('os')
        True
        >>> pt_nl.is_stopword('the')
        False
        '''
        return self.canonicalize(word) in self.stopwords

    def is_blacklisted(self, text):
        '''
        The blacklist is used to discover and discard particularly unhelpful
        phrases.

        A phrase is considered "blacklisted" if *every* word in it appears on
        the blacklist. The empty string is always blacklisted.

        >>> en_nl.is_blacklisted('x')
        True
        >>> en_nl.is_blacklisted('the')
        False
        >>> en_nl.is_blacklisted('a b c d')
        True
        >>> en_nl.is_blacklisted('a b c d puppies')
        False
        '''
        if not isinstance(text, unicode): text = text.decode('utf-8')
        words = self.tokenize(text).split(' ')
        for word in words:
            if self.canonicalize(word) not in self.blacklist: return False
        return True

    def is_frequency(self, word):
        '''
        Return whether this word represents a frequency.

        >>> en_nl = get_nl('en')
        >>> en_nl.is_frequency('sometimes')
        True
        >>> en_nl.is_frequency('somewhere')
        False

        >>> es_nl = get_nl('es') # This time, in Spanish
        >>> es_nl.is_frequency('nunca')
        True
        >>> es_nl.is_frequency('never')
        False
        '''
        return self.canonicalize(word) in self.frequencies

    def get_frequency(self, text):
        '''
        If the text contains a frequency, return it. The first frequency that
        occurs takes precedence, if there are multiple.

        >>> en_nl.get_frequency('Never trust a skinny chef.')
        u'never'
        >>> en_nl.get_frequency('This statement is true.')
        >>> en_nl.get_frequency('This statement is not always true.')
        u'not'
        '''
        if not isinstance(text, unicode): text = text.decode('utf-8')
        words = self.tokenize(text).split(' ')
        for word in words:
            if self.canonicalize(word) in self.frequencies:
                return self.canonicalize(word)
        return None

    def get_words(self, text, strip_stopwords=False):
        '''
        Given a sentence, split it into words, stripping punctuation etc.
        '''
        text = self.tokenize(text)
        punct = self.punctuation
        words = text.replace('/', ' ').split()
        words = (w.strip(punct).lower() for w in words)
        words = (self.autocorrect.get(word, word) for word in words if word)
        if strip_stopwords:
            words = (word for word in words if not self.is_stopword(word))
        return list(words)
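
    # Illustrative sketch (outputs are indicative, not verified): get_words()
    # is the lightweight word extractor, without lemmatizing or stemming.
    #
    #     en_nl.get_words('Dogs chase cats.')
    #     # -> roughly ['dogs', 'chase', 'cats']
    #     en_nl.get_words('Dogs chase cats.', strip_stopwords=True)
    #     # -> the same here, since none of these words are stopwords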

    def get_windows(self, words, window_size=2, join_words=True):
        '''
        Extract windows from the list of words.

        >>> en_nl.get_windows(['sit', 'on', 'couches'], window_size=1)
        ['sit', 'on', 'couches']
        >>> en_nl.get_windows(['sit', 'on', 'couches'], window_size=2)
        ['sit on', 'sit', 'on couches', 'on', 'couches']
        >>> en_nl.get_windows(['sit', 'on', 'couches'], window_size=3)
        ['sit on couches', 'sit on', 'sit', 'on couches', 'on', 'couches']
        >>> en_nl.get_windows(['sit', 'on', 'couches'], window_size=2, join_words=False)
        [['sit', 'on'], ['sit'], ['on', 'couches'], ['on'], ['couches']]
        '''
        nwords = len(words)
        # Every window of up to window_size words, longest first at each
        # starting position.
        windows = (words[i:i+wsize]
                   for i in xrange(nwords)
                   for wsize in xrange(min(window_size, nwords-i), 0, -1))
        if join_words:
            return [' '.join(window) for window in windows]
        else:
            return list(windows)

    def extract_concepts(self, text, max_words=2, check_conceptnet=False, also_allow=[]):
        '''
        Extract a list of the concepts that are directly present in ``text``.

        ``max_words`` specifies the maximum number of words in the concept.

        If ``check_conceptnet`` is True, only concepts that are in
        ConceptNet for this language will be returned. ``also_allow``
        is a list or set of concepts that are additionally allowed.

        >>> en_nl.extract_concepts('People can be eating glimlings.', max_words=1, check_conceptnet=False)
        [u'person', u'eat', u'glimling']
        >>> en_nl.extract_concepts('People can be eating glimlings.', max_words=1, check_conceptnet=True)
        [u'person', u'eat']
        >>> en_nl.extract_concepts('People can be eating rice.', max_words=2, check_conceptnet=True)
        [u'person eat', u'person', u'eat rice', u'eat', u'rice']
        '''
        words = self.normalize(text).split()
        windows = self.get_windows(words, window_size=max_words)
        if not check_conceptnet:
            return windows
        return [concept for concept in windows
                if concept in self.all_concepts
                or concept in also_allow]
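
    # Illustrative sketch (assumed, not a verified doctest): ``also_allow``
    # lets callers whitelist concepts that are not in ConceptNet, e.g.
    #
    #     en_nl.extract_concepts('People can be eating glimlings.', max_words=1,
    #                            check_conceptnet=True, also_allow=[u'glimling'])
    #     # -> [u'person', u'eat', u'glimling']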


class LemmatizedEuroNL(EuroNL):
    @property
    def lemmatizer(self):
        '''
        The `.lemmatizer` property lazily loads an MBLEM lemmatizer from the
        disk. The resulting object is an instance of
        :class:`csc.nl.mblem.trie.Trie`.
        '''
        if not hasattr(self, '_lemmatizer'):
            from csc.nl.mblem import get_mblem
            self._lemmatizer = get_mblem(self.lang)
        return self._lemmatizer

    @property
    def unlemmatizer(self):
        '''
        The `.unlemmatizer` property lazily loads an MBLEM unlemmatizer from
        the disk. The resulting object is a dictionary of tries, one for each
        possible combination of part-of-speech and inflection that can be
        added to a word.
        '''
        if not hasattr(self, '_unlemmatizer'):
            from csc.nl.mblem import get_unlem
            self._unlemmatizer = get_unlem(self.lang)
        return self._unlemmatizer

    def word_split(self, word):
        '''
        Divide a single word into a string representing its *lemma form* (its
        base form without inflections), and a second string representing the
        inflections that were removed.

        Instead of abstract symbols for the inflection, we currently represent
        inflections as their most common natural language string. For example,
        the inflection string 's' represents both "plural" and "third-person
        singular".

        This odd representation basically makes the assumption that, when two
        inflections look the same, they will act the same on any word. Thus, we
        can avoid trying to disambiguate different inflections when they will
        never make a difference. (There are cases where this is not technically
        correct, such as "leafs/leaves" in "there were leaves on the ground"
        versus "he leafs through the pages", but we don't lose sleep over it.)

        >>> en_nl.word_split(u'lemmatizing')
        (u'lemmatize', u'ing')
        >>> en_nl.word_split(u'cow')
        (u'cow', u'')
        >>> en_nl.word_split(u'went')
        (u'go', u'ed')
        >>> en_nl.word_split(u'people')
        (u'person', u's')
        '''
        if word in self.exceptions:
            return self.exceptions[word]

        lemma, pos, infl = self.lemmatizer.mblem(word)[0]
        residue = self.unlemmatizer[pos, infl].leaves()[0].add
        return (lemma, residue)

    def lemma_split(self, text, keep_stopwords=False):
        '''
        When you *lemma split* or *lemma factor* a string, you get two strings:

        1. The *normal form*, a string containing all the lemmas of the
           non-stopwords in the string.
        2. The *residue*, a string containing all the stopwords and the
           inflections that were removed.

        These two strings can be recombined with :meth:`lemma_combine`.

        >>> en_nl.lemma_split("This is the testiest test that ever was tested")
        (u'testy test ever test', u'this is the 1iest 2 that 3 was 4ed')
        '''
        if not isinstance(text, unicode): text = text.decode('utf-8')
        text = self.tokenize(text)
        punct = string.punctuation.replace("'", "").replace('-', '')
        words = text.replace('/', ' ').split()
        words = [w.strip(punct).lower() for w in words]
        words = [self.autocorrect.get(word, word) for word in words if word]
        lemma_tuples = [self.word_split(word) for word in words]

        lemmas_pre = []
        residue_pre = []
        lemma_index = 0
        for i in range(len(words)):
            if not keep_stopwords and words[i] in self.stopwords:
                residue_pre.append((None, words[i]))
            else:
                lemmas_pre.append((lemma_tuples[i][0], lemma_index))
                residue_pre.append((lemma_index, lemma_tuples[i][1]))
                lemma_index += 1

        # Work out, for each residue entry, which lemma (1-based) it refers to.
        permute = [l[1] for l in lemmas_pre]
        invpermute = [permute.index(i) for i in range(len(permute))]
        lemmas = [l[0] for l in lemmas_pre]
        lemmas = [self.swapdict.get(lemma, lemma) for lemma in lemmas]

        residue = []
        for lemma_index, ltext in residue_pre:
            if lemma_index is None: residue.append(ltext)
            else: residue.append(str(invpermute[lemma_index]+1) + ltext)
        if len(lemmas) == 0 and not keep_stopwords:
            return self.lemma_split(text, keep_stopwords=True)
        return (u' '.join(lemmas), u' '.join(residue))
    lemma_factor = lemma_split

    def normalize(self, text):
        '''
        When you *normalize* a string (no relation to the operation of
        normalizing a vector), you remove its stopwords and inflections so that
        it becomes equivalent to similar strings.

        Normalizing involves running :meth:`lemma_split` and keeping only the
        first factor, thus discarding the information that would be used to
        reconstruct the full string.

        >>> en_nl.normalize("This is the testiest test that ever was tested")
        u'testy test ever test'
        '''
        return self.lemma_split(text)[0]
    normalize4 = normalize

    def lemma_combine(self, lemmas, residue):
        '''
        This is the inverse of :meth:`lemma_factor` -- it takes in a normal
        form and a residue, and re-assembles them into a phrase that is
        hopefully comprehensible.

        >>> en_nl.lemma_combine(u'testy test ever test',
        ... u'this is the 1iest 2 that 3 was 4ed')
        u'this is the testiest test that ever was tested'
        >>> en_nl.lemma_combine(u'person', u'1s')
        u'people'
        '''
        words = []
        lemmas = lemmas.split(' ')
        for res in residue.split(' '):
            if res and res[0] in '0123456789':
                # A numbered residue entry names a lemma (1-based) plus the
                # inflection to re-attach to it.
                numstr, pos, infl = self.lemmatizer.mblem(res)[0]
                while numstr[-1] not in '0123456789': numstr = numstr[:-1]
                rest = res[len(numstr):]
                num = int(numstr)
                lemma = lemmas[num-1]
                if (lemma, rest) in self.exceptions_rev:
                    words.append(self.exceptions_rev[(lemma, rest)])
                else:
                    inflected = self.unlemmatizer[pos, infl].unlem(lemma)[0]
                    words.append(inflected)
            else:
                # Stopwords in the residue are re-inserted verbatim.
                words.append(res)
        return self.untokenize(' '.join(words))


class StemmedEuroNL(EuroNL):
    @property
    def stemmer(self):
        if not hasattr(self, '_stemmer'):
            from Stemmer import Stemmer
            self._stemmer = Stemmer(self.lang)
        return self._stemmer

    def stem_word(self, word):
        return self.stemmer.stemWord(word)

    def word_split(self, word):
        stem = self.stem_word(word)
        residue = word[len(stem):]
        return (stem, residue)
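
    # Illustrative sketch (language and outputs assumed): with a Snowball
    # stemmer, word_split() just peels off whatever the stemmer removed, so
    # the residue is a literal suffix rather than a normalized inflection:
    #
    #     nl = get_nl('pt')          # any stemmed language
    #     nl.word_split(u'gatos')    # -> roughly (u'gat', u'os')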

    def is_stopword(self, word):
        return word in self.stopwords

    def normalize(self, text):
        if not isinstance(text, unicode): text = text.decode('utf-8')
        punct = string.punctuation.replace("'", "")
        words = text.replace('/', ' ').replace('-', ' ').split()
        words = [w.strip(punct).lower() for w in words]
        words = [w for w in words if not self.is_stopword(w)]
        words = [self.stem_word(w) for w in words]
        return u" ".join(words)
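
# Illustrative sketch (not part of the original module): what normalization
# looks like for a stemmed language. The language code and the exact output
# depend on the installed wordlists and stemmer, so treat this as indicative.
def _stemmed_normalize_example():
    pt_nl = get_nl('pt')
    # Stopwords are dropped and each remaining word is reduced to its stem.
    return pt_nl.normalize(u'os gatos pretos')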