~kenneth-arnold/luminoso/divisi2-port

« back to all changes in this revision

Viewing changes to luminoso/lib/standalone_nlp/euro.py

Committer: Robert Speer
Date: 2010-02-24 21:03:39 UTC
Revision ID: rspeer@new-caledonia.media.mit.edu-20100224210339-3fha4qwfh0lkfp1i

making the mac build work better

files added:
DMGSkeleton

DMGSkeleton/.DS_Store

DMGSkeleton/AnalogySpace

DMGSkeleton/AnalogySpace/Canonical

DMGSkeleton/AnalogySpace/Documents

DMGSkeleton/AnalogySpace/Matrices

DMGSkeleton/AnalogySpace/Matrices/conceptnet.pickle

DMGSkeleton/AnalogySpace/Results

DMGSkeleton/AnalogySpace/settings.json

DMGSkeleton/Luminoso documentation.webloc

DMGSkeleton/ThaiFoodStudy

DMGSkeleton/ThaiFoodStudy/Canonical

DMGSkeleton/ThaiFoodStudy/Canonical/canonical_chinese.txt

DMGSkeleton/ThaiFoodStudy/Canonical/canonical_thai.txt

DMGSkeleton/ThaiFoodStudy/Canonical/good_review.txt

DMGSkeleton/ThaiFoodStudy/Documents

DMGSkeleton/ThaiFoodStudy/Documents/DianeS.txt

DMGSkeleton/ThaiFoodStudy/Documents/alicez.txt

DMGSkeleton/ThaiFoodStudy/Documents/annmarief.txt

DMGSkeleton/ThaiFoodStudy/Documents/catl.txt

DMGSkeleton/ThaiFoodStudy/Documents/christiang.txt

DMGSkeleton/ThaiFoodStudy/Documents/danc.txt

DMGSkeleton/ThaiFoodStudy/Documents/daveh.txt

DMGSkeleton/ThaiFoodStudy/Documents/ejc.txt

DMGSkeleton/ThaiFoodStudy/Documents/frankl.txt

DMGSkeleton/ThaiFoodStudy/Documents/georget.txt

DMGSkeleton/ThaiFoodStudy/Documents/geraldinek.txt

DMGSkeleton/ThaiFoodStudy/Documents/id.txt

DMGSkeleton/ThaiFoodStudy/Documents/jessicar.txt

DMGSkeleton/ThaiFoodStudy/Documents/johannaw.txt

DMGSkeleton/ThaiFoodStudy/Documents/js.txt

DMGSkeleton/ThaiFoodStudy/Documents/kellyz.txt

DMGSkeleton/ThaiFoodStudy/Documents/kevint.txt

DMGSkeleton/ThaiFoodStudy/Documents/laurenk.txt

DMGSkeleton/ThaiFoodStudy/Documents/manifreds.txt

DMGSkeleton/ThaiFoodStudy/Documents/melissaw.txt

DMGSkeleton/ThaiFoodStudy/Documents/richardr.txt

DMGSkeleton/ThaiFoodStudy/Documents/ronu.txt

DMGSkeleton/ThaiFoodStudy/Documents/ruthp.txt

DMGSkeleton/ThaiFoodStudy/Documents/sandrac.txt

DMGSkeleton/ThaiFoodStudy/Documents/shannond.txt

DMGSkeleton/ThaiFoodStudy/Documents/sos.txt

DMGSkeleton/ThaiFoodStudy/Documents/thomasc.txt

DMGSkeleton/ThaiFoodStudy/Documents/timg.txt

DMGSkeleton/ThaiFoodStudy/Documents/tonys.txt

DMGSkeleton/ThaiFoodStudy/Documents/tracyb.txt

DMGSkeleton/ThaiFoodStudy/Documents/yanz.txt

DMGSkeleton/ThaiFoodStudy/Matrices

DMGSkeleton/ThaiFoodStudy/Results

DMGSkeleton/ThaiFoodStudy/settings.json

luminoso/lib/standalone_nlp

luminoso/lib/standalone_nlp/__init__.py

luminoso/lib/standalone_nlp/euro.py

luminoso/lib/standalone_nlp/lang_en.py

luminoso/lib/standalone_nlp/local_unpickle.py

luminoso/lib/standalone_nlp/make_standalone.py

luminoso/lib/standalone_nlp/trie.py

files modified:
ThaiFoodStudy/settings.json

luminoso/study.py

mac_build.sh

requirements.txt

setup.py

Show diffs side-by-side

added added

removed removed

luminoso/lib/standalone_nlp/euro.py

import string

from csc.nl import NLTools, get_nl, get_wordlist, get_mapping

import re

def doctest_globals():
    """Build the namespace used by this module's doctests.

    Returns a dict with one entry, ``en_nl``, bound to ``get_nl('en')``.
    """
    return {'en_nl': get_nl('en')}

class lazy_property(object):
    '''
    A lazy decorator. Runs a function only once to get a
    property's value; after that, the precomputed value is used.

    Replace expensive computations in __init__ with this.

    This is a non-data descriptor: the first access stores the computed
    value in the instance ``__dict__`` under the function's name, so later
    lookups find the stored value and never reach the descriptor again.
    '''

    def __init__(self, func):
        # Keep the wrapped function and mirror its metadata so the
        # descriptor looks like the function it replaces.
        self.func = func
        self.__name__ = func.__name__
        self.__doc__ = func.__doc__
        self.__dict__.update(func.__dict__)

    def __get__(self, instance, cls):
        # Accessed on the class itself (e.g. by introspection tools): there
        # is no instance to compute for, so hand back the descriptor.
        if instance is None:
            return self
        assert self.__name__ not in instance.__dict__
        result = instance.__dict__[self.__name__] = self.func(instance)
        return result

    @staticmethod
    def preset(cls, name, val):
        """Store ``val`` as attribute ``name`` on ``cls``.

        Uses ``setattr`` because the ``__dict__`` of a new-style class is a
        read-only proxy that rejects item assignment.
        """
        setattr(cls, name, val)

# Threshold used by EuroNL.all_concepts: a concept is included only when its
# num_assertions is strictly greater than this number.
CUTOFF = 1

class EuroNL(NLTools):

"""

A language that generally follows our assumptions about European languages,

including:

- Words are made of uppercase and lowercase letters, which are variant

forms of each other, and apostrophes, which are kind of special.

- Words are separated by spaces or punctuation.

Only the subclasses of EuroNL -- :class:`StemmedEuroNL` and

:class:`LemmatizedEuroNL` -- implement all of the NLTools operations.

"""

# TODO: Refactor this so that stemming languages and lemmatizing languages

# aren't mixed up.

punctuation = ''.join(c for c in string.punctuation

if c not in "'-`")

def __init__(self, lang, exceptions=None):
    """Remember the language code and the exception table.

    ``exceptions`` is an optional mapping; when omitted an empty dict is
    used. ``exceptions_rev`` holds the inverse mapping (value -> key).
    """
    self.lang = lang
    self.exceptions = {} if exceptions is None else exceptions
    self.exceptions_rev = dict(
        (mapped, original) for original, mapped in self.exceptions.items())

@lazy_property
def blacklist(self):
    """The 'blacklist' word list for this language, from ``get_wordlist``."""
    wordlist = get_wordlist(self.lang, 'blacklist')
    return wordlist

@lazy_property
def stopwords(self):
    """The 'stop' word list for this language, from ``get_wordlist``."""
    wordlist = get_wordlist(self.lang, 'stop')
    return wordlist

@lazy_property
def frequencies(self):
    """Set of the ``text`` of every Frequency record for this language."""
    from csc.nl.models import Frequency
    records = Frequency.objects.filter(language__id=self.lang)
    return set(record.text for record in records)

@lazy_property
def all_concepts(self):
    '''Set of all concept text strings (not model objects).

    Only concepts of this language whose ``num_assertions`` exceeds
    ``CUTOFF`` are included.
    '''
    from csc.conceptnet.models import Concept
    known = Concept.objects.filter(language__id=self.lang,
                                   num_assertions__gt=CUTOFF)
    return set(known.values_list('text', flat=True))

@lazy_property
def swapdict(self):
    """The 'swap4' mapping for this language, from ``get_mapping``."""
    mapping = get_mapping(self.lang, 'swap4')
    return mapping

@lazy_property
def autocorrect(self):
    """The 'autocorrect' mapping for this language, from ``get_mapping``."""
    mapping = get_mapping(self.lang, 'autocorrect')
    return mapping

def tokenize(self, text):
    r"""
    Insert spaces so that punctuation and contractions become separate,
    space-delimited tokens -- the form that many natural language tools
    (especially parsers) expect as input.

    Carriage returns are dropped and newlines become spaces. Double-quoted
    spans are rewritten with `` and '' quote tokens, and "cannot" becomes
    "can not". Runs of spaces are collapsed and the result is stripped.

    >>> en_nl.tokenize("Time is an illusion. Lunchtime, doubly so.")
    'Time is an illusion . Lunchtime , doubly so .'
    >>> untok = '''
    ... "Very deep," said Arthur, "you should send that in to the
    ... Reader's Digest. They've got a page for people like you."
    ... '''
    >>> tok = en_nl.tokenize(untok)
    >>> tok
    "`` Very deep , '' said Arthur , `` you should send that in to the Reader 's Digest . They 've got a page for people like you . ''"
    >>> en_nl.untokenize(tok)
    '"Very deep," said Arthur, "you should send that in to the Reader\'s Digest. They\'ve got a page for people like you."'
    >>> en_nl.untokenize(tok) == untok.replace('\n', ' ').strip()
    True
    """
    spaced = text.replace('\r', '').replace('\n', ' ')

    # Plain substitutions; the order matters because each one sees the
    # output of the previous one.
    literal_rules = (
        (" '", " ` "),
        ("'", " '"),
        ("n 't", " n't"),
        ("cannot", "can not"),
    )
    for old, new in literal_rules:
        spaced = spaced.replace(old, new)

    # Regex passes: quotes, punctuation before a space, punctuation at the
    # end of the text, parentheses, and finally collapsing runs of spaces.
    regex_rules = (
        ('"([^"]*)"', r" `` \1 '' "),
        (r'([.,:;?!%]+) ', r" \1 "),
        (r'([.,:;?!%]+)$', r" \1"),
        (r'([()])', r" \1 "),
        (r' +', ' '),
    )
    for pattern, replacement in regex_rules:
        spaced = re.sub(pattern, replacement, spaced)
    return spaced.strip()

119

120

def untokenize(self, text):
    """
    Undo the tokenizing operation, putting punctuation and spaces back
    where people expect them to be.

    Ideally, `untokenize(tokenize(text))` should be identical to `text`,
    except for line breaks.
    """
    joined = text.replace("`` ", '"')
    joined = joined.replace(" ''", '"')
    joined = joined.replace(" ( ", " (")
    joined = joined.replace(" ) ", ") ")
    # Re-attach punctuation that is followed by a space or quote character,
    # then punctuation at the very end of the text.
    joined = re.sub(r' ([.,:;?!%]+)([ \'"`])', r"\1\2", joined)
    joined = re.sub(r' ([.,:;?!%]+)$', r"\1", joined)
    # Close up contractions and restore "cannot".
    joined = joined.replace(" '", "'")
    joined = joined.replace(" n't", "n't")
    joined = joined.replace("can not", "cannot")
    joined = joined.replace(" ` ", " '")
    return joined.strip()

136

137

def canonicalize(self, word):
    """
    Reduce equivalent characters to a canonical form.

    In a EuroNL, by default, this means lowercasing the word.
    """
    lowered = word.lower()
    return lowered

144

145

def is_stopword(self, word):
    """
    Tell whether ``word``, once canonicalized, is in this language's
    stopword list.

    A *stopword* is a word that contributes little to the semantic meaning
    of a text and should be ignored. These tend to be short, common words
    such as "of", "the", and "you", and are often members of closed classes
    such as articles and prepositions.

    Whether a word is a stopword or not is a judgement call that depends on
    the application. In ConceptNet, we began with the stock lists of
    stopwords from NLTK, but we have refined and tweaked the lists
    (especially in English) over the years.

    Examples::

        >>> en_nl.is_stopword('the')
        True
        >>> en_nl.is_stopword('THE')
        True
        >>> en_nl.is_stopword('defenestrate')
        False

        >>> pt_nl = get_nl('pt')   # This time, in Portuguese
        >>> pt_nl.is_stopword('os')
        True
        >>> pt_nl.is_stopword('the')
        False
    """
    canonical = self.canonicalize(word)
    return canonical in self.stopwords

175

176

def is_blacklisted(self, text):
    """
    The blacklist is used to discover and discard particularly unhelpful
    phrases.

    A phrase is considered "blacklisted" if *every* word in it appears on
    the blacklist. The empty string is always blacklisted.

    Byte strings are decoded as UTF-8 before being tokenized.

    >>> en_nl.is_blacklisted('x')
    True
    >>> en_nl.is_blacklisted('the')
    False
    >>> en_nl.is_blacklisted('a b c d')
    True
    >>> en_nl.is_blacklisted('a b c d puppies')
    False
    >>> en_nl.is_blacklisted('')
    True
    """
    if not isinstance(text, unicode):
        text = text.decode('utf-8')
    blacklist = self.blacklist
    # split() with no argument yields no words for empty text, which makes
    # the empty string blacklisted as documented; split(' ') would yield
    # [''] and depend on '' being on the blacklist.
    return all(self.canonicalize(word) in blacklist
               for word in self.tokenize(text).split())

199

200

def is_frequency(self, word):
    """
    Return whether this word, once canonicalized, represents a frequency.

    >>> en_nl = get_nl('en')
    >>> en_nl.is_frequency('sometimes')
    True
    >>> en_nl.is_frequency('somewhere')
    False

    >>> es_nl = get_nl('es')   # This time, in Spanish
    >>> es_nl.is_frequency('nunca')
    True
    >>> es_nl.is_frequency('never')
    False
    """
    canonical = self.canonicalize(word)
    return canonical in self.frequencies

218

219

def get_frequency(self, text):
    """
    If the text contains a frequency, return it in canonical form;
    otherwise return None. When there are several, the first one that
    occurs takes precedence.

    Byte strings are decoded as UTF-8 before being tokenized.

    >>> en_nl.get_frequency('Never trust a skinny chef.')
    u'never'
    >>> en_nl.get_frequency('This statement is true.')
    >>> en_nl.get_frequency('This statement is not always true.')
    u'not'
    """
    if not isinstance(text, unicode):
        text = text.decode('utf-8')
    for token in self.tokenize(text).split(' '):
        canonical = self.canonicalize(token)
        if canonical in self.frequencies:
            return canonical
    return None

237

238

def get_words(self, text, strip_stopwords=False):
    '''
    Given a sentence, split it into a list of lowercased words.

    The text is tokenized, slashes are treated as word separators, and the
    characters in ``self.punctuation`` are stripped from both ends of each
    word. Words that end up empty are dropped; the rest are passed through
    the ``autocorrect`` mapping. With ``strip_stopwords`` set, words for
    which :meth:`is_stopword` is true are removed as well.
    '''
    strippable = self.punctuation
    cleaned = []
    for token in self.tokenize(text).replace('/', ' ').split():
        bare = token.strip(strippable).lower()
        if bare:
            cleaned.append(self.autocorrect.get(bare, bare))
    if strip_stopwords:
        cleaned = [word for word in cleaned if not self.is_stopword(word)]
    return cleaned

250

251

def get_windows(self, words, window_size=2, join_words=True):
    """
    Extract windows from the list of words.

    For each starting position, windows are produced from the longest
    (at most ``window_size`` words) down to a single word. With
    ``join_words`` the windows are space-joined strings, otherwise they
    are slices of ``words``.

    >>> en_nl.get_windows(['sit', 'on', 'couches'], window_size=1)
    ['sit', 'on', 'couches']
    >>> en_nl.get_windows(['sit', 'on', 'couches'], window_size=2)
    ['sit on', 'sit', 'on couches', 'on', 'couches']
    >>> en_nl.get_windows(['sit', 'on', 'couches'], window_size=3)
    ['sit on couches', 'sit on', 'sit', 'on couches', 'on', 'couches']
    >>> en_nl.get_windows(['sit', 'on', 'couches'], window_size=2, join_words=False)
    [['sit', 'on'], ['sit'], ['on', 'couches'], ['on'], ['couches']]
    """
    total = len(words)
    spans = []
    for start in xrange(total):
        # Never run past the end of the word list.
        longest = min(window_size, total - start)
        for width in xrange(longest, 0, -1):
            spans.append(words[start:start + width])
    if not join_words:
        return spans
    return [' '.join(span) for span in spans]

272

273

def extract_concepts(self, text, max_words=2, check_conceptnet=False, also_allow=()):
    """
    Extract a list of the concepts that are directly present in ``text``.

    The text is normalized, split into words, and turned into windows of
    up to ``max_words`` words (see :meth:`get_windows`).

    ``max_words`` specifies the maximum number of words in the concept.

    If ``check_conceptnet`` is True, only concepts that are in
    ConceptNet for this language will be returned. ``also_allow``
    is a list or set of concepts that are additionally allowed; it
    defaults to an empty, immutable tuple and is only read, never modified.

    >>> en_nl.extract_concepts('People can be eating glimlings.', max_words=1, check_conceptnet=False)
    [u'person', u'eat', u'glimling']
    >>> en_nl.extract_concepts('People can be eating glimlings.', max_words=1, check_conceptnet=True)
    [u'person', u'eat']
    >>> en_nl.extract_concepts('People can be eating rice.', max_words=2, check_conceptnet=True)
    [u'person eat', u'person', u'eat rice', u'eat', u'rice']
    """
    words = self.normalize(text).split()
    windows = self.get_windows(words, window_size=max_words)
    if not check_conceptnet:
        return windows
    known = self.all_concepts
    return [concept for concept in windows
            if concept in known or concept in also_allow]

298

299

300

class LemmatizedEuroNL(EuroNL):

301

@property
def lemmatizer(self):
    """
    The `.lemmatizer` property lazily loads an MBLEM lemmatizer from the
    disk. The resulting object is an instance of
    :class:`csc.nl.mblem.trie.Trie`.

    The loaded object is cached on the instance as ``_lemmatizer``.
    """
    if hasattr(self, '_lemmatizer'):
        return self._lemmatizer
    from csc.nl.mblem import get_mblem
    self._lemmatizer = get_mblem(self.lang)
    return self._lemmatizer

312

313

@property
def unlemmatizer(self):
    """
    The `.unlemmatizer` property lazily loads an MBLEM unlemmatizer from
    the disk. The resulting object is a dictionary of tries, one for each
    possible combination of part-of-speech and inflection that can be
    added.

    The loaded object is cached on the instance as ``_unlemmatizer``.
    """
    if hasattr(self, '_unlemmatizer'):
        return self._unlemmatizer
    from csc.nl.mblem import get_unlem
    self._unlemmatizer = get_unlem(self.lang)
    return self._unlemmatizer

325

326

def word_split(self, word):
    """
    Divide a single word into a string representing its *lemma form* (its
    base form without inflections), and a second string representing the
    inflections that were removed.

    Words listed in ``self.exceptions`` return their stored entry. If an
    IndexError is raised while looking the word up, the word is returned
    unchanged with an empty inflection.

    Instead of abstract symbols for the inflection, we currently represent
    inflections as their most common natural language string. For example,
    the inflection string 's' represents both "plural" and "third-person
    singular".

    This odd representation basically makes the assumption that, when two
    inflections look the same, they will act the same on any word. Thus, we
    can avoid trying to disambiguate different inflections when they will
    never make a difference. (There are cases where this is not technically
    correct, such as "leafs/leaves" in "there were leaves on the ground"
    versus "he leafs through the pages", but we don't lose sleep over it.)

    >>> en_nl.word_split(u'lemmatizing')
    (u'lemmatize', u'ing')
    >>> en_nl.word_split(u'cow')
    (u'cow', u'')
    >>> en_nl.word_split(u'went')
    (u'go', u'ed')
    >>> en_nl.word_split(u'people')
    (u'person', u's')
    """
    if word in self.exceptions:
        return self.exceptions[word]
    try:
        first_analysis = self.lemmatizer.mblem(word)[0]
        lemma, pos, infl = first_analysis
        suffix = self.unlemmatizer[pos, infl].leaves()[0].add
    except IndexError:
        # Either lookup came back empty: leave the word as it is.
        return (word, u'')
    return (lemma, suffix)

361

362

def lemma_split(self, text, keep_stopwords=False):
    """
    When you *lemma split* or *lemma factor* a string, you get two strings
    back:

    1. The *normal form*, a string containing all the lemmas of the
       non-stopwords in the string.
    2. The *residue*, a string containing all the stopwords and the
       inflections that were removed.

    These two strings can be recombined with :meth:`lemma_combine`.

    In the residue, each non-stopword is represented by the 1-based
    position of its lemma in the normal form followed by its inflection.

    With ``keep_stopwords`` set, stopwords are lemmatized like any other
    word. If removing stopwords would leave no lemmas at all, the words are
    factored again with stopwords kept. That second pass reuses the words
    already extracted, so the text is tokenized exactly once.

    Byte strings are decoded as UTF-8 first.

    >>> en_nl.lemma_split("This is the testiest test that ever was tested")
    (u'testy test ever test', u'this is the 1iest 2 that 3 was 4ed')
    """
    if not isinstance(text, unicode):
        text = text.decode('utf-8')
    punct = string.punctuation.replace("'", "").replace('-',
                                                        '').replace("`", "")

    words = self.tokenize(text).replace('/', ' ').split()
    words = [w.strip(punct).lower() for w in words]
    words = [self.autocorrect.get(word, word) for word in words if word]
    splits = [self.word_split(word) for word in words]

    def factor(keep):
        # Build (lemmas, residue) for the extracted words, treating
        # stopwords as ordinary words when ``keep`` is true.
        lemmas = []
        residue = []
        for word, split in zip(words, splits):
            if not keep and word in self.stopwords:
                residue.append(word)
            else:
                lemmas.append(self.swapdict.get(split[0], split[0]))
                # The lemma just appended sits at 1-based index len(lemmas).
                residue.append(str(len(lemmas)) + split[1])
        return lemmas, residue

    lemmas, residue = factor(keep_stopwords)
    if not lemmas and not keep_stopwords:
        # Everything was a stopword: keep them rather than return nothing.
        lemmas, residue = factor(True)
    return (u' '.join(lemmas), u' '.join(residue))

409

lemma_factor = lemma_split

410

411

def normalize(self, text):
    """
    When you *normalize* a string (no relation to the operation of
    normalizing a vector), you remove its stopwords and inflections so that
    it becomes equivalent to similar strings.

    Normalizing involves running :meth:`lemma_split` and keeping only the
    first factor, thus discarding the information that would be used to
    reconstruct the full string.

    >>> en_nl.normalize("This is the testiest test that ever was tested")
    u'testy test ever test'
    """
    factors = self.lemma_split(text)
    return factors[0]

425

normalize4 = normalize

426

427

def lemma_combine(self, lemmas, residue):
    """
    This is the inverse of :meth:`lemma_factor` -- it takes in a normal
    form and a residue, and re-assembles them into a phrase that is
    hopefully comprehensible.

    Residue items that start with a digit refer to a lemma by its 1-based
    position and carry the inflection to re-apply; all other items are
    copied through unchanged. The result is passed through
    :meth:`untokenize`.

    >>> en_nl.lemma_combine(u'testy test ever test',
    ...                     u'this is the 1iest 2 that 3 was 4ed')
    u'this is the testiest test that ever was tested'
    >>> en_nl.lemma_combine(u'person', u'1s')
    u'people'
    """
    digits = '0123456789'
    lemma_list = lemmas.split(' ')
    rebuilt = []
    for item in residue.split(' '):
        if not item or item[0] not in digits:
            rebuilt.append(item)
            continue
        numstr, pos, infl = self.lemmatizer.mblem(item)[0]
        # Trim the lemmatizer's output back to its leading number.
        while numstr[-1] not in digits:
            numstr = numstr[:-1]
        rest = item[len(numstr):]
        lemma = lemma_list[int(numstr) - 1]
        key = (lemma, rest)
        if key in self.exceptions_rev:
            rebuilt.append(self.exceptions_rev[key])
        else:
            rebuilt.append(self.unlemmatizer[pos, infl].unlem(lemma)[0])
    return self.untokenize(' '.join(rebuilt))

456

457

class StemmedEuroNL(EuroNL):

458

@property
def stemmer(self):
    """
    Lazily create a ``Stemmer`` for this language and cache it on the
    instance as ``_stemmer``.
    """
    if hasattr(self, '_stemmer'):
        return self._stemmer
    from Stemmer import Stemmer
    self._stemmer = Stemmer(self.lang)
    return self._stemmer

464

465

def stem_word(self, word):
    """Return the result of ``self.stemmer.stemWord(word)``."""
    active_stemmer = self.stemmer
    return active_stemmer.stemWord(word)

467

468

def word_split(self, word):
    """
    Split ``word`` into ``(stem, residue)``, where the residue is whatever
    follows the first ``len(stem)`` characters of the word.
    """
    stem = self.stem_word(word)
    return (stem, word[len(stem):])

472

473

def is_stopword(self, word):
    """
    Tell whether ``word`` is in the stopword list. The word is looked up
    exactly as given, without canonicalizing it first.
    """
    known_stopwords = self.stopwords
    return word in known_stopwords

475

476

def normalize(self, text):
    """
    Reduce ``text`` to a space-joined, sorted string of stems.

    Byte strings are decoded as UTF-8. Slashes and hyphens separate words;
    each word has punctuation (other than apostrophes) stripped from both
    ends and is lowercased. Stopwords are dropped and the remaining words
    are stemmed.
    """
    if not isinstance(text, unicode):
        text = text.decode('utf-8')
    strippable = string.punctuation.replace("'", "")
    stems = []
    for token in text.replace('/', ' ').replace('-', ' ').split():
        token = token.strip(strippable).lower()
        if not self.is_stopword(token):
            stems.append(self.stem_word(token))
    return u" ".join(sorted(stems))

485

Older »