# This module is part of the Lupy project and is Copyright 2003 Amir
# Bakhtiar (amir@divmod.org). This is free software; you can redistribute
# it and/or modify it under the terms of version 2.1 of the GNU Lesser
# General Public License as published by the Free Software Foundation.

import re
from StringIO import StringIO
from array import array

from MoinMoin.support.lupy.search import similarity
from MoinMoin.support.lupy.index import field, term

def standardTokenizer(string):
    """Yield a stream of downcased words from a string.

    A word is a maximal run of word characters; the pattern is compiled
    with ``re.U`` so the match is Unicode-aware.  An empty string yields
    nothing.
    """
    # NOTE(review): the loop header was absent from the visible source,
    # leaving ``m`` unbound; iterating over the matches is the evident intent.
    word = re.compile("\\w+", re.U)
    for m in word.finditer(string):
        yield m.group().lower()


class DocumentWriter(object):
    """Invert one document and write it out as the files of one segment.

    The files written are ``<segment>.fnm`` (field names), the stored
    field values, ``<segment>.frq`` / ``<segment>.prx`` (term frequencies
    and positions), the term dictionary and one ``<segment>.f<N>`` norm
    file per field.

    NOTE(review): the visible source of this class had statements missing
    (truncated calls, absent ``else`` arms, unbound names).  Every
    reconstructed piece that depends on an API not visible here is marked
    with a NOTE(review) comment and should be confirmed against the
    ``field`` and ``term`` modules.
    """

    def __init__(self, directory, analyzer=None, mfl=None):
        """Set up a writer.

        directory -- storage object; its ``createFile(name)`` is called and
                     it is handed to the field/term writers.
        analyzer  -- callable taking a text value and yielding tokens;
                     defaults to ``standardTokenizer``.
        mfl       -- maximum number of positions recorded per field; a
                     falsy value disables the limit.
        """
        self.directory = directory
        self.maxFieldLength = mfl
        self.postingTable = {}
        # Scratch term reused for postingTable lookups (see addPosition).
        self.termBuffer = term.Term('', '')
        self.analyzer = analyzer or standardTokenizer

    def addDocument(self, segment, doc):
        """Write ``doc`` to the files whose names start with ``segment``."""
        # Write field names
        fi = self.fieldInfos = field.FieldInfos()
        # NOTE(review): the statement that populates the FieldInfos from the
        # document was missing from the visible source; ``add(doc)`` is
        # assumed -- confirm against field.FieldInfos.
        fi.add(doc)
        fi.writeDir(self.directory, segment + '.fnm')

        # Write field values
        # NOTE(review): the trailing arguments of this call were cut off in
        # the visible source; ``segment, self.fieldInfos`` is assumed.
        fieldsWriter = field.FieldsWriter(self.directory,
                                          segment,
                                          self.fieldInfos)
        try:
            fieldsWriter.addDocument(doc)
        finally:
            # NOTE(review): assumes FieldsWriter exposes close().
            fieldsWriter.close()

        # Invert doc into postingTable
        self.postingTable = {}
        self.fieldLengths = [0] * (len(self.fieldInfos))
        self.invertDocument(doc)

        # Sort postingTable into an array
        postings = self.sortPostingTable()

        # Write postings
        self.writePostings(postings, segment)

        # Write norms of indexed fields
        self.writeNorms(doc, segment)

    def invertDocument(self, doc):
        """Record every term occurrence of ``doc`` in ``self.postingTable``.

        Positions continue from ``self.fieldLengths[fieldNumber]``, and that
        entry is updated afterwards, so several fields sharing a name extend
        one position sequence.

        Raises Exception when a tokenized field has neither a string nor a
        reader value.
        """
        # ``fld`` rather than ``field`` so the imported module is not shadowed.
        for fld in doc.fields():
            fieldName = fld.name()
            fieldNumber = self.fieldInfos.fieldNumber(fieldName)

            position = self.fieldLengths[fieldNumber]  # Position in field

            # NOTE(review): this guard was not visible in the source; it is
            # assumed from the "indexed" wording of the norms comment --
            # confirm that fields expose ``isIndexed``.
            if fld.isIndexed:
                if not fld.isTokenized:
                    # Untokenized: the whole value is a single term.
                    self.addPosition(fieldName, fld.stringValue(), position)
                    position += 1
                else:
                    # Find or make a reader
                    if fld.readerValue() is not None:
                        val = fld.readerValue().read()
                    elif fld.stringValue() is not None:
                        val = fld.stringValue()
                    else:
                        raise Exception('Field must have either a String or Reader value')

                    for tok in self.analyzer(val):
                        self.addPosition(fieldName, tok, position)
                        position += 1

                        # Stop once the configured field length is exceeded.
                        if self.maxFieldLength and (position > self.maxFieldLength):
                            break

            self.fieldLengths[fieldNumber] = position

    def addPosition(self, field, text, position):
        """Record that term (``field``, ``text``) occurs at ``position``."""
        # Look up via the reusable buffer so no Term is allocated on a hit.
        self.termBuffer.set(field, text)

        ti = self.postingTable.get(self.termBuffer, None)

        if ti is not None:
            # Term already seen in this document.
            ti.positions.append(position)
        else:
            # First occurrence: key the table with a fresh Term, because
            # termBuffer is overwritten on the next call.
            trm = term.Term(field, text, False)
            self.postingTable[trm] = Posting(trm, position)

    def sortPostingTable(self):
        """Return the values of ``self.postingTable`` as a sorted list."""
        return sorted(self.postingTable.values())

    def writePostings(self, postings, segment):
        """Write ``postings`` to ``<segment>.frq``, ``<segment>.prx`` and the
        term dictionary.

        postings -- iterable of objects with ``term`` and ``positions``
                    attributes, in the order they are to be written.
        """
        freq = None
        prox = None
        tis = None

        try:
            freq = self.directory.createFile(segment + '.frq')
            prox = self.directory.createFile(segment + '.prx')

            # NOTE(review): the trailing arguments of this call and the
            # construction of ``ti`` were missing from the visible source;
            # ``segment, self.fieldInfos`` and ``term.TermInfo()`` are assumed.
            tis = term.TermInfosWriter(self.directory,
                                       segment,
                                       self.fieldInfos)
            ti = term.TermInfo()

            for posting in postings:
                # Add entry to the dictionary with pointers to prox and freq files
                ti.set(1, freq.getFilePointer(), prox.getFilePointer())
                tis.add(posting.term, ti)

                # Add an entry to the freq file
                # One position is stored per occurrence, so the number of
                # positions is the term's frequency in the document.
                f = len(posting.positions)
                if f == 1:                  # optimize freq == 1
                    freq.writeVInt(1)       # set low bit of doc num
                else:
                    freq.writeVInt(0)       # the document number
                    freq.writeVInt(f)       # frequency in doc

                # Positions are written as deltas from the previous one.
                lastPosition = 0
                positions = posting.positions

                for position in positions:
                    prox.writeVInt(position - lastPosition)
                    lastPosition = position
        finally:
            # NOTE(review): assumes the created files and the
            # TermInfosWriter expose close(); no close call was visible.
            for stream in (freq, prox, tis):
                if stream is not None:
                    stream.close()

    def writeNorms(self, doc, segment):
        """Write one norm byte to ``<segment>.f<fieldNumber>`` per indexed
        field of ``doc``, computed from the recorded field length."""
        for fld in doc.fields():
            # NOTE(review): guard assumed, see invertDocument.
            if fld.isIndexed:
                fieldNumber = self.fieldInfos.fieldNumber(fld.name())
                norm = self.directory.createFile(segment +
                                                 '.f' + str(fieldNumber))
                try:
                    norm.writeByte(similarity.normInt(self.fieldLengths[fieldNumber]))
                finally:
                    # NOTE(review): assumes the created file exposes close().
                    norm.close()


class Posting(object):
    """The positions at which one term occurs in a single document.

    Postings order by their term, so a list of them can be sorted.
    """

    def __init__(self, t, position):
        """Create a posting for term ``t`` first seen at ``position``."""
        # ``term`` is read by the comparison and repr methods; its assignment
        # was missing from the visible source.
        self.term = t
        # Signed-int array holding every position of the term, in the order
        # they are appended.
        self.positions = array('i', [position])

    def __repr__(self):
        # NOTE(review): only the ``str(self.term) + '>'`` tail was visible in
        # the source; the method name and the '<Posting:' prefix are assumed.
        s = '<Posting:'
        s += str(self.term) + '>'
        return s

    def __lt__(self, other):
        # Lets sorted()/list.sort() order postings where __cmp__ is not
        # consulted; agrees with __cmp__.
        return self.term < other.term

    def __cmp__(self, other):
        # Same sign as cmp(self.term, other.term), written without the
        # cmp builtin so it does not depend on it being available.
        return (self.term > other.term) - (self.term < other.term)