~ubuntu-branches/ubuntu/natty/moin/natty-updates

« back to all changes in this revision

Viewing changes to MoinMoin/support/lupy/index/documentwriter.py

  • Committer: Bazaar Package Importer
  • Author(s): Jonas Smedegaard
  • Date: 2008-06-22 21:17:13 UTC
  • mfrom: (0.9.1 upstream)
  • Revision ID: james.westby@ubuntu.com-20080622211713-fpo2zrq3s5dfecxg
Tags: 1.7.0-3
Simplify /etc/moin/wikilist format: "USER URL" (drop unneeded middle
CONFIG_DIR that was wrongly advertised as DATA_DIR).  Make
moin-mass-migrate handle both formats and warn about deprecation of
the old one.

Show diffs side-by-side

added added

removed removed

Lines of Context:
1
 
# This module is part of the Lupy project and is Copyright 2003 Amir
2
 
# Bakhtiar (amir@divmod.org). This is free software; you can redistribute
3
 
# it and/or modify it under the terms of version 2.1 of the GNU Lesser
4
 
# General Public License as published by the Free Software Foundation.
5
 
 
6
 
from StringIO import StringIO
7
 
from array import array
8
 
import re
9
 
from MoinMoin.support.lupy.search import similarity
10
 
from MoinMoin.support.lupy.index import field, term
11
 
 
12
 
def standardTokenizer(string):
    """Generate the lower-cased words found in ``string``.

    A word is a maximal run of word characters, matched with the
    ``re.U`` flag.  Tokens are produced lazily, in the order they
    occur in the input.
    """
    word_pattern = re.compile("\\w+", re.U)
    for match in word_pattern.finditer(string):
        yield match.group().lower()
18
 
        
19
 
class DocumentWriter(object):
    """Write one document into its own index segment.

    For a given segment name, ``addDocument`` writes the field names
    (``.fnm``), the field values, the term dictionary with its frequency
    (``.frq``) and position (``.prx``) files, and one norm file
    (``.f<fieldNumber>``) per indexed field, all inside ``directory``.
    """

    def __init__(self, directory, analyzer=None, mfl=None):
        """Set up a writer that stores its files in ``directory``.

        directory -- object used to create the segment files (it must
                     provide ``createFile``).
        analyzer  -- callable taking a field value and returning an
                     iterable of tokens; ``standardTokenizer`` is used
                     when this is omitted or false.
        mfl       -- maximum field length; when set and non-zero,
                     tokenizing a field stops shortly after this many
                     positions (see ``invertDocument``).
        """
        self.directory = directory
        self.maxFieldLength = mfl
        self.postingTable = {}
        # Scratch term, re-pointed at each (field, text) pair so that
        # lookups in postingTable do not build a new Term every time.
        self.termBuffer = term.Term('', '')
        self.analyzer = analyzer or standardTokenizer

    def addDocument(self, segment, doc):
        """Write every index file of ``segment`` for the document ``doc``."""
        # Write field names
        fi = self.fieldInfos = field.FieldInfos()
        fi.add(doc)
        fi.writeDir(self.directory, segment + '.fnm')

        # Write field values
        fieldsWriter = field.FieldsWriter(self.directory,
                                          segment,
                                          self.fieldInfos)
        try:
            fieldsWriter.addDocument(doc)
        finally:
            fieldsWriter.close()

        # Invert doc into postingTable (state is reset per document)
        self.postingTable = {}
        self.fieldLengths = [0] * (len(self.fieldInfos))
        self.invertDocument(doc)

        # Sort postingTable into an array
        postings = self.sortPostingTable()

        # Write postings
        self.writePostings(postings, segment)

        # Write norms of indexed fields
        self.writeNorms(doc, segment)

    def invertDocument(self, doc):
        """Record the position of every term of ``doc`` in the posting table.

        Untokenized indexed fields contribute their string value as one
        term; tokenized fields are passed through ``self.analyzer``.
        ``self.fieldLengths`` is updated with the next free position of
        each field, so several fields sharing a name continue numbering
        where the previous one stopped.

        Raises ``Exception`` when a tokenized field has neither a reader
        value nor a string value.
        """
        # The loop variable is deliberately not called ``field`` so that
        # it does not hide the imported ``field`` module.
        for fld in doc.fields():
            fieldName = fld.name()
            fieldNumber = self.fieldInfos.fieldNumber(fieldName)

            position = self.fieldLengths[fieldNumber]    # Position in field

            if fld.isIndexed:
                if not fld.isTokenized:
                    # Untokenized: the whole value is a single term
                    self.addPosition(fieldName, fld.stringValue(), position)
                    position += 1
                else:
                    # Prefer the reader value, fall back to the string value
                    reader = fld.readerValue()
                    if reader is not None:
                        val = reader.read()
                    else:
                        val = fld.stringValue()
                        if val is None:
                            raise Exception('Field must have either a String or Reader value')

                    for tok in self.analyzer(val):
                        self.addPosition(fieldName, tok, position)
                        position += 1

                        # The test runs after the increment, so the token
                        # that crosses the limit has already been recorded.
                        if self.maxFieldLength and (position > self.maxFieldLength):
                            break

            self.fieldLengths[fieldNumber] = position

    def addPosition(self, field, text, position):
        """Record one occurrence of term ``(field, text)`` at ``position``.

        field    -- name of the field the term occurs in.
        text     -- text of the term.
        position -- position of this occurrence within the field.
        """
        self.termBuffer.set(field, text)

        ti = self.postingTable.get(self.termBuffer, None)

        if ti is not None:
            # Term already seen in this document: one more occurrence
            ti.positions.append(position)
            ti.freq += 1
        else:
            # First occurrence: store a fresh Term as the key, because
            # termBuffer is overwritten by the next call.
            # NOTE(review): the meaning of the third Term argument is
            # defined in the term module -- confirm there.
            trm = term.Term(field, text, False)
            self.postingTable[trm] = Posting(trm, position)

    def sortPostingTable(self):
        """Return the postings of the posting table as a sorted list."""
        return sorted(self.postingTable.values())

    def writePostings(self, postings, segment):
        """Write the term dictionary, ``.frq`` and ``.prx`` files.

        postings -- postings to write, in the order they must appear.
        segment  -- segment name used as the file name prefix.

        Every file that was opened gets a close attempt, even when
        writing or closing another file fails.
        """
        freq = None
        prox = None
        tis = None

        try:
            freq = self.directory.createFile(segment + '.frq')
            prox = self.directory.createFile(segment + '.prx')

            tis = term.TermInfosWriter(self.directory,
                                       segment,
                                       self.fieldInfos)
            ti = term.TermInfo()

            for posting in postings:
                # Add entry to the dictionary with pointers to prox and freq files
                # NOTE(review): the leading 1 is presumably the document
                # frequency (a single document per segment) -- confirm
                # against term.TermInfo.set.
                ti.set(1, freq.getFilePointer(), prox.getFilePointer())
                tis.add(posting.term, ti)

                # Add an entry to the freq file
                f = posting.freq
                if f == 1:                  # optimize freq == 1
                    freq.writeVInt(1)       # set low bit of doc num
                else:
                    freq.writeVInt(0)       # the document number
                    freq.writeVInt(f)       # frequency in doc

                # Positions are stored as deltas from the previous one
                lastPosition = 0
                for position in posting.positions:
                    prox.writeVInt(position - lastPosition)
                    lastPosition = position

        finally:
            # Nested so that a failing close() cannot leak the other handles
            try:
                if freq is not None:
                    freq.close()
            finally:
                try:
                    if prox is not None:
                        prox.close()
                finally:
                    if tis is not None:
                        tis.close()

    def writeNorms(self, doc, segment):
        """Write one norm byte per indexed field of ``doc``.

        The byte is ``similarity.normInt`` of the field length and goes
        into the file ``<segment>.f<fieldNumber>``.
        """
        for fld in doc.fields():
            if fld.isIndexed:
                fieldNumber = self.fieldInfos.fieldNumber(fld.name())
                norm = self.directory.createFile(segment +
                                                 '.f' + str(fieldNumber))
                try:
                    norm.writeByte(similarity.normInt(self.fieldLengths[fieldNumber]))
                finally:
                    norm.close()
165
 
 
166
 
 
167
 
class Posting(object):
    """Occurrences of a single term within one document.

    Attributes set by the constructor:
    term      -- the term this posting belongs to.
    freq      -- number of occurrences recorded so far (starts at 1).
    positions -- ``array('i')`` holding the position of each occurrence.
    """

    def __init__(self, t, position):
        """Create the posting for term ``t`` first seen at ``position``."""
        self.term = t
        self.freq = 1
        self.positions = array('i', [position])

    def __repr__(self):
        """Return ``<Posting:TERM>`` where TERM is ``str(self.term)``."""
        return '<Posting:' + str(self.term) + '>'

    def __cmp__(self, other):
        """Order postings by their terms."""
        return cmp(self.term, other.term)