~loggerhead-team/loggerhead/trunk

42 by Robey Pointer
add text substring indexer
1
#
2
# Copyright (C) 2006  Robey Pointer <robey@lag.net>
3
#
4
# This program is free software; you can redistribute it and/or modify
5
# it under the terms of the GNU General Public License as published by
6
# the Free Software Foundation; either version 2 of the License, or
7
# (at your option) any later version.
8
#
9
# This program is distributed in the hope that it will be useful,
10
# but WITHOUT ANY WARRANTY; without even the implied warranty of
11
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12
# GNU General Public License for more details.
13
#
14
# You should have received a copy of the GNU General Public License
15
# along with this program; if not, write to the Free Software
16
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
17
#
18
19
"""
20
indexing of the comment text of revisions, for fast searching.
21
128.4.8 by Michael Hudson
remove storm dependency (it'll be back)
22
two separate database files are created:
42 by Robey Pointer
add text substring indexer
23
24
    - recorded: revid -> 1 (if the revid is indexed)
25
    - index: 3-letter substring -> list(revids)
26
"""
27
28
import os
29
import re
47 by Robey Pointer
slowly moving the branch-specific stuff into a common structure...
30
import time
42 by Robey Pointer
add text substring indexer
31
32
from loggerhead import util
69 by Robey Pointer
switch the cache and text index to use file locking so they can be used by
33
from loggerhead.lockfile import LockFile
128.4.7 by Michael Hudson
use my sqlite not-shelf for the textindex too. very very slow to fill, but no more bdb!
34
from loggerhead.changecache import FakeShelf
42 by Robey Pointer
add text substring indexer
35
36
# if any substring index reaches this many revids, replace the entry with
37
# an ALL marker -- it's not worth an explicit index.
38
ALL_THRESHOLD = 1000
39
ALL = 'ALL'
40
41
48 by Robey Pointer
the big migration of branch-specific data to a BranchView object: actually
42
with_lock = util.with_lock('_lock')
42 by Robey Pointer
add text substring indexer
43
44
45
def normalize_string(s):
46
    """
47
    remove any punctuation and normalize all whitespace to a single space.
48
    """
49
    s = util.to_utf8(s).lower()
50
    # remove apostrophes completely.
51
    s = re.sub(r"'", '', s)
52
    # convert other garbage into space
53
    s = re.sub(r'[^\w\d]', ' ', s)
54
    # compress multiple spaces into one.
55
    s = re.sub(r'\s{2,}', ' ', s)
56
    # and finally remove leading/trailing whitespace
57
    s = s.strip()
58
    return s
59
60
61
class TextIndex (object):
62
    def __init__(self, history, cache_path):
63
        self.history = history
47 by Robey Pointer
slowly moving the branch-specific stuff into a common structure...
64
        self.log = history.log
144.1.10 by Michael Hudson
run reindent.py over the loggerhead package
65
42 by Robey Pointer
add text substring indexer
66
        if not os.path.exists(cache_path):
67
            os.mkdir(cache_path)
144.1.10 by Michael Hudson
run reindent.py over the loggerhead package
68
128.4.7 by Michael Hudson
use my sqlite not-shelf for the textindex too. very very slow to fill, but no more bdb!
69
        self._recorded_filename = os.path.join(cache_path, 'textindex-recorded.sql')
70
        self._index_filename = os.path.join(cache_path, 'textindex.sql')
144.1.10 by Michael Hudson
run reindent.py over the loggerhead package
71
69 by Robey Pointer
switch the cache and text index to use file locking so they can be used by
72
        # use a lockfile since the cache folder could be shared across different processes.
73
        self._lock = LockFile(os.path.join(cache_path, 'index-lock'))
74
        self._closed = False
144.1.10 by Michael Hudson
run reindent.py over the loggerhead package
75
73 by Robey Pointer
heh, duh. i can't leave the shelf files open from multiple threads at once.
76
        self.log.info('Using search index; %d entries.', len(self))
128.4.7 by Michael Hudson
use my sqlite not-shelf for the textindex too. very very slow to fill, but no more bdb!
77
78
    def _index(self):
79
        return FakeShelf(self._index_filename)
80
81
    def _recorded(self):
82
        return FakeShelf(self._recorded_filename)
144.1.10 by Michael Hudson
run reindent.py over the loggerhead package
83
73 by Robey Pointer
heh, duh. i can't leave the shelf files open from multiple threads at once.
84
    def _is_indexed(self, revid, recorded):
128.4.7 by Michael Hudson
use my sqlite not-shelf for the textindex too. very very slow to fill, but no more bdb!
85
        return recorded.get(util.to_utf8(revid)) is not None
144.1.10 by Michael Hudson
run reindent.py over the loggerhead package
86
42 by Robey Pointer
add text substring indexer
87
    @with_lock
88
    def is_indexed(self, revid):
128.4.7 by Michael Hudson
use my sqlite not-shelf for the textindex too. very very slow to fill, but no more bdb!
89
        recorded = self._recorded()
73 by Robey Pointer
heh, duh. i can't leave the shelf files open from multiple threads at once.
90
        try:
91
            return self._is_indexed(revid, recorded)
92
        finally:
93
            recorded.close()
144.1.10 by Michael Hudson
run reindent.py over the loggerhead package
94
42 by Robey Pointer
add text substring indexer
95
    @with_lock
96
    def __len__(self):
128.4.7 by Michael Hudson
use my sqlite not-shelf for the textindex too. very very slow to fill, but no more bdb!
97
        recorded = self._recorded()
73 by Robey Pointer
heh, duh. i can't leave the shelf files open from multiple threads at once.
98
        try:
128.4.7 by Michael Hudson
use my sqlite not-shelf for the textindex too. very very slow to fill, but no more bdb!
99
            return recorded.count()
73 by Robey Pointer
heh, duh. i can't leave the shelf files open from multiple threads at once.
100
        finally:
101
            recorded.close()
42 by Robey Pointer
add text substring indexer
102
103
    @with_lock
47 by Robey Pointer
slowly moving the branch-specific stuff into a common structure...
104
    def close(self):
69 by Robey Pointer
switch the cache and text index to use file locking so they can be used by
105
        self._closed = True
144.1.10 by Michael Hudson
run reindent.py over the loggerhead package
106
69 by Robey Pointer
switch the cache and text index to use file locking so they can be used by
107
    @with_lock
108
    def closed(self):
109
        return self._closed
144.1.10 by Michael Hudson
run reindent.py over the loggerhead package
110
47 by Robey Pointer
slowly moving the branch-specific stuff into a common structure...
111
    @with_lock
42 by Robey Pointer
add text substring indexer
112
    def flush(self):
73 by Robey Pointer
heh, duh. i can't leave the shelf files open from multiple threads at once.
113
        pass
144.1.10 by Michael Hudson
run reindent.py over the loggerhead package
114
42 by Robey Pointer
add text substring indexer
115
    @with_lock
47 by Robey Pointer
slowly moving the branch-specific stuff into a common structure...
116
    def full(self):
128.4.7 by Michael Hudson
use my sqlite not-shelf for the textindex too. very very slow to fill, but no more bdb!
117
        recorded = self._recorded()
118
        last_revid = util.to_utf8(self.history.last_revid)
73 by Robey Pointer
heh, duh. i can't leave the shelf files open from multiple threads at once.
119
        try:
128.4.7 by Michael Hudson
use my sqlite not-shelf for the textindex too. very very slow to fill, but no more bdb!
120
            return (recorded.count() >= len(self.history.get_revision_history())
121
                    and recorded.get(last_revid) is not None)
73 by Robey Pointer
heh, duh. i can't leave the shelf files open from multiple threads at once.
122
        finally:
123
            recorded.close()
47 by Robey Pointer
slowly moving the branch-specific stuff into a common structure...
124
73 by Robey Pointer
heh, duh. i can't leave the shelf files open from multiple threads at once.
125
    def _index_change(self, change, recorded, index):
42 by Robey Pointer
add text substring indexer
126
        """
127
        currently, only indexes the 'comment' field.
128
        """
129
        comment = normalize_string(change.comment)
130
        if len(comment) < 3:
131
            return
132
        for i in xrange(len(comment) - 2):
133
            sub = comment[i:i + 3]
128.4.7 by Michael Hudson
use my sqlite not-shelf for the textindex too. very very slow to fill, but no more bdb!
134
            orig = revid_set = index.get(sub)
42 by Robey Pointer
add text substring indexer
135
            if revid_set is None:
136
                revid_set = set()
137
            elif revid_set == ALL:
138
                # this entry got too big
139
                continue
140
            revid_set.add(change.revid)
141
            if len(revid_set) > ALL_THRESHOLD:
142
                revid_set = ALL
128.4.7 by Michael Hudson
use my sqlite not-shelf for the textindex too. very very slow to fill, but no more bdb!
143
            if orig is not None:
128.4.10 by Michael Hudson
don't commit so often when building the textindex search
144
                index.update([(sub, revid_set)], commit=False)
128.4.7 by Michael Hudson
use my sqlite not-shelf for the textindex too. very very slow to fill, but no more bdb!
145
            else:
128.4.10 by Michael Hudson
don't commit so often when building the textindex search
146
                index.add([(sub, revid_set)], commit=False)
128.4.7 by Michael Hudson
use my sqlite not-shelf for the textindex too. very very slow to fill, but no more bdb!
147
128.4.10 by Michael Hudson
don't commit so often when building the textindex search
148
        recorded.add([(util.to_utf8(change.revid), True)], commit=False)
73 by Robey Pointer
heh, duh. i can't leave the shelf files open from multiple threads at once.
149
150
    @with_lock
151
    def index_changes(self, revid_list):
128.4.7 by Michael Hudson
use my sqlite not-shelf for the textindex too. very very slow to fill, but no more bdb!
152
        recorded = self._recorded()
153
        index = self._index()
73 by Robey Pointer
heh, duh. i can't leave the shelf files open from multiple threads at once.
154
        try:
155
            revid_list = [r for r in revid_list if not self._is_indexed(r, recorded)]
156
            change_list = self.history.get_changes(revid_list)
157
            for change in change_list:
158
                self._index_change(change, recorded, index)
159
        finally:
128.4.10 by Michael Hudson
don't commit so often when building the textindex search
160
            index.close(commit=True)
161
            recorded.close(commit=True)
144.1.10 by Michael Hudson
run reindent.py over the loggerhead package
162
42 by Robey Pointer
add text substring indexer
163
    @with_lock
164
    def find(self, text, revid_list=None):
128.4.7 by Michael Hudson
use my sqlite not-shelf for the textindex too. very very slow to fill, but no more bdb!
165
        index = self._index()
73 by Robey Pointer
heh, duh. i can't leave the shelf files open from multiple threads at once.
166
        try:
167
            text = normalize_string(text)
168
            if len(text) < 3:
169
                return []
144.1.10 by Michael Hudson
run reindent.py over the loggerhead package
170
73 by Robey Pointer
heh, duh. i can't leave the shelf files open from multiple threads at once.
171
            total_set = None
172
            if revid_list is not None:
173
                total_set = set(revid_list)
174
            seen_all = False
144.1.10 by Michael Hudson
run reindent.py over the loggerhead package
175
73 by Robey Pointer
heh, duh. i can't leave the shelf files open from multiple threads at once.
176
            for i in xrange(len(text) - 2):
177
                sub = text[i:i + 3]
128.4.7 by Michael Hudson
use my sqlite not-shelf for the textindex too. very very slow to fill, but no more bdb!
178
                revid_set = index.get(sub)
73 by Robey Pointer
heh, duh. i can't leave the shelf files open from multiple threads at once.
179
                if revid_set is None:
180
                    # zero matches, stop here.
181
                    return []
182
                if revid_set == ALL:
183
                    # skip
184
                    seen_all = True
185
                    continue
186
                if total_set is None:
187
                    total_set = revid_set
188
                else:
189
                    total_set.intersection_update(revid_set)
190
                if len(total_set) == 0:
191
                    return []
192
        finally:
193
            index.close()
144.1.10 by Michael Hudson
run reindent.py over the loggerhead package
194
42 by Robey Pointer
add text substring indexer
195
        # tricky: if seen_all is True, one of the substring indices was ALL
196
        # (in other words, unindexed), so our results are actually a superset
197
        # of the exact answer.
198
        #
199
        # if we cared, we could do a direct match on the result set and cull
200
        # out any that aren't actually matches.  for now, i'm gonna say that
201
        # we DON'T care, and if one of the substrings hit ALL, there's a small
69 by Robey Pointer
switch the cache and text index to use file locking so they can be used by
202
        # chance that we'll give a few false positives.
42 by Robey Pointer
add text substring indexer
203
        return total_set
144.1.10 by Michael Hudson
run reindent.py over the loggerhead package
204
47 by Robey Pointer
slowly moving the branch-specific stuff into a common structure...
205
    def check_rebuild(self, max_time=3600):
206
        """
207
        check if there are any un-indexed revisions, and if so, index them.
208
        but don't spend longer than C{max_time} on it.
209
        """
69 by Robey Pointer
switch the cache and text index to use file locking so they can be used by
210
        if self.closed() or self.full():
47 by Robey Pointer
slowly moving the branch-specific stuff into a common structure...
211
            # all done
212
            return
213
214
        self.log.info('Building search index...')
215
        work = list(self.history.get_revision_history())
216
        start_time = time.time()
217
        last_update = time.time()
218
        count = 0
73 by Robey Pointer
heh, duh. i can't leave the shelf files open from multiple threads at once.
219
220
        jump = 100
221
        for i in xrange(0, len(work), jump):
222
            r = work[i:i + jump]
223
            self.index_changes(r)
69 by Robey Pointer
switch the cache and text index to use file locking so they can be used by
224
            if self.closed():
225
                return
47 by Robey Pointer
slowly moving the branch-specific stuff into a common structure...
226
73 by Robey Pointer
heh, duh. i can't leave the shelf files open from multiple threads at once.
227
            count += jump
47 by Robey Pointer
slowly moving the branch-specific stuff into a common structure...
228
            now = time.time()
229
            if now - start_time > 3600:
230
                # there's no point working for hours.  eventually we might even
231
                # hit the next re-index interval, which would suck mightily.
232
                self.log.info('Search indexing has worked for an hour; giving up for now.')
233
                return
234
            if now - last_update > 60:
235
                self.log.info('Search indexing continues: %d/%d' % (min(count, len(work)), len(work)))
236
                last_update = time.time()
71 by Robey Pointer
exponential backoff isn't really working for the lockfile, so keep it down
237
            # give someone else a chance at the lock
238
            time.sleep(1)
47 by Robey Pointer
slowly moving the branch-specific stuff into a common structure...
239
        self.log.info('Search index completed.')
48 by Robey Pointer
the big migration of branch-specific data to a BranchView object: actually
240
        self.flush()