#
# Copyright (C) 2006  Robey Pointer <robey@lag.net>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#

"""
a cache for chewed-up "change" data structures, which are basically just a
different way of storing a revision.  the cache improves lookup times 10x
over bazaar's xml revision structure, so it's currently still worth doing.

once a revision is committed in bazaar, it never changes, so once we have
cached a change, it's good forever.
"""

import cPickle
import os
import time

from loggerhead import util
from loggerhead.lockfile import LockFile


with_lock = util.with_lock('_lock', 'ChangeCache')

SQLITE_INTERFACE = os.environ.get('SQLITE_INTERFACE', 'sqlite3')

if SQLITE_INTERFACE == 'pysqlite2':
    from pysqlite2 import dbapi2
    _param_marker = '?'
elif SQLITE_INTERFACE == 'sqlite':
    import sqlite as dbapi2
    _param_marker = '%s'
elif SQLITE_INTERFACE == 'sqlite3':
    from sqlite3 import dbapi2
    _param_marker = '?'
else:
    raise AssertionError("bad sqlite interface %r!?" % SQLITE_INTERFACE)
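
# the statements below are written with qmark ('?') placeholders and then
# rewritten for the chosen interface's parameter style (python-sqlite,
# unlike the dbapi2 modules above, uses '%s' markers).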
_select_stmt = ("select data from revisiondata where revid = ?"
                ).replace('?', _param_marker)
_insert_stmt = ("insert into revisiondata (revid, data) "
                "values (?, ?)").replace('?', _param_marker)
_update_stmt = ("update revisiondata set data = ? where revid = ?"
                ).replace('?', _param_marker)


class FakeShelf(object):
    """
    a shelve-like store on top of a sqlite database: maps revision ids to
    pickled objects in a single two-column table.
    """
    def __init__(self, filename):
        create_table = not os.path.exists(filename)
        self.connection = dbapi2.connect(filename)
        self.cursor = self.connection.cursor()
        if create_table:
            self._create_table()
    def _create_table(self):
        self.cursor.execute(
            "create table RevisionData "
            "(revid binary primary key, data binary)")
        self.connection.commit()
    def _serialize(self, obj):
        r = dbapi2.Binary(cPickle.dumps(obj, protocol=2))
        return r
    def _unserialize(self, data):
        return cPickle.loads(str(data))
    def get(self, revid):
        self.cursor.execute(_select_stmt, (revid,))
        filechange = self.cursor.fetchone()
        if filechange is None:
            return None
        else:
            return self._unserialize(filechange[0])
    def add(self, revid_obj_pairs, commit=True):
        for (r, d) in revid_obj_pairs:
            self.cursor.execute(_insert_stmt, (r, self._serialize(d)))
        if commit:
            self.connection.commit()
    def update(self, revid_obj_pairs, commit=True):
        for (r, d) in revid_obj_pairs:
            self.cursor.execute(_update_stmt, (self._serialize(d), r))
        if commit:
            self.connection.commit()
    def count(self):
        self.cursor.execute(
            "select count(*) from revisiondata")
        return self.cursor.fetchone()[0]
    def close(self, commit=False):
        if commit:
            self.connection.commit()
        self.connection.close()
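
## a minimal usage sketch of FakeShelf (hypothetical filename and revid;
## kept commented out, like the fluff block in ChangeCache.__init__ below):
##     shelf = FakeShelf('/tmp/changes.sql')
##     shelf.add([('some-revid', {'message': 'hello'})])
##     print shelf.get('some-revid')      # -> {'message': 'hello'}
##     shelf.close()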

class ChangeCache(object):

    def __init__(self, history, cache_path):
        self.history = history
        self.log = history.log

        if not os.path.exists(cache_path):
            os.mkdir(cache_path)

        self._changes_filename = os.path.join(cache_path, 'changes.sql')

        # use a lockfile since the cache folder could be shared across
        # different processes.
        self._lock = LockFile(os.path.join(cache_path, 'lock'))
        self._closed = False

##         # this is fluff; don't slow down startup time with it.
##         # but it is racy in tests :(
##         def log_sizes():
##             self.log.info('Using change cache %s; %d entries.' % (cache_path, self.size()))
##         threading.Thread(target=log_sizes).start()

    def _cache(self):
        return FakeShelf(self._changes_filename)

    @with_lock
    def close(self):
        self.log.debug('Closing cache file.')
        self._closed = True

    @with_lock
    def closed(self):
        return self._closed

    @with_lock
    def flush(self):
        pass

    @with_lock
    def get_changes(self, revid_list):
        """
        get a list of changes by their revision_ids.  any changes missing
        from the cache are fetched by calling L{History.get_changes_uncached}
        and inserted into the cache before returning.
        """
        out = []
        missing_revids = []
        missing_revid_indices = []
        cache = self._cache()
        for revid in revid_list:
            entry = cache.get(revid)
            if entry is not None:
                out.append(entry)
            else:
                missing_revids.append(revid)
                missing_revid_indices.append(len(out))
                out.append(None)
        if missing_revids:
            missing_entries = self.history.get_changes_uncached(missing_revids)
            missing_entry_dict = {}
            for entry in missing_entries:
                missing_entry_dict[entry.revid] = entry
            revid_entry_pairs = []
            for i, revid in zip(missing_revid_indices, missing_revids):
                out[i] = entry = missing_entry_dict.get(revid)
                if entry is not None:
                    revid_entry_pairs.append((revid, entry))
            cache.add(revid_entry_pairs)
        # drop any revids the history couldn't supply at all
        return filter(None, out)

    @with_lock
    def full(self):
        # consider the cache full when it has at least as many entries as
        # the revision history and the tip revision is present.
        cache = self._cache()
        last_revid = util.to_utf8(self.history.last_revid)
        revision_history = self.history.get_revision_history()
        return (cache.count() >= len(revision_history)
                and cache.get(last_revid) is not None)

    @with_lock
    def size(self):
        return self._cache().count()

    def check_rebuild(self, max_time=3600):
        """
        check if we need to fill in any missing pieces of the cache.  pull in
        any missing changes, but don't work any longer than C{max_time}
        seconds.
        """
        if self.closed() or self.full():
            return

        self.log.info('Building revision cache...')
        start_time = time.time()
        last_update = time.time()
        count = 0

        work = list(self.history.get_revision_history())
        jump = 100
        for i in xrange(0, len(work), jump):
            r = work[i:i + jump]
            # must call into history so we grab the branch lock (otherwise, lock inversion)
            self.history.get_changes(r)
            if self.closed():
                self.flush()
                return
            count += jump
            now = time.time()
            if now - start_time > max_time:
                self.log.info('Cache rebuilding will pause for now.')
                self.flush()
                return
            if now - last_update > 60:
                self.log.info('Revision cache rebuilding continues: %d/%d' % (min(count, len(work)), len(work)))
                last_update = time.time()
                self.flush()
            # give someone else a chance at the lock
            time.sleep(1)
        self.log.info('Revision cache rebuild completed.')
        self.flush()
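
## a minimal sketch of how the cache is typically driven (hypothetical cache
## directory; `history` stands in for a loggerhead History instance):
##     cache = ChangeCache(history, '/path/to/cache-dir')
##     cache.check_rebuild(max_time=300)
##     changes = cache.get_changes(['some-revid'])
##     cache.close()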

class FileChangeCache(object):
    def __init__(self, history, cache_path):
        self.history = history

        if not os.path.exists(cache_path):
            os.mkdir(cache_path)

        self._changes_filename = os.path.join(cache_path, 'filechanges.sql')

        # use a lockfile since the cache folder could be shared across
        # different processes.
        self._lock = LockFile(os.path.join(cache_path, 'filechange-lock'))

    @with_lock
    def get_file_changes(self, entries):
        out = []
        missing_entries = []
        missing_entry_indices = []
        cache = FakeShelf(self._changes_filename)
        for entry in entries:
            changes = cache.get(entry.revid)
            if changes is not None:
                out.append(changes)
            else:
                missing_entries.append(entry)
                missing_entry_indices.append(len(out))
                out.append(None)
        if missing_entries:
            missing_changes = self.history.get_file_changes_uncached(missing_entries)
            revid_changes_pairs = []
            for i, entry, changes in zip(
                missing_entry_indices, missing_entries, missing_changes):
                revid_changes_pairs.append((entry.revid, changes))
                out[i] = changes
            cache.add(revid_changes_pairs)
        return out
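
## get_file_changes follows the same read-through pattern as
## ChangeCache.get_changes above; a sketch (hypothetical entries, each an
## object with a .revid attribute):
##     fc_cache = FileChangeCache(history, '/path/to/cache-dir')
##     file_changes = fc_cache.get_file_changes(entries)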