#
# Copyright (C) 2008, 2009 Canonical Ltd.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
"""Cache the whole history data needed by loggerhead about a branch."""

import gc
import logging
import time

from bzrlib.revision import is_null, NULL_REVISION

from loggerhead.util import StaticTuple


def _strip_NULL_ghosts(revision_graph):
    """
    Copied over from bzrlib meant as a temporary workaround for
    deprecated methods.
    """
    # Filter ghosts, and null:
    if NULL_REVISION in revision_graph:
        del revision_graph[NULL_REVISION]
    as_st = StaticTuple.from_sequence
    for key, parents in revision_graph.iteritems():
        revision_graph[key] = as_st([
            parent for parent in parents if parent in revision_graph])
    return revision_graph
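
# A worked illustration of _strip_NULL_ghosts (revision ids are made up):
# given
#
#   {NULL_REVISION: (), 'rev-a': (NULL_REVISION,),
#    'rev-b': ('rev-a', 'ghost-id')}
#
# the null: entry is dropped, and 'ghost-id', which has no entry of its
# own in the graph, is filtered out of 'rev-b's parents, leaving
#
#   {'rev-a': (), 'rev-b': ('rev-a',)}
#
# with each parent sequence stored as a StaticTuple.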


def compute_whole_history_data(branch):
    """Compute _rev_info and _rev_indices for a branch.

    See History.__doc__ for what these data structures mean.
    """
    z = time.time()

    last_revid = branch.last_revision()

    log = logging.getLogger('loggerhead.%s' %
                            (branch.get_config().get_nickname(),))

    if is_null(last_revid):
        return ([], {})

    gc_enabled = gc.isenabled()
    if gc_enabled:
        gc.disable()
    try:
        # from bzrlib import commands
        # Profiling shows that the bulk of the time spent here is reading the
        # data out of the indexes, rather than time building and sorting the
        # graph. At least we're using code paths that can be optimized if
        # possible. Of course, ideally we wouldn't be
        # loading-the-whole-graph...
        # rev_info, rev_indices = commands.apply_lsprofiled(',,prof.txt',
        #   _compute_graph, branch, last_revid)
        rev_info, rev_indices = _compute_graph(branch, last_revid)
    finally:
        if gc_enabled:
            gc.enable()

    log.info('built revision graph cache: %.3f secs' % (time.time() - z,))
    return (rev_info, rev_indices)


def _compute_graph(branch, last_revid):
    """Do the actual work of computing the graph information."""
    # Using get_known_graph_ancestry drops us from 2.3s on bzr.dev down to
    # 0.9s. Wrapping this with a gc.disable call, drops us further to 0.7s.
    # This shows even better with mysql.
    #           orig    known_graph     gc.disable
    # bzr.dev   2.357   0.900           0.700
    # mysql     4.353   2.563           1.634
    last_key = (last_revid,)
    graph = branch.repository.revisions.get_known_graph_ancestry([last_key])
    # What about ghosts?
    merge_sorted = graph.merge_sort(last_key)

    rev_info = []
    rev_indices = {}

    get_parent_keys = graph.get_parent_keys
    get_child_keys = graph.get_child_keys
    # TODO: Use StaticTuple
    #       Using StaticTuple does show a memory reduction (85.6MB => 81.1MB
    #       peak on a MySQL branch). There doesn't seem to be a time-difference
    #       wrt how long it takes to build (probably because we have gc
    #       disabled?). StaticTuple should help in 'unrelated' code, since it
    #       reduces overall gc overhead. StaticTuple isn't trivial, as it
    #       interacts with the marshalling code.
    as_st = StaticTuple.from_sequence
    for seq, info in enumerate(merge_sorted):
        #seq, revid, merge_depth, revno, end_of_merge = info
        # Switch back from a tuple key to a simple string rev_id
        rev_id = info.key[-1]
        revno_str = '.'.join(map(str, info.revno))
        parent_ids = as_st([p[-1] for p in get_parent_keys(info.key)])
        rev_indices[rev_id] = len(rev_info)
        # TODO: Try using the original merge_sorted object. Gives us a nice
        #       Object.foo rather than entry[0][1] syntax. However would need
        #       special handling for the caching layer
        basic_info = StaticTuple(seq, rev_id, info.merge_depth, revno_str,
                                 info.end_of_merge)
        if info.merge_depth != 0:
            # Find the children of this revision
            child_ids = as_st([c[-1] for c in get_child_keys(info.key)])
        else:
            child_ids = StaticTuple()
        rev_info.append(StaticTuple(basic_info, child_ids, parent_ids))
    return rev_info, rev_indices
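

if __name__ == '__main__':
    # A minimal usage sketch (not part of loggerhead proper): build the
    # whole-history cache for a branch path given on the command line.
    # Assumes a readable bzr branch at that path; the read lock covers
    # the repository index reads done by _compute_graph.
    import sys

    from bzrlib.branch import Branch

    logging.basicConfig(level=logging.INFO)
    demo_branch = Branch.open(sys.argv[1])
    demo_branch.lock_read()
    try:
        rev_info, rev_indices = compute_whole_history_data(demo_branch)
        print '%d revisions in cache' % (len(rev_info),)
    finally:
        demo_branch.unlock()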