~ubuntu-branches/ubuntu/precise/dulwich/precise-security

« back to all changes in this revision

Viewing changes to dulwich/diff_tree.py

  • Committer: Bazaar Package Importer
  • Author(s): Jelmer Vernooij
  • Date: 2011-08-07 15:03:44 UTC
  • mto: This revision was merged to the branch mainline in revision 25.
  • Revision ID: james.westby@ubuntu.com-20110807150344-94xw3o3hnh47y1m8
Tags: upstream-0.8.0
ImportĀ upstreamĀ versionĀ 0.8.0

Show diffs side-by-side

added added

removed removed

Lines of Context:
18
18
 
19
19
"""Utilities for diffing files and trees."""
20
20
 
 
21
try:
 
22
    from collections import defaultdict
 
23
except ImportError:
 
24
    from dulwich._compat import defaultdict
 
25
 
21
26
from cStringIO import StringIO
22
27
import itertools
23
28
import stat
24
29
 
25
30
from dulwich._compat import (
26
 
    defaultdict,
27
 
    TreeChangeTuple,
 
31
    namedtuple,
28
32
    )
29
33
from dulwich.objects import (
30
34
    S_ISGITLINK,
39
43
CHANGE_COPY = 'copy'
40
44
CHANGE_UNCHANGED = 'unchanged'
41
45
 
 
46
RENAME_CHANGE_TYPES = (CHANGE_RENAME, CHANGE_COPY)
 
47
 
42
48
_NULL_ENTRY = TreeEntry(None, None, None)
43
49
 
44
50
_MAX_SCORE = 100
45
 
_RENAME_THRESHOLD = 60
46
 
_MAX_FILES = 200
47
 
_REWRITE_THRESHOLD = None
48
 
 
49
 
 
50
 
class TreeChange(TreeChangeTuple):
51
 
    """Class encapsulating a single change between two trees."""
 
51
RENAME_THRESHOLD = 60
 
52
MAX_FILES = 200
 
53
REWRITE_THRESHOLD = None
 
54
 
 
55
 
 
56
class TreeChange(namedtuple('TreeChange', ['type', 'old', 'new'])):
 
57
    """Named tuple a single change between two trees."""
52
58
 
53
59
    @classmethod
54
60
    def add(cls, new):
152
158
    return entry
153
159
 
154
160
 
155
 
def tree_changes(store, tree1_id, tree2_id, want_unchanged=False):
 
161
def tree_changes(store, tree1_id, tree2_id, want_unchanged=False,
 
162
                 rename_detector=None):
156
163
    """Find the differences between the contents of two trees.
157
164
 
158
165
    :param store: An ObjectStore for looking up objects.
160
167
    :param tree2_id: The SHA of the target tree.
161
168
    :param want_unchanged: If True, include TreeChanges for unmodified entries
162
169
        as well.
 
170
    :param rename_detector: RenameDetector object for detecting renames.
163
171
    :return: Iterator over TreeChange instances for each change between the
164
172
        source and target tree.
165
173
    """
 
174
    if (rename_detector is not None and tree1_id is not None and
 
175
        tree2_id is not None):
 
176
        for change in rename_detector.changes_with_renames(
 
177
          tree1_id, tree2_id, want_unchanged=want_unchanged):
 
178
            yield change
 
179
        return
 
180
 
166
181
    entries = walk_trees(store, tree1_id, tree2_id,
167
182
                         prune_identical=(not want_unchanged))
168
183
    for entry1, entry2 in entries:
193
208
        yield TreeChange(change_type, entry1, entry2)
194
209
 
195
210
 
 
211
def _all_eq(seq, key, value):
 
212
    for e in seq:
 
213
        if key(e) != value:
 
214
            return False
 
215
    return True
 
216
 
 
217
 
 
218
def _all_same(seq, key):
 
219
    return _all_eq(seq[1:], key, key(seq[0]))
 
220
 
 
221
 
 
222
def tree_changes_for_merge(store, parent_tree_ids, tree_id,
 
223
                           rename_detector=None):
 
224
    """Get the tree changes for a merge tree relative to all its parents.
 
225
 
 
226
    :param store: An ObjectStore for looking up objects.
 
227
    :param parent_tree_ids: An iterable of the SHAs of the parent trees.
 
228
    :param tree_id: The SHA of the merge tree.
 
229
    :param rename_detector: RenameDetector object for detecting renames.
 
230
 
 
231
    :yield: Lists of TreeChange objects, one per conflicted path in the merge.
 
232
 
 
233
        Each list contains one element per parent, with the TreeChange for that
 
234
        path relative to that parent. An element may be None if it never existed
 
235
        in one parent and was deleted in two others.
 
236
 
 
237
        A path is only included in the output if it is a conflict, i.e. its SHA
 
238
        in the merge tree is not found in any of the parents, or in the case of
 
239
        deletes, if not all of the old SHAs match.
 
240
    """
 
241
    all_parent_changes = [tree_changes(store, t, tree_id,
 
242
                                       rename_detector=rename_detector)
 
243
                          for t in parent_tree_ids]
 
244
    num_parents = len(parent_tree_ids)
 
245
    changes_by_path = defaultdict(lambda: [None] * num_parents)
 
246
 
 
247
    # Organize by path.
 
248
    for i, parent_changes in enumerate(all_parent_changes):
 
249
        for change in parent_changes:
 
250
            if change.type == CHANGE_DELETE:
 
251
                path = change.old.path
 
252
            else:
 
253
                path = change.new.path
 
254
            changes_by_path[path][i] = change
 
255
 
 
256
    old_sha = lambda c: c.old.sha
 
257
    change_type = lambda c: c.type
 
258
 
 
259
    # Yield only conflicting changes.
 
260
    for _, changes in sorted(changes_by_path.iteritems()):
 
261
        assert len(changes) == num_parents
 
262
        have = [c for c in changes if c is not None]
 
263
        if _all_eq(have, change_type, CHANGE_DELETE):
 
264
            if not _all_same(have, old_sha):
 
265
                yield changes
 
266
        elif not _all_same(have, change_type):
 
267
            yield changes
 
268
        elif None not in changes:
 
269
            # If no change was found relative to one parent, that means the SHA
 
270
            # must have matched the SHA in that parent, so it is not a conflict.
 
271
            yield changes
 
272
 
 
273
 
196
274
_BLOCK_SIZE = 64
197
275
 
198
276
 
287
365
class RenameDetector(object):
288
366
    """Object for handling rename detection between two trees."""
289
367
 
290
 
    def __init__(self, store, tree1_id, tree2_id,
291
 
                 rename_threshold=_RENAME_THRESHOLD, max_files=_MAX_FILES,
292
 
                 rewrite_threshold=_REWRITE_THRESHOLD,
 
368
    def __init__(self, store, rename_threshold=RENAME_THRESHOLD,
 
369
                 max_files=MAX_FILES,
 
370
                 rewrite_threshold=REWRITE_THRESHOLD,
293
371
                 find_copies_harder=False):
294
372
        """Initialize the rename detector.
295
373
 
296
374
        :param store: An ObjectStore for looking up objects.
297
 
        :param tree1_id: The SHA of the first Tree.
298
 
        :param tree2_id: The SHA of the second Tree.
299
375
        :param rename_threshold: The threshold similarity score for considering
300
376
            an add/delete pair to be a rename/copy; see _similarity_score.
301
377
        :param max_files: The maximum number of adds and deletes to consider, or
309
385
        :param find_copies_harder: If True, consider unmodified files when
310
386
            detecting copies.
311
387
        """
312
 
        self._tree1_id = tree1_id
313
 
        self._tree2_id = tree2_id
314
388
        self._store = store
315
389
        self._rename_threshold = rename_threshold
316
390
        self._rewrite_threshold = rewrite_threshold
317
391
        self._max_files = max_files
318
392
        self._find_copies_harder = find_copies_harder
 
393
        self._want_unchanged = False
319
394
 
 
395
    def _reset(self):
320
396
        self._adds = []
321
397
        self._deletes = []
322
398
        self._changes = []
329
405
        new_obj = self._store[change.new.sha]
330
406
        return _similarity_score(old_obj, new_obj) < self._rewrite_threshold
331
407
 
332
 
    def _collect_changes(self):
333
 
        for change in tree_changes(self._store, self._tree1_id, self._tree2_id,
334
 
                                   want_unchanged=self._find_copies_harder):
 
408
    def _collect_changes(self, tree1_id, tree2_id):
 
409
        want_unchanged = self._find_copies_harder or self._want_unchanged
 
410
        for change in tree_changes(self._store, tree1_id, tree2_id,
 
411
                                   want_unchanged=want_unchanged):
335
412
            if change.type == CHANGE_ADD:
336
413
                self._adds.append(change)
337
414
            elif change.type == CHANGE_DELETE:
339
416
            elif self._should_split(change):
340
417
                self._deletes.append(TreeChange.delete(change.old))
341
418
                self._adds.append(TreeChange.add(change.new))
342
 
            elif (self._find_copies_harder and (
343
 
              change.type == CHANGE_MODIFY or change.type == CHANGE_UNCHANGED)):
344
 
                # Treat modified/unchanged as deleted rather than splitting it,
345
 
                # to avoid spurious renames.
 
419
            elif ((self._find_copies_harder and change.type == CHANGE_UNCHANGED)
 
420
                  or change.type == CHANGE_MODIFY):
 
421
                # Treat all modifies as potential deletes for rename detection,
 
422
                # but don't split them (to avoid spurious renames). Setting
 
423
                # find_copies_harder means we treat unchanged the same as
 
424
                # modified.
346
425
                self._deletes.append(change)
347
426
            else:
348
427
                self._changes.append(change)
359
438
        delete_map = defaultdict(list)
360
439
        for delete in self._deletes:
361
440
            # Keep track of whether the delete was actually marked as a delete.
362
 
            # If not, it must have been added due to find_copies_harder, and
363
 
            # needs to be marked as a copy.
 
441
            # If not, it needs to be marked as a copy.
364
442
            is_delete = delete.type == CHANGE_DELETE
365
443
            delete_map[delete.old.sha].append((delete.old, is_delete))
366
444
 
371
449
            for (old, is_delete), new in itertools.izip(sha_deletes, sha_adds):
372
450
                if stat.S_IFMT(old.mode) != stat.S_IFMT(new.mode):
373
451
                    continue
374
 
                delete_paths.add(old.path)
 
452
                if is_delete:
 
453
                    delete_paths.add(old.path)
375
454
                add_paths.add(new.path)
376
455
                new_type = is_delete and CHANGE_RENAME or CHANGE_COPY
377
456
                self._changes.append(TreeChange(new_type, old, new))
385
464
                    self._changes.append(TreeChange(CHANGE_COPY, old, new))
386
465
        self._prune(add_paths, delete_paths)
387
466
 
388
 
    def _find_content_renames(self):
 
467
    def _should_find_content_renames(self):
 
468
        return len(self._adds) * len(self._deletes) <= self._max_files ** 2
 
469
 
 
470
    def _rename_type(self, check_paths, delete, add):
 
471
        if check_paths and delete.old.path == add.new.path:
 
472
            # If the paths match, this must be a split modify, so make sure it
 
473
            # comes out as a modify.
 
474
            return CHANGE_MODIFY
 
475
        elif delete.type != CHANGE_DELETE:
 
476
            # If it's in deletes but not marked as a delete, it must have been
 
477
            # added due to find_copies_harder, and needs to be marked as a copy.
 
478
            return CHANGE_COPY
 
479
        return CHANGE_RENAME
 
480
 
 
481
    def _find_content_rename_candidates(self):
 
482
        candidates = self._candidates = []
389
483
        # TODO: Optimizations:
390
484
        #  - Compare object sizes before counting blocks.
391
485
        #  - Skip if delete's S_IFMT differs from all adds.
392
486
        #  - Skip if adds or deletes is empty.
393
487
        # Match C git's behavior of not attempting to find content renames if
394
488
        # the matrix size exceeds the threshold.
395
 
        if len(self._adds) * len(self._deletes) > self._max_files ** 2:
 
489
        if not self._should_find_content_renames():
396
490
            return
397
491
 
398
492
        check_paths = self._rename_threshold is not None
399
 
        candidates = []
400
493
        for delete in self._deletes:
401
494
            if S_ISGITLINK(delete.old.mode):
402
495
                continue  # Git links don't exist in this repo.
410
503
                score = _similarity_score(old_obj, new_obj,
411
504
                                          block_cache={old_sha: old_blocks})
412
505
                if score > self._rename_threshold:
413
 
                    if check_paths and delete.old.path == add.new.path:
414
 
                        # If the paths match, this must be a split modify, so
415
 
                        # make sure it comes out as a modify.
416
 
                        new_type = CHANGE_MODIFY
417
 
                    elif delete.type != CHANGE_DELETE:
418
 
                        # If it's in deletes but not marked as a delete, it must
419
 
                        # have been added due to find_copies_harder, and needs
420
 
                        # to be marked as a copy.
421
 
                        new_type = CHANGE_COPY
422
 
                    else:
423
 
                        new_type = CHANGE_RENAME
 
506
                    new_type = self._rename_type(check_paths, delete, add)
424
507
                    rename = TreeChange(new_type, delete.old, add.new)
425
508
                    candidates.append((-score, rename))
426
509
 
 
510
    def _choose_content_renames(self):
427
511
        # Sort scores from highest to lowest, but keep names in ascending order.
428
 
        candidates.sort()
 
512
        self._candidates.sort()
429
513
 
430
514
        delete_paths = set()
431
515
        add_paths = set()
432
 
        for _, change in candidates:
 
516
        for _, change in self._candidates:
433
517
            new_path = change.new.path
434
518
            if new_path in add_paths:
435
519
                continue
472
556
        return result
473
557
 
474
558
    def _prune_unchanged(self):
 
559
        if self._want_unchanged:
 
560
            return
475
561
        self._deletes = [d for d in self._deletes if d.type != CHANGE_UNCHANGED]
476
562
 
477
 
    def changes_with_renames(self):
478
 
        """Iterate TreeChanges between the two trees, with rename detection."""
479
 
        self._collect_changes()
 
563
    def changes_with_renames(self, tree1_id, tree2_id, want_unchanged=False):
 
564
        """Iterate TreeChanges between two tree SHAs, with rename detection."""
 
565
        self._reset()
 
566
        self._want_unchanged = want_unchanged
 
567
        self._collect_changes(tree1_id, tree2_id)
480
568
        self._find_exact_renames()
481
 
        self._find_content_renames()
 
569
        self._find_content_rename_candidates()
 
570
        self._choose_content_renames()
482
571
        self._join_modifies()
483
572
        self._prune_unchanged()
484
573
        return self._sorted_changes()