39
43
CHANGE_COPY = 'copy'
40
44
CHANGE_UNCHANGED = 'unchanged'
46
RENAME_CHANGE_TYPES = (CHANGE_RENAME, CHANGE_COPY)
42
48
_NULL_ENTRY = TreeEntry(None, None, None)
45
_RENAME_THRESHOLD = 60
47
_REWRITE_THRESHOLD = None
50
class TreeChange(TreeChangeTuple):
51
"""Class encapsulating a single change between two trees."""
53
REWRITE_THRESHOLD = None
56
class TreeChange(namedtuple('TreeChange', ['type', 'old', 'new'])):
57
"""Named tuple a single change between two trees."""
155
def tree_changes(store, tree1_id, tree2_id, want_unchanged=False):
161
def tree_changes(store, tree1_id, tree2_id, want_unchanged=False,
162
rename_detector=None):
156
163
"""Find the differences between the contents of two trees.
158
165
:param store: An ObjectStore for looking up objects.
160
167
:param tree2_id: The SHA of the target tree.
161
168
:param want_unchanged: If True, include TreeChanges for unmodified entries
170
:param rename_detector: RenameDetector object for detecting renames.
163
171
:return: Iterator over TreeChange instances for each change between the
164
172
source and target tree.
174
if (rename_detector is not None and tree1_id is not None and
175
tree2_id is not None):
176
for change in rename_detector.changes_with_renames(
177
tree1_id, tree2_id, want_unchanged=want_unchanged):
166
181
entries = walk_trees(store, tree1_id, tree2_id,
167
182
prune_identical=(not want_unchanged))
168
183
for entry1, entry2 in entries:
193
208
yield TreeChange(change_type, entry1, entry2)
211
def _all_eq(seq, key, value):
218
def _all_same(seq, key):
219
return _all_eq(seq[1:], key, key(seq[0]))
222
def tree_changes_for_merge(store, parent_tree_ids, tree_id,
223
rename_detector=None):
224
"""Get the tree changes for a merge tree relative to all its parents.
226
:param store: An ObjectStore for looking up objects.
227
:param parent_tree_ids: An iterable of the SHAs of the parent trees.
228
:param tree_id: The SHA of the merge tree.
229
:param rename_detector: RenameDetector object for detecting renames.
231
:yield: Lists of TreeChange objects, one per conflicted path in the merge.
233
Each list contains one element per parent, with the TreeChange for that
234
path relative to that parent. An element may be None if it never existed
235
in one parent and was deleted in two others.
237
A path is only included in the output if it is a conflict, i.e. its SHA
238
in the merge tree is not found in any of the parents, or in the case of
239
deletes, if not all of the old SHAs match.
241
all_parent_changes = [tree_changes(store, t, tree_id,
242
rename_detector=rename_detector)
243
for t in parent_tree_ids]
244
num_parents = len(parent_tree_ids)
245
changes_by_path = defaultdict(lambda: [None] * num_parents)
248
for i, parent_changes in enumerate(all_parent_changes):
249
for change in parent_changes:
250
if change.type == CHANGE_DELETE:
251
path = change.old.path
253
path = change.new.path
254
changes_by_path[path][i] = change
256
old_sha = lambda c: c.old.sha
257
change_type = lambda c: c.type
259
# Yield only conflicting changes.
260
for _, changes in sorted(changes_by_path.iteritems()):
261
assert len(changes) == num_parents
262
have = [c for c in changes if c is not None]
263
if _all_eq(have, change_type, CHANGE_DELETE):
264
if not _all_same(have, old_sha):
266
elif not _all_same(have, change_type):
268
elif None not in changes:
269
# If no change was found relative to one parent, that means the SHA
270
# must have matched the SHA in that parent, so it is not a conflict.
287
365
class RenameDetector(object):
288
366
"""Object for handling rename detection between two trees."""
290
def __init__(self, store, tree1_id, tree2_id,
291
rename_threshold=_RENAME_THRESHOLD, max_files=_MAX_FILES,
292
rewrite_threshold=_REWRITE_THRESHOLD,
368
def __init__(self, store, rename_threshold=RENAME_THRESHOLD,
370
rewrite_threshold=REWRITE_THRESHOLD,
293
371
find_copies_harder=False):
294
372
"""Initialize the rename detector.
296
374
:param store: An ObjectStore for looking up objects.
297
:param tree1_id: The SHA of the first Tree.
298
:param tree2_id: The SHA of the second Tree.
299
375
:param rename_threshold: The threshold similarity score for considering
300
376
an add/delete pair to be a rename/copy; see _similarity_score.
301
377
:param max_files: The maximum number of adds and deletes to consider, or
309
385
:param find_copies_harder: If True, consider unmodified files when
310
386
detecting copies.
312
self._tree1_id = tree1_id
313
self._tree2_id = tree2_id
314
388
self._store = store
315
389
self._rename_threshold = rename_threshold
316
390
self._rewrite_threshold = rewrite_threshold
317
391
self._max_files = max_files
318
392
self._find_copies_harder = find_copies_harder
393
self._want_unchanged = False
321
397
self._deletes = []
322
398
self._changes = []
329
405
new_obj = self._store[change.new.sha]
330
406
return _similarity_score(old_obj, new_obj) < self._rewrite_threshold
332
def _collect_changes(self):
333
for change in tree_changes(self._store, self._tree1_id, self._tree2_id,
334
want_unchanged=self._find_copies_harder):
408
def _collect_changes(self, tree1_id, tree2_id):
409
want_unchanged = self._find_copies_harder or self._want_unchanged
410
for change in tree_changes(self._store, tree1_id, tree2_id,
411
want_unchanged=want_unchanged):
335
412
if change.type == CHANGE_ADD:
336
413
self._adds.append(change)
337
414
elif change.type == CHANGE_DELETE:
339
416
elif self._should_split(change):
340
417
self._deletes.append(TreeChange.delete(change.old))
341
418
self._adds.append(TreeChange.add(change.new))
342
elif (self._find_copies_harder and (
343
change.type == CHANGE_MODIFY or change.type == CHANGE_UNCHANGED)):
344
# Treat modified/unchanged as deleted rather than splitting it,
345
# to avoid spurious renames.
419
elif ((self._find_copies_harder and change.type == CHANGE_UNCHANGED)
420
or change.type == CHANGE_MODIFY):
421
# Treat all modifies as potential deletes for rename detection,
422
# but don't split them (to avoid spurious renames). Setting
423
# find_copies_harder means we treat unchanged the same as
346
425
self._deletes.append(change)
348
427
self._changes.append(change)
359
438
delete_map = defaultdict(list)
360
439
for delete in self._deletes:
361
440
# Keep track of whether the delete was actually marked as a delete.
362
# If not, it must have been added due to find_copies_harder, and
363
# needs to be marked as a copy.
441
# If not, it needs to be marked as a copy.
364
442
is_delete = delete.type == CHANGE_DELETE
365
443
delete_map[delete.old.sha].append((delete.old, is_delete))
371
449
for (old, is_delete), new in itertools.izip(sha_deletes, sha_adds):
372
450
if stat.S_IFMT(old.mode) != stat.S_IFMT(new.mode):
374
delete_paths.add(old.path)
453
delete_paths.add(old.path)
375
454
add_paths.add(new.path)
376
455
new_type = is_delete and CHANGE_RENAME or CHANGE_COPY
377
456
self._changes.append(TreeChange(new_type, old, new))
385
464
self._changes.append(TreeChange(CHANGE_COPY, old, new))
386
465
self._prune(add_paths, delete_paths)
388
def _find_content_renames(self):
467
def _should_find_content_renames(self):
468
return len(self._adds) * len(self._deletes) <= self._max_files ** 2
470
def _rename_type(self, check_paths, delete, add):
471
if check_paths and delete.old.path == add.new.path:
472
# If the paths match, this must be a split modify, so make sure it
473
# comes out as a modify.
475
elif delete.type != CHANGE_DELETE:
476
# If it's in deletes but not marked as a delete, it must have been
477
# added due to find_copies_harder, and needs to be marked as a copy.
481
def _find_content_rename_candidates(self):
482
candidates = self._candidates = []
389
483
# TODO: Optimizations:
390
484
# - Compare object sizes before counting blocks.
391
485
# - Skip if delete's S_IFMT differs from all adds.
392
486
# - Skip if adds or deletes is empty.
393
487
# Match C git's behavior of not attempting to find content renames if
394
488
# the matrix size exceeds the threshold.
395
if len(self._adds) * len(self._deletes) > self._max_files ** 2:
489
if not self._should_find_content_renames():
398
492
check_paths = self._rename_threshold is not None
400
493
for delete in self._deletes:
401
494
if S_ISGITLINK(delete.old.mode):
402
495
continue # Git links don't exist in this repo.
410
503
score = _similarity_score(old_obj, new_obj,
411
504
block_cache={old_sha: old_blocks})
412
505
if score > self._rename_threshold:
413
if check_paths and delete.old.path == add.new.path:
414
# If the paths match, this must be a split modify, so
415
# make sure it comes out as a modify.
416
new_type = CHANGE_MODIFY
417
elif delete.type != CHANGE_DELETE:
418
# If it's in deletes but not marked as a delete, it must
419
# have been added due to find_copies_harder, and needs
420
# to be marked as a copy.
421
new_type = CHANGE_COPY
423
new_type = CHANGE_RENAME
506
new_type = self._rename_type(check_paths, delete, add)
424
507
rename = TreeChange(new_type, delete.old, add.new)
425
508
candidates.append((-score, rename))
510
def _choose_content_renames(self):
427
511
# Sort scores from highest to lowest, but keep names in ascending order.
512
self._candidates.sort()
430
514
delete_paths = set()
431
515
add_paths = set()
432
for _, change in candidates:
516
for _, change in self._candidates:
433
517
new_path = change.new.path
434
518
if new_path in add_paths:
474
558
def _prune_unchanged(self):
559
if self._want_unchanged:
475
561
self._deletes = [d for d in self._deletes if d.type != CHANGE_UNCHANGED]
477
def changes_with_renames(self):
478
"""Iterate TreeChanges between the two trees, with rename detection."""
479
self._collect_changes()
563
def changes_with_renames(self, tree1_id, tree2_id, want_unchanged=False):
564
"""Iterate TreeChanges between two tree SHAs, with rename detection."""
566
self._want_unchanged = want_unchanged
567
self._collect_changes(tree1_id, tree2_id)
480
568
self._find_exact_renames()
481
self._find_content_renames()
569
self._find_content_rename_candidates()
570
self._choose_content_renames()
482
571
self._join_modifies()
483
572
self._prune_unchanged()
484
573
return self._sorted_changes()