2
This code is derived from jgit (http://eclipse.org/jgit).
3
Copyright owners are documented in jgit's IP log.
5
This program and the accompanying materials are made available
6
under the terms of the Eclipse Distribution License v1.0 which
7
accompanies this distribution, is reproduced below, and is
8
available at http://www.eclipse.org/org/documents/edl-v10.php
12
Redistribution and use in source and binary forms, with or
13
without modification, are permitted provided that the following
16
- Redistributions of source code must retain the above copyright
17
notice, this list of conditions and the following disclaimer.
19
- Redistributions in binary form must reproduce the above
20
copyright notice, this list of conditions and the following
21
disclaimer in the documentation and/or other materials provided
22
with the distribution.
24
- Neither the name of the Eclipse Foundation, Inc. nor the
25
names of its contributors may be used to endorse or promote
26
products derived from this software without specific prior
29
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
30
CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
31
INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
32
OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
33
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
34
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
35
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
36
NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
37
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
38
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
39
STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
40
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
41
ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
45
using System.Collections.Generic;
52
internal class SimilarityRenameDetector
54
/// <summary>Number of bits we need to express an index into src or dst list.</summary>
56
/// Number of bits we need to express an index into src or dst list.
58
/// This must be 28, giving us a limit of 2^28 entries in either list, which
59
/// is an insane limit of 536,870,912 file names being considered in a single
60
/// rename pass. The other 8 bits are used to store the score, while staying
61
/// under 127 so the long doesn't go negative.
63
private const int BITS_PER_INDEX = 28;
65
private const int INDEX_MASK = (1 << BITS_PER_INDEX) - 1;
67
private const int SCORE_SHIFT = 2 * BITS_PER_INDEX;
69
private ContentSource.Pair reader;
71
/// <summary>All sources to consider for copies or renames.</summary>
73
/// All sources to consider for copies or renames.
75
/// A source is typically a
76
/// <see cref="ChangeType.DELETE">ChangeType.DELETE</see>
77
/// change, but could be
78
/// another type when trying to perform copy detection concurrently with
81
private IList<DiffEntry> srcs;
83
/// <summary>All destinations to consider looking for a rename.</summary>
85
/// All destinations to consider looking for a rename.
87
/// A destination is typically an
88
/// <see cref="ChangeType.ADD">ChangeType.ADD</see>
90
/// just come into existence, and we want to discover where its initial
91
/// content came from.
93
private IList<DiffEntry> dsts;
95
/// <summary>Matrix of all examined file pairs, and their scores.</summary>
97
/// Matrix of all examined file pairs, and their scores.
99
/// The upper 8 bits of each long stores the score, but the score is bounded
100
/// to be in the range (0, 128] so that the highest bit is never set, and all
101
/// entries are therefore positive.
103
/// List indexes to an element of
104
/// <see cref="srcs">srcs</see>
106
/// <see cref="dsts">dsts</see>
108
/// as the lower two groups of 28 bits, respectively, but the encoding is
109
/// inverted, so that 0 is expressed as
110
/// <code>(1 << 28) - 1</code>
112
/// lower list indices later in the matrix, giving precedence to files whose
113
/// names sort earlier in the tree.
115
private long[] matrix;
117
/// <summary>Score a pair must exceed to be considered a rename.</summary>
118
/// <remarks>Score a pair must exceed to be considered a rename.</remarks>
119
private int renameScore = 60;
123
/// <see cref="TableFullException">TableFullException</see>
126
private bool tableOverflow;
128
private IList<DiffEntry> @out;
130
internal SimilarityRenameDetector(ContentSource.Pair reader, IList<DiffEntry> srcs
131
, IList<DiffEntry> dsts)
133
this.reader = reader;
138
internal virtual void SetRenameScore(int score)
143
/// <exception cref="System.IO.IOException"></exception>
144
internal virtual void Compute(ProgressMonitor pm)
148
pm = NullProgressMonitor.INSTANCE;
150
pm.BeginTask(JGitText.Get().renamesFindingByContent, 2 * srcs.Count * dsts.Count);
152
int mNext = BuildMatrix(pm);
153
@out = new AList<DiffEntry>(Math.Min(mNext, dsts.Count));
154
// Match rename pairs on a first come, first serve basis until
155
// we have looked at everything that is above our minimum score.
157
for (--mNext; mNext >= 0; mNext--)
159
long ent = matrix[mNext];
160
int sIdx = SrcFile(ent);
161
int dIdx = DstFile(ent);
162
DiffEntry s = srcs[sIdx];
163
DiffEntry d = dsts[dIdx];
169
// was already matched earlier
170
DiffEntry.ChangeType type;
171
if (s.changeType == DiffEntry.ChangeType.DELETE)
173
// First use of this source file. Tag it as a rename so we
174
// later know it is already been used as a rename, other
175
// matches (if any) will claim themselves as copies instead.
177
s.changeType = DiffEntry.ChangeType.RENAME;
178
type = DiffEntry.ChangeType.RENAME;
182
type = DiffEntry.ChangeType.COPY;
184
@out.AddItem(DiffEntry.Pair(type, s, d, Score(ent)));
185
dsts.Set(dIdx, null);
186
// Claim the destination was matched.
189
srcs = CompactSrcList(srcs);
190
dsts = CompactDstList(dsts);
194
internal virtual IList<DiffEntry> GetMatches()
199
internal virtual IList<DiffEntry> GetLeftOverSources()
204
internal virtual IList<DiffEntry> GetLeftOverDestinations()
209
internal virtual bool IsTableOverflow()
211
return tableOverflow;
214
private static IList<DiffEntry> CompactSrcList(IList<DiffEntry> @in)
216
AList<DiffEntry> r = new AList<DiffEntry>(@in.Count);
217
foreach (DiffEntry e in @in)
219
if (e.changeType == DiffEntry.ChangeType.DELETE)
227
private static IList<DiffEntry> CompactDstList(IList<DiffEntry> @in)
229
AList<DiffEntry> r = new AList<DiffEntry>(@in.Count);
230
foreach (DiffEntry e in @in)
240
/// <exception cref="System.IO.IOException"></exception>
241
private int BuildMatrix(ProgressMonitor pm)
243
// Allocate for the worst-case scenario where every pair has a
244
// score that we need to consider. We might not need that many.
246
matrix = new long[srcs.Count * dsts.Count];
247
long[] srcSizes = new long[srcs.Count];
248
long[] dstSizes = new long[dsts.Count];
249
BitSet dstTooLarge = null;
250
// Consider each pair of files, if the score is above the minimum
251
// threshold we need record that scoring in the matrix so we can
252
// later find the best matches.
255
for (int srcIdx = 0; srcIdx < srcs.Count; srcIdx++)
257
DiffEntry srcEnt = srcs[srcIdx];
258
if (!IsFile(srcEnt.oldMode))
260
pm.Update(dsts.Count);
263
SimilarityIndex s = null;
264
for (int dstIdx = 0; dstIdx < dsts.Count; dstIdx++)
266
DiffEntry dstEnt = dsts[dstIdx];
267
if (!IsFile(dstEnt.newMode))
272
if (!RenameDetector.SameType(srcEnt.oldMode, dstEnt.newMode))
277
if (dstTooLarge != null && dstTooLarge.Get(dstIdx))
282
long srcSize = srcSizes[srcIdx];
285
srcSize = Size(DiffEntry.Side.OLD, srcEnt) + 1;
286
srcSizes[srcIdx] = srcSize;
288
long dstSize = dstSizes[dstIdx];
291
dstSize = Size(DiffEntry.Side.NEW, dstEnt) + 1;
292
dstSizes[dstIdx] = dstSize;
294
long max = Math.Max(srcSize, dstSize);
295
long min = Math.Min(srcSize, dstSize);
296
if (min * 100 / max < renameScore)
298
// Cannot possibly match, as the file sizes are so different
306
s = Hash(DiffEntry.Side.OLD, srcEnt);
308
catch (SimilarityIndex.TableFullException)
310
tableOverflow = true;
317
d = Hash(DiffEntry.Side.NEW, dstEnt);
319
catch (SimilarityIndex.TableFullException)
321
if (dstTooLarge == null)
323
dstTooLarge = new BitSet(dsts.Count);
325
dstTooLarge.Set(dstIdx);
326
tableOverflow = true;
330
int contentScore = s.Score(d, 10000);
331
// nameScore returns a value between 0 and 100, but we want it
332
// to be in the same range as the content score. This allows it
333
// to be dropped into the pretty formula for the final score.
334
int nameScore = NameScore(srcEnt.oldPath, dstEnt.newPath) * 100;
335
int score = (contentScore * 99 + nameScore * 1) / 10000;
336
if (score < renameScore)
341
matrix[mNext++] = Encode(score, srcIdx, dstIdx);
347
// Sort everything in the range we populated, which might be the
348
// entire matrix, or just a smaller slice if we had some bad low
351
Arrays.Sort(matrix, 0, mNext);
355
internal static int NameScore(string a, string b)
357
int aDirLen = a.LastIndexOf("/") + 1;
358
int bDirLen = b.LastIndexOf("/") + 1;
359
int dirMin = Math.Min(aDirLen, bDirLen);
360
int dirMax = Math.Max(aDirLen, bDirLen);
371
for (; dirSim < dirMin; dirSim++)
373
if (a[dirSim] != b[dirSim])
378
dirScoreLtr = (dirSim * 100) / dirMax;
379
if (dirScoreLtr == 100)
385
for (dirSim = 0; dirSim < dirMin; dirSim++)
387
if (a[aDirLen - 1 - dirSim] != b[bDirLen - 1 - dirSim])
392
dirScoreRtl = (dirSim * 100) / dirMax;
395
int fileMin = Math.Min(a.Length - aDirLen, b.Length - bDirLen);
396
int fileMax = Math.Max(a.Length - aDirLen, b.Length - bDirLen);
398
for (; fileSim < fileMin; fileSim++)
400
if (a[a.Length - 1 - fileSim] != b[b.Length - 1 - fileSim])
405
int fileScore = (fileSim * 100) / fileMax;
406
return (((dirScoreLtr + dirScoreRtl) * 25) + (fileScore * 50)) / 100;
409
/// <exception cref="System.IO.IOException"></exception>
410
/// <exception cref="NGit.Diff.SimilarityIndex.TableFullException"></exception>
411
private SimilarityIndex Hash(DiffEntry.Side side, DiffEntry ent)
413
SimilarityIndex r = new SimilarityIndex();
414
r.Hash(reader.Open(side, ent));
419
/// <exception cref="System.IO.IOException"></exception>
420
private long Size(DiffEntry.Side side, DiffEntry ent)
422
return reader.Size(side, ent);
425
private static int Score(long value)
427
return (int)((long)(((ulong)value) >> SCORE_SHIFT));
430
internal static int SrcFile(long value)
432
return DecodeFile(((int)((long)(((ulong)value) >> BITS_PER_INDEX))) & INDEX_MASK);
435
internal static int DstFile(long value)
437
return DecodeFile(((int)value) & INDEX_MASK);
440
internal static long Encode(int score, int srcIdx, int dstIdx)
442
return (((long)score) << SCORE_SHIFT) | (EncodeFile(srcIdx) << BITS_PER_INDEX) |
448
private static long EncodeFile(int idx)
450
// We invert the index so that the first file in the list sorts
451
// later in the table. This permits us to break ties favoring
452
// earlier names over later ones.
454
return INDEX_MASK - idx;
457
private static int DecodeFile(int v)
459
return INDEX_MASK - v;
462
private static bool IsFile(FileMode mode)
464
return (mode.GetBits() & FileMode.TYPE_MASK) == FileMode.TYPE_FILE;