~ubuntu-branches/ubuntu/oneiric/monodevelop/oneiric

« back to all changes in this revision

Viewing changes to contrib/NGit/NGit.Diff/SimilarityRenameDetector.cs

  • Committer: Bazaar Package Importer
  • Author(s): Jo Shields
  • Date: 2011-06-27 17:03:13 UTC
  • mto: (1.8.1 upstream)
  • mto: This revision was merged to the branch mainline in revision 54.
  • Revision ID: james.westby@ubuntu.com-20110627170313-6cvz3s19x6e9hqe9
ImportĀ upstreamĀ versionĀ 2.5.92+dfsg

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
/*
 
2
This code is derived from jgit (http://eclipse.org/jgit).
 
3
Copyright owners are documented in jgit's IP log.
 
4
 
 
5
This program and the accompanying materials are made available
 
6
under the terms of the Eclipse Distribution License v1.0 which
 
7
accompanies this distribution, is reproduced below, and is
 
8
available at http://www.eclipse.org/org/documents/edl-v10.php
 
9
 
 
10
All rights reserved.
 
11
 
 
12
Redistribution and use in source and binary forms, with or
 
13
without modification, are permitted provided that the following
 
14
conditions are met:
 
15
 
 
16
- Redistributions of source code must retain the above copyright
 
17
  notice, this list of conditions and the following disclaimer.
 
18
 
 
19
- Redistributions in binary form must reproduce the above
 
20
  copyright notice, this list of conditions and the following
 
21
  disclaimer in the documentation and/or other materials provided
 
22
  with the distribution.
 
23
 
 
24
- Neither the name of the Eclipse Foundation, Inc. nor the
 
25
  names of its contributors may be used to endorse or promote
 
26
  products derived from this software without specific prior
 
27
  written permission.
 
28
 
 
29
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
 
30
CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
 
31
INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 
32
OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 
33
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 
34
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 
35
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 
36
NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 
37
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 
38
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 
39
STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 
40
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 
41
ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
42
*/
 
43
 
 
44
using System;
 
45
using System.Collections.Generic;
 
46
using NGit;
 
47
using NGit.Diff;
 
48
using Sharpen;
 
49
 
 
50
namespace NGit.Diff
 
51
{
 
52
        internal class SimilarityRenameDetector
 
53
        {
 
54
                /// <summary>Number of bits we need to express an index into src or dst list.</summary>
 
55
                /// <remarks>
 
56
                /// Number of bits we need to express an index into src or dst list.
 
57
                /// <p>
 
58
                /// This must be 28, giving us a limit of 2^28 entries in either list, which
 
59
                /// is an insane limit of 536,870,912 file names being considered in a single
 
60
                /// rename pass. The other 8 bits are used to store the score, while staying
 
61
                /// under 127 so the long doesn't go negative.
 
62
                /// </remarks>
 
63
                private const int BITS_PER_INDEX = 28;
 
64
 
 
65
                private const int INDEX_MASK = (1 << BITS_PER_INDEX) - 1;
 
66
 
 
67
                private const int SCORE_SHIFT = 2 * BITS_PER_INDEX;
 
68
 
 
69
                private ContentSource.Pair reader;
 
70
 
 
71
                /// <summary>All sources to consider for copies or renames.</summary>
 
72
                /// <remarks>
 
73
                /// All sources to consider for copies or renames.
 
74
                /// <p>
 
75
                /// A source is typically a
 
76
                /// <see cref="ChangeType.DELETE">ChangeType.DELETE</see>
 
77
                /// change, but could be
 
78
                /// another type when trying to perform copy detection concurrently with
 
79
                /// rename detection.
 
80
                /// </remarks>
 
81
                private IList<DiffEntry> srcs;
 
82
 
 
83
                /// <summary>All destinations to consider looking for a rename.</summary>
 
84
                /// <remarks>
 
85
                /// All destinations to consider looking for a rename.
 
86
                /// <p>
 
87
                /// A destination is typically an
 
88
                /// <see cref="ChangeType.ADD">ChangeType.ADD</see>
 
89
                /// , as the name has
 
90
                /// just come into existence, and we want to discover where its initial
 
91
                /// content came from.
 
92
                /// </remarks>
 
93
                private IList<DiffEntry> dsts;
 
94
 
 
95
                /// <summary>Matrix of all examined file pairs, and their scores.</summary>
 
96
                /// <remarks>
 
97
                /// Matrix of all examined file pairs, and their scores.
 
98
                /// <p>
 
99
                /// The upper 8 bits of each long stores the score, but the score is bounded
 
100
                /// to be in the range (0, 128] so that the highest bit is never set, and all
 
101
                /// entries are therefore positive.
 
102
                /// <p>
 
103
                /// List indexes to an element of
 
104
                /// <see cref="srcs">srcs</see>
 
105
                /// and
 
106
                /// <see cref="dsts">dsts</see>
 
107
                /// are encoded
 
108
                /// as the lower two groups of 28 bits, respectively, but the encoding is
 
109
                /// inverted, so that 0 is expressed as
 
110
                /// <code>(1 &lt;&lt; 28) - 1</code>
 
111
                /// . This sorts
 
112
                /// lower list indices later in the matrix, giving precedence to files whose
 
113
                /// names sort earlier in the tree.
 
114
                /// </remarks>
 
115
                private long[] matrix;
 
116
 
 
117
                /// <summary>Score a pair must exceed to be considered a rename.</summary>
 
118
                /// <remarks>Score a pair must exceed to be considered a rename.</remarks>
 
119
                private int renameScore = 60;
 
120
 
 
121
                /// <summary>
 
122
                /// Set if any
 
123
                /// <see cref="TableFullException">TableFullException</see>
 
124
                /// occurs.
 
125
                /// </summary>
 
126
                private bool tableOverflow;
 
127
 
 
128
                private IList<DiffEntry> @out;
 
129
 
 
130
                internal SimilarityRenameDetector(ContentSource.Pair reader, IList<DiffEntry> srcs
 
131
                        , IList<DiffEntry> dsts)
 
132
                {
 
133
                        this.reader = reader;
 
134
                        this.srcs = srcs;
 
135
                        this.dsts = dsts;
 
136
                }
 
137
 
 
138
                internal virtual void SetRenameScore(int score)
 
139
                {
 
140
                        renameScore = score;
 
141
                }
 
142
 
 
143
                /// <exception cref="System.IO.IOException"></exception>
 
144
                internal virtual void Compute(ProgressMonitor pm)
 
145
                {
 
146
                        if (pm == null)
 
147
                        {
 
148
                                pm = NullProgressMonitor.INSTANCE;
 
149
                        }
 
150
                        pm.BeginTask(JGitText.Get().renamesFindingByContent, 2 * srcs.Count * dsts.Count);
 
151
                        //
 
152
                        int mNext = BuildMatrix(pm);
 
153
                        @out = new AList<DiffEntry>(Math.Min(mNext, dsts.Count));
 
154
                        // Match rename pairs on a first come, first serve basis until
 
155
                        // we have looked at everything that is above our minimum score.
 
156
                        //
 
157
                        for (--mNext; mNext >= 0; mNext--)
 
158
                        {
 
159
                                long ent = matrix[mNext];
 
160
                                int sIdx = SrcFile(ent);
 
161
                                int dIdx = DstFile(ent);
 
162
                                DiffEntry s = srcs[sIdx];
 
163
                                DiffEntry d = dsts[dIdx];
 
164
                                if (d == null)
 
165
                                {
 
166
                                        pm.Update(1);
 
167
                                        continue;
 
168
                                }
 
169
                                // was already matched earlier
 
170
                                DiffEntry.ChangeType type;
 
171
                                if (s.changeType == DiffEntry.ChangeType.DELETE)
 
172
                                {
 
173
                                        // First use of this source file. Tag it as a rename so we
 
174
                                        // later know it is already been used as a rename, other
 
175
                                        // matches (if any) will claim themselves as copies instead.
 
176
                                        //
 
177
                                        s.changeType = DiffEntry.ChangeType.RENAME;
 
178
                                        type = DiffEntry.ChangeType.RENAME;
 
179
                                }
 
180
                                else
 
181
                                {
 
182
                                        type = DiffEntry.ChangeType.COPY;
 
183
                                }
 
184
                                @out.AddItem(DiffEntry.Pair(type, s, d, Score(ent)));
 
185
                                dsts.Set(dIdx, null);
 
186
                                // Claim the destination was matched.
 
187
                                pm.Update(1);
 
188
                        }
 
189
                        srcs = CompactSrcList(srcs);
 
190
                        dsts = CompactDstList(dsts);
 
191
                        pm.EndTask();
 
192
                }
 
193
 
 
194
                internal virtual IList<DiffEntry> GetMatches()
 
195
                {
 
196
                        return @out;
 
197
                }
 
198
 
 
199
                internal virtual IList<DiffEntry> GetLeftOverSources()
 
200
                {
 
201
                        return srcs;
 
202
                }
 
203
 
 
204
                internal virtual IList<DiffEntry> GetLeftOverDestinations()
 
205
                {
 
206
                        return dsts;
 
207
                }
 
208
 
 
209
                internal virtual bool IsTableOverflow()
 
210
                {
 
211
                        return tableOverflow;
 
212
                }
 
213
 
 
214
                private static IList<DiffEntry> CompactSrcList(IList<DiffEntry> @in)
 
215
                {
 
216
                        AList<DiffEntry> r = new AList<DiffEntry>(@in.Count);
 
217
                        foreach (DiffEntry e in @in)
 
218
                        {
 
219
                                if (e.changeType == DiffEntry.ChangeType.DELETE)
 
220
                                {
 
221
                                        r.AddItem(e);
 
222
                                }
 
223
                        }
 
224
                        return r;
 
225
                }
 
226
 
 
227
                private static IList<DiffEntry> CompactDstList(IList<DiffEntry> @in)
 
228
                {
 
229
                        AList<DiffEntry> r = new AList<DiffEntry>(@in.Count);
 
230
                        foreach (DiffEntry e in @in)
 
231
                        {
 
232
                                if (e != null)
 
233
                                {
 
234
                                        r.AddItem(e);
 
235
                                }
 
236
                        }
 
237
                        return r;
 
238
                }
 
239
 
 
240
                /// <exception cref="System.IO.IOException"></exception>
 
241
                private int BuildMatrix(ProgressMonitor pm)
 
242
                {
 
243
                        // Allocate for the worst-case scenario where every pair has a
 
244
                        // score that we need to consider. We might not need that many.
 
245
                        //
 
246
                        matrix = new long[srcs.Count * dsts.Count];
 
247
                        long[] srcSizes = new long[srcs.Count];
 
248
                        long[] dstSizes = new long[dsts.Count];
 
249
                        BitSet dstTooLarge = null;
 
250
                        // Consider each pair of files, if the score is above the minimum
 
251
                        // threshold we need record that scoring in the matrix so we can
 
252
                        // later find the best matches.
 
253
                        //
 
254
                        int mNext = 0;
 
255
                        for (int srcIdx = 0; srcIdx < srcs.Count; srcIdx++)
 
256
                        {
 
257
                                DiffEntry srcEnt = srcs[srcIdx];
 
258
                                if (!IsFile(srcEnt.oldMode))
 
259
                                {
 
260
                                        pm.Update(dsts.Count);
 
261
                                        continue;
 
262
                                }
 
263
                                SimilarityIndex s = null;
 
264
                                for (int dstIdx = 0; dstIdx < dsts.Count; dstIdx++)
 
265
                                {
 
266
                                        DiffEntry dstEnt = dsts[dstIdx];
 
267
                                        if (!IsFile(dstEnt.newMode))
 
268
                                        {
 
269
                                                pm.Update(1);
 
270
                                                continue;
 
271
                                        }
 
272
                                        if (!RenameDetector.SameType(srcEnt.oldMode, dstEnt.newMode))
 
273
                                        {
 
274
                                                pm.Update(1);
 
275
                                                continue;
 
276
                                        }
 
277
                                        if (dstTooLarge != null && dstTooLarge.Get(dstIdx))
 
278
                                        {
 
279
                                                pm.Update(1);
 
280
                                                continue;
 
281
                                        }
 
282
                                        long srcSize = srcSizes[srcIdx];
 
283
                                        if (srcSize == 0)
 
284
                                        {
 
285
                                                srcSize = Size(DiffEntry.Side.OLD, srcEnt) + 1;
 
286
                                                srcSizes[srcIdx] = srcSize;
 
287
                                        }
 
288
                                        long dstSize = dstSizes[dstIdx];
 
289
                                        if (dstSize == 0)
 
290
                                        {
 
291
                                                dstSize = Size(DiffEntry.Side.NEW, dstEnt) + 1;
 
292
                                                dstSizes[dstIdx] = dstSize;
 
293
                                        }
 
294
                                        long max = Math.Max(srcSize, dstSize);
 
295
                                        long min = Math.Min(srcSize, dstSize);
 
296
                                        if (min * 100 / max < renameScore)
 
297
                                        {
 
298
                                                // Cannot possibly match, as the file sizes are so different
 
299
                                                pm.Update(1);
 
300
                                                continue;
 
301
                                        }
 
302
                                        if (s == null)
 
303
                                        {
 
304
                                                try
 
305
                                                {
 
306
                                                        s = Hash(DiffEntry.Side.OLD, srcEnt);
 
307
                                                }
 
308
                                                catch (SimilarityIndex.TableFullException)
 
309
                                                {
 
310
                                                        tableOverflow = true;
 
311
                                                        goto SRC_continue;
 
312
                                                }
 
313
                                        }
 
314
                                        SimilarityIndex d;
 
315
                                        try
 
316
                                        {
 
317
                                                d = Hash(DiffEntry.Side.NEW, dstEnt);
 
318
                                        }
 
319
                                        catch (SimilarityIndex.TableFullException)
 
320
                                        {
 
321
                                                if (dstTooLarge == null)
 
322
                                                {
 
323
                                                        dstTooLarge = new BitSet(dsts.Count);
 
324
                                                }
 
325
                                                dstTooLarge.Set(dstIdx);
 
326
                                                tableOverflow = true;
 
327
                                                pm.Update(1);
 
328
                                                continue;
 
329
                                        }
 
330
                                        int contentScore = s.Score(d, 10000);
 
331
                                        // nameScore returns a value between 0 and 100, but we want it
 
332
                                        // to be in the same range as the content score. This allows it
 
333
                                        // to be dropped into the pretty formula for the final score.
 
334
                                        int nameScore = NameScore(srcEnt.oldPath, dstEnt.newPath) * 100;
 
335
                                        int score = (contentScore * 99 + nameScore * 1) / 10000;
 
336
                                        if (score < renameScore)
 
337
                                        {
 
338
                                                pm.Update(1);
 
339
                                                continue;
 
340
                                        }
 
341
                                        matrix[mNext++] = Encode(score, srcIdx, dstIdx);
 
342
                                        pm.Update(1);
 
343
                                }
 
344
SRC_continue: ;
 
345
                        }
 
346
SRC_break: ;
 
347
                        // Sort everything in the range we populated, which might be the
 
348
                        // entire matrix, or just a smaller slice if we had some bad low
 
349
                        // scoring pairs.
 
350
                        //
 
351
                        Arrays.Sort(matrix, 0, mNext);
 
352
                        return mNext;
 
353
                }
 
354
 
 
355
                internal static int NameScore(string a, string b)
 
356
                {
 
357
                        int aDirLen = a.LastIndexOf("/") + 1;
 
358
                        int bDirLen = b.LastIndexOf("/") + 1;
 
359
                        int dirMin = Math.Min(aDirLen, bDirLen);
 
360
                        int dirMax = Math.Max(aDirLen, bDirLen);
 
361
                        int dirScoreLtr;
 
362
                        int dirScoreRtl;
 
363
                        if (dirMax == 0)
 
364
                        {
 
365
                                dirScoreLtr = 100;
 
366
                                dirScoreRtl = 100;
 
367
                        }
 
368
                        else
 
369
                        {
 
370
                                int dirSim = 0;
 
371
                                for (; dirSim < dirMin; dirSim++)
 
372
                                {
 
373
                                        if (a[dirSim] != b[dirSim])
 
374
                                        {
 
375
                                                break;
 
376
                                        }
 
377
                                }
 
378
                                dirScoreLtr = (dirSim * 100) / dirMax;
 
379
                                if (dirScoreLtr == 100)
 
380
                                {
 
381
                                        dirScoreRtl = 100;
 
382
                                }
 
383
                                else
 
384
                                {
 
385
                                        for (dirSim = 0; dirSim < dirMin; dirSim++)
 
386
                                        {
 
387
                                                if (a[aDirLen - 1 - dirSim] != b[bDirLen - 1 - dirSim])
 
388
                                                {
 
389
                                                        break;
 
390
                                                }
 
391
                                        }
 
392
                                        dirScoreRtl = (dirSim * 100) / dirMax;
 
393
                                }
 
394
                        }
 
395
                        int fileMin = Math.Min(a.Length - aDirLen, b.Length - bDirLen);
 
396
                        int fileMax = Math.Max(a.Length - aDirLen, b.Length - bDirLen);
 
397
                        int fileSim = 0;
 
398
                        for (; fileSim < fileMin; fileSim++)
 
399
                        {
 
400
                                if (a[a.Length - 1 - fileSim] != b[b.Length - 1 - fileSim])
 
401
                                {
 
402
                                        break;
 
403
                                }
 
404
                        }
 
405
                        int fileScore = (fileSim * 100) / fileMax;
 
406
                        return (((dirScoreLtr + dirScoreRtl) * 25) + (fileScore * 50)) / 100;
 
407
                }
 
408
 
 
409
                /// <exception cref="System.IO.IOException"></exception>
 
410
                /// <exception cref="NGit.Diff.SimilarityIndex.TableFullException"></exception>
 
411
                private SimilarityIndex Hash(DiffEntry.Side side, DiffEntry ent)
 
412
                {
 
413
                        SimilarityIndex r = new SimilarityIndex();
 
414
                        r.Hash(reader.Open(side, ent));
 
415
                        r.Sort();
 
416
                        return r;
 
417
                }
 
418
 
 
419
                /// <exception cref="System.IO.IOException"></exception>
 
420
                private long Size(DiffEntry.Side side, DiffEntry ent)
 
421
                {
 
422
                        return reader.Size(side, ent);
 
423
                }
 
424
 
 
425
                private static int Score(long value)
 
426
                {
 
427
                        return (int)((long)(((ulong)value) >> SCORE_SHIFT));
 
428
                }
 
429
 
 
430
                internal static int SrcFile(long value)
 
431
                {
 
432
                        return DecodeFile(((int)((long)(((ulong)value) >> BITS_PER_INDEX))) & INDEX_MASK);
 
433
                }
 
434
 
 
435
                internal static int DstFile(long value)
 
436
                {
 
437
                        return DecodeFile(((int)value) & INDEX_MASK);
 
438
                }
 
439
 
 
440
                internal static long Encode(int score, int srcIdx, int dstIdx)
 
441
                {
 
442
                        return (((long)score) << SCORE_SHIFT) | (EncodeFile(srcIdx) << BITS_PER_INDEX) | 
 
443
                                EncodeFile(dstIdx);
 
444
                }
 
445
 
 
446
                //
 
447
                //
 
448
                private static long EncodeFile(int idx)
 
449
                {
 
450
                        // We invert the index so that the first file in the list sorts
 
451
                        // later in the table. This permits us to break ties favoring
 
452
                        // earlier names over later ones.
 
453
                        //
 
454
                        return INDEX_MASK - idx;
 
455
                }
 
456
 
 
457
                private static int DecodeFile(int v)
 
458
                {
 
459
                        return INDEX_MASK - v;
 
460
                }
 
461
 
 
462
                private static bool IsFile(FileMode mode)
 
463
                {
 
464
                        return (mode.GetBits() & FileMode.TYPE_MASK) == FileMode.TYPE_FILE;
 
465
                }
 
466
        }
 
467
}