~ubuntu-branches/ubuntu/oneiric/monodevelop/oneiric

« back to all changes in this revision

Viewing changes to contrib/NGit/NGit.Diff/SimilarityIndex.cs

  • Committer: Bazaar Package Importer
  • Author(s): Jo Shields
  • Date: 2011-06-27 17:03:13 UTC
  • mto: (1.8.1 upstream)
  • mto: This revision was merged to the branch mainline in revision 54.
  • Revision ID: james.westby@ubuntu.com-20110627170313-6cvz3s19x6e9hqe9
ImportĀ upstreamĀ versionĀ 2.5.92+dfsg

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
/*
 
2
This code is derived from jgit (http://eclipse.org/jgit).
 
3
Copyright owners are documented in jgit's IP log.
 
4
 
 
5
This program and the accompanying materials are made available
 
6
under the terms of the Eclipse Distribution License v1.0 which
 
7
accompanies this distribution, is reproduced below, and is
 
8
available at http://www.eclipse.org/org/documents/edl-v10.php
 
9
 
 
10
All rights reserved.
 
11
 
 
12
Redistribution and use in source and binary forms, with or
 
13
without modification, are permitted provided that the following
 
14
conditions are met:
 
15
 
 
16
- Redistributions of source code must retain the above copyright
 
17
  notice, this list of conditions and the following disclaimer.
 
18
 
 
19
- Redistributions in binary form must reproduce the above
 
20
  copyright notice, this list of conditions and the following
 
21
  disclaimer in the documentation and/or other materials provided
 
22
  with the distribution.
 
23
 
 
24
- Neither the name of the Eclipse Foundation, Inc. nor the
 
25
  names of its contributors may be used to endorse or promote
 
26
  products derived from this software without specific prior
 
27
  written permission.
 
28
 
 
29
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
 
30
CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
 
31
INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 
32
OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 
33
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 
34
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 
35
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 
36
NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 
37
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 
38
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 
39
STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 
40
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 
41
ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
42
*/
 
43
 
 
44
using System;
 
45
using NGit;
 
46
using NGit.Diff;
 
47
using Sharpen;
 
48
 
 
49
namespace NGit.Diff
 
50
{
 
51
        /// <summary>Index structure of lines/blocks in one file.</summary>
 
52
        /// <remarks>
 
53
        /// Index structure of lines/blocks in one file.
 
54
        /// <p>
 
55
        /// This structure can be used to compute an approximation of the similarity
 
56
        /// between two files. The index is used by
 
57
        /// <see cref="SimilarityRenameDetector">SimilarityRenameDetector</see>
 
58
        /// to
 
59
        /// compute scores between files.
 
60
        /// <p>
 
61
        /// To save space in memory, this index uses a space efficient encoding which
 
62
        /// will not exceed 1 MiB per instance. The index starts out at a smaller size
 
63
        /// (closer to 2 KiB), but may grow as more distinct blocks within the scanned
 
64
        /// file are discovered.
 
65
        /// </remarks>
 
66
        internal class SimilarityIndex
 
67
        {
 
68
                /// <summary>
 
69
                /// A special
 
70
                /// <see cref="TableFullException">TableFullException</see>
 
71
                /// used in place of OutOfMemoryError.
 
72
                /// </summary>
 
73
                private static readonly SimilarityIndex.TableFullException TABLE_FULL_OUT_OF_MEMORY
 
74
                         = new SimilarityIndex.TableFullException();
 
75
 
 
76
                /// <summary>Shift to apply before storing a key.</summary>
 
77
                /// <remarks>
 
78
                /// Shift to apply before storing a key.
 
79
                /// <p>
 
80
                /// Within the 64 bit table record space, we leave the highest bit unset so
 
81
                /// all values are positive. The lower 32 bits to count bytes.
 
82
                /// </remarks>
 
83
                private const int KEY_SHIFT = 32;
 
84
 
 
85
                /// <summary>Maximum value of the count field, also mask to extract the count.</summary>
 
86
                /// <remarks>Maximum value of the count field, also mask to extract the count.</remarks>
 
87
                private const long MAX_COUNT = (1L << KEY_SHIFT) - 1;
 
88
 
 
89
                /// <summary>Total size of the file we hashed into the structure.</summary>
 
90
                /// <remarks>Total size of the file we hashed into the structure.</remarks>
 
91
                private long fileSize;
 
92
 
 
93
                /// <summary>
 
94
                /// Number of non-zero entries in
 
95
                /// <see cref="idHash">idHash</see>
 
96
                /// .
 
97
                /// </summary>
 
98
                private int idSize;
 
99
 
 
100
                /// <summary>
 
101
                /// <see cref="idSize">idSize</see>
 
102
                /// that triggers
 
103
                /// <see cref="idHash">idHash</see>
 
104
                /// to double in size.
 
105
                /// </summary>
 
106
                private int idGrowAt;
 
107
 
 
108
                /// <summary>Pairings of content keys and counters.</summary>
 
109
                /// <remarks>
 
110
                /// Pairings of content keys and counters.
 
111
                /// <p>
 
112
                /// Slots in the table are actually two ints wedged into a single long. The
 
113
                /// upper 32 bits stores the content key, and the remaining lower bits stores
 
114
                /// the number of bytes associated with that key. Empty slots are denoted by
 
115
                /// 0, which cannot occur because the count cannot be 0. Values can only be
 
116
                /// positive, which we enforce during key addition.
 
117
                /// </remarks>
 
118
                private long[] idHash;
 
119
 
 
120
                /// <summary>
 
121
                /// <code>idHash.length == 1 &lt;&lt; idHashBits</code>
 
122
                /// .
 
123
                /// </summary>
 
124
                private int idHashBits;
 
125
 
 
126
                public SimilarityIndex()
 
127
                {
 
128
                        idHashBits = 8;
 
129
                        idHash = new long[1 << idHashBits];
 
130
                        idGrowAt = GrowAt(idHashBits);
 
131
                }
 
132
 
 
133
                internal virtual long GetFileSize()
 
134
                {
 
135
                        return fileSize;
 
136
                }
 
137
 
 
138
                internal virtual void SetFileSize(long size)
 
139
                {
 
140
                        fileSize = size;
 
141
                }
 
142
 
 
143
                /// <exception cref="NGit.Errors.MissingObjectException"></exception>
 
144
                /// <exception cref="System.IO.IOException"></exception>
 
145
                /// <exception cref="NGit.Diff.SimilarityIndex.TableFullException"></exception>
 
146
                internal virtual void Hash(ObjectLoader obj)
 
147
                {
 
148
                        if (obj.IsLarge())
 
149
                        {
 
150
                                ObjectStream @in = obj.OpenStream();
 
151
                                try
 
152
                                {
 
153
                                        SetFileSize(@in.GetSize());
 
154
                                        Hash(@in, fileSize);
 
155
                                }
 
156
                                finally
 
157
                                {
 
158
                                        @in.Close();
 
159
                                }
 
160
                        }
 
161
                        else
 
162
                        {
 
163
                                byte[] raw = obj.GetCachedBytes();
 
164
                                SetFileSize(raw.Length);
 
165
                                Hash(raw, 0, raw.Length);
 
166
                        }
 
167
                }
 
168
 
 
169
                /// <exception cref="NGit.Diff.SimilarityIndex.TableFullException"></exception>
 
170
                internal virtual void Hash(byte[] raw, int ptr, int end)
 
171
                {
 
172
                        while (ptr < end)
 
173
                        {
 
174
                                int hash = 5381;
 
175
                                int start = ptr;
 
176
                                do
 
177
                                {
 
178
                                        // Hash one line, or one block, whichever occurs first.
 
179
                                        int c = raw[ptr++] & unchecked((int)(0xff));
 
180
                                        if (c == '\n')
 
181
                                        {
 
182
                                                break;
 
183
                                        }
 
184
                                        hash = (hash << 5) + hash + c;
 
185
                                }
 
186
                                while (ptr < end && ptr - start < 64);
 
187
                                Add(hash, ptr - start);
 
188
                        }
 
189
                }
 
190
 
 
191
                /// <exception cref="System.IO.IOException"></exception>
 
192
                /// <exception cref="NGit.Diff.SimilarityIndex.TableFullException"></exception>
 
193
                internal virtual void Hash(InputStream @in, long remaining)
 
194
                {
 
195
                        byte[] buf = new byte[4096];
 
196
                        int ptr = 0;
 
197
                        int cnt = 0;
 
198
                        while (0 < remaining)
 
199
                        {
 
200
                                int hash = 5381;
 
201
                                // Hash one line, or one block, whichever occurs first.
 
202
                                int n = 0;
 
203
                                do
 
204
                                {
 
205
                                        if (ptr == cnt)
 
206
                                        {
 
207
                                                ptr = 0;
 
208
                                                cnt = @in.Read(buf, 0, buf.Length);
 
209
                                                if (cnt <= 0)
 
210
                                                {
 
211
                                                        throw new EOFException();
 
212
                                                }
 
213
                                        }
 
214
                                        n++;
 
215
                                        int c = buf[ptr++] & unchecked((int)(0xff));
 
216
                                        if (c == '\n')
 
217
                                        {
 
218
                                                break;
 
219
                                        }
 
220
                                        hash = (hash << 5) + hash + c;
 
221
                                }
 
222
                                while (n < 64 && n < remaining);
 
223
                                Add(hash, n);
 
224
                                remaining -= n;
 
225
                        }
 
226
                }
 
227
 
 
228
                /// <summary>Sort the internal table so it can be used for efficient scoring.</summary>
 
229
                /// <remarks>
 
230
                /// Sort the internal table so it can be used for efficient scoring.
 
231
                /// <p>
 
232
                /// Once sorted, additional lines/blocks cannot be added to the index.
 
233
                /// </remarks>
 
234
                internal virtual void Sort()
 
235
                {
 
236
                        // Sort the array. All of the empty space will wind up at the front,
 
237
                        // because we forced all of the keys to always be positive. Later
 
238
                        // we only work with the back half of the array.
 
239
                        //
 
240
                        Arrays.Sort(idHash);
 
241
                }
 
242
 
 
243
                internal virtual int Score(NGit.Diff.SimilarityIndex dst, int maxScore)
 
244
                {
 
245
                        long max = Math.Max(fileSize, dst.fileSize);
 
246
                        if (max == 0)
 
247
                        {
 
248
                                return maxScore;
 
249
                        }
 
250
                        return (int)((Common(dst) * maxScore) / max);
 
251
                }
 
252
 
 
253
                internal virtual long Common(NGit.Diff.SimilarityIndex dst)
 
254
                {
 
255
                        return Common(this, dst);
 
256
                }
 
257
 
 
258
                private static long Common(NGit.Diff.SimilarityIndex src, NGit.Diff.SimilarityIndex
 
259
                         dst)
 
260
                {
 
261
                        int srcIdx = src.PackedIndex(0);
 
262
                        int dstIdx = dst.PackedIndex(0);
 
263
                        long[] srcHash = src.idHash;
 
264
                        long[] dstHash = dst.idHash;
 
265
                        return Common(srcHash, srcIdx, dstHash, dstIdx);
 
266
                }
 
267
 
 
268
                private static long Common(long[] srcHash, int srcIdx, long[] dstHash, int dstIdx
 
269
                        )
 
270
                {
 
271
                        //
 
272
                        if (srcIdx == srcHash.Length || dstIdx == dstHash.Length)
 
273
                        {
 
274
                                return 0;
 
275
                        }
 
276
                        long common = 0;
 
277
                        int srcKey = KeyOf(srcHash[srcIdx]);
 
278
                        int dstKey = KeyOf(dstHash[dstIdx]);
 
279
                        for (; ; )
 
280
                        {
 
281
                                if (srcKey == dstKey)
 
282
                                {
 
283
                                        common += Math.Min(CountOf(srcHash[srcIdx]), CountOf(dstHash[dstIdx]));
 
284
                                        if (++srcIdx == srcHash.Length)
 
285
                                        {
 
286
                                                break;
 
287
                                        }
 
288
                                        srcKey = KeyOf(srcHash[srcIdx]);
 
289
                                        if (++dstIdx == dstHash.Length)
 
290
                                        {
 
291
                                                break;
 
292
                                        }
 
293
                                        dstKey = KeyOf(dstHash[dstIdx]);
 
294
                                }
 
295
                                else
 
296
                                {
 
297
                                        if (srcKey < dstKey)
 
298
                                        {
 
299
                                                // Regions of src which do not appear in dst.
 
300
                                                if (++srcIdx == srcHash.Length)
 
301
                                                {
 
302
                                                        break;
 
303
                                                }
 
304
                                                srcKey = KeyOf(srcHash[srcIdx]);
 
305
                                        }
 
306
                                        else
 
307
                                        {
 
308
                                                // Regions of dst which do not appear in src.
 
309
                                                if (++dstIdx == dstHash.Length)
 
310
                                                {
 
311
                                                        break;
 
312
                                                }
 
313
                                                dstKey = KeyOf(dstHash[dstIdx]);
 
314
                                        }
 
315
                                }
 
316
                        }
 
317
                        return common;
 
318
                }
 
319
 
 
320
                // Testing only
 
321
                internal virtual int Size()
 
322
                {
 
323
                        return idSize;
 
324
                }
 
325
 
 
326
                // Testing only
 
327
                internal virtual int Key(int idx)
 
328
                {
 
329
                        return KeyOf(idHash[PackedIndex(idx)]);
 
330
                }
 
331
 
 
332
                // Testing only
 
333
                internal virtual long Count(int idx)
 
334
                {
 
335
                        return CountOf(idHash[PackedIndex(idx)]);
 
336
                }
 
337
 
 
338
                // Brute force approach only for testing.
 
339
                internal virtual int FindIndex(int key)
 
340
                {
 
341
                        for (int i = 0; i < idSize; i++)
 
342
                        {
 
343
                                if (Key(i) == key)
 
344
                                {
 
345
                                        return i;
 
346
                                }
 
347
                        }
 
348
                        return -1;
 
349
                }
 
350
 
 
351
                private int PackedIndex(int idx)
 
352
                {
 
353
                        return (idHash.Length - idSize) + idx;
 
354
                }
 
355
 
 
356
                /// <exception cref="NGit.Diff.SimilarityIndex.TableFullException"></exception>
 
357
                internal virtual void Add(int key, int cnt)
 
358
                {
 
359
                        key = (int)(((uint)(key * unchecked((int)(0x9e370001)))) >> 1);
 
360
                        // Mix bits and ensure not negative.
 
361
                        int j = Slot(key);
 
362
                        for (; ; )
 
363
                        {
 
364
                                long v = idHash[j];
 
365
                                if (v == 0)
 
366
                                {
 
367
                                        // Empty slot in the table, store here.
 
368
                                        if (idGrowAt <= idSize)
 
369
                                        {
 
370
                                                Grow();
 
371
                                                j = Slot(key);
 
372
                                                continue;
 
373
                                        }
 
374
                                        idHash[j] = Pair(key, cnt);
 
375
                                        idSize++;
 
376
                                        return;
 
377
                                }
 
378
                                else
 
379
                                {
 
380
                                        if (KeyOf(v) == key)
 
381
                                        {
 
382
                                                // Same key, increment the counter. If it overflows, fail
 
383
                                                // indexing to prevent the key from being impacted.
 
384
                                                //
 
385
                                                idHash[j] = Pair(key, CountOf(v) + cnt);
 
386
                                                return;
 
387
                                        }
 
388
                                        else
 
389
                                        {
 
390
                                                if (++j >= idHash.Length)
 
391
                                                {
 
392
                                                        j = 0;
 
393
                                                }
 
394
                                        }
 
395
                                }
 
396
                        }
 
397
                }
 
398
 
 
399
                /// <exception cref="NGit.Diff.SimilarityIndex.TableFullException"></exception>
 
400
                private static long Pair(int key, long cnt)
 
401
                {
 
402
                        if (MAX_COUNT < cnt)
 
403
                        {
 
404
                                throw new SimilarityIndex.TableFullException();
 
405
                        }
 
406
                        return (((long)key) << KEY_SHIFT) | cnt;
 
407
                }
 
408
 
 
409
                private int Slot(int key)
 
410
                {
 
411
                        // We use 31 - idHashBits because the upper bit was already forced
 
412
                        // to be 0 and we want the remaining high bits to be used as the
 
413
                        // table slot.
 
414
                        //
 
415
                        return (int)(((uint)key) >> (31 - idHashBits));
 
416
                }
 
417
 
 
418
                private static int GrowAt(int idHashBits)
 
419
                {
 
420
                        return (1 << idHashBits) * (idHashBits - 3) / idHashBits;
 
421
                }
 
422
 
 
423
                /// <exception cref="NGit.Diff.SimilarityIndex.TableFullException"></exception>
 
424
                private void Grow()
 
425
                {
 
426
                        if (idHashBits == 30)
 
427
                        {
 
428
                                throw new SimilarityIndex.TableFullException();
 
429
                        }
 
430
                        long[] oldHash = idHash;
 
431
                        int oldSize = idHash.Length;
 
432
                        idHashBits++;
 
433
                        idGrowAt = GrowAt(idHashBits);
 
434
                        try
 
435
                        {
 
436
                                idHash = new long[1 << idHashBits];
 
437
                        }
 
438
                        catch (OutOfMemoryException)
 
439
                        {
 
440
                                throw TABLE_FULL_OUT_OF_MEMORY;
 
441
                        }
 
442
                        for (int i = 0; i < oldSize; i++)
 
443
                        {
 
444
                                long v = oldHash[i];
 
445
                                if (v != 0)
 
446
                                {
 
447
                                        int j = Slot(KeyOf(v));
 
448
                                        while (idHash[j] != 0)
 
449
                                        {
 
450
                                                if (++j >= idHash.Length)
 
451
                                                {
 
452
                                                        j = 0;
 
453
                                                }
 
454
                                        }
 
455
                                        idHash[j] = v;
 
456
                                }
 
457
                        }
 
458
                }
 
459
 
 
460
                private static int KeyOf(long v)
 
461
                {
 
462
                        return (int)((long)(((ulong)v) >> KEY_SHIFT));
 
463
                }
 
464
 
 
465
                private static long CountOf(long v)
 
466
                {
 
467
                        return v & MAX_COUNT;
 
468
                }
 
469
 
 
470
                [System.Serializable]
 
471
                internal class TableFullException : Exception
 
472
                {
 
473
                        private const long serialVersionUID = 1L;
 
474
                }
 
475
        }
 
476
}