2
This code is derived from jgit (http://eclipse.org/jgit).
3
Copyright owners are documented in jgit's IP log.
5
This program and the accompanying materials are made available
6
under the terms of the Eclipse Distribution License v1.0 which
7
accompanies this distribution, is reproduced below, and is
8
available at http://www.eclipse.org/org/documents/edl-v10.php
12
Redistribution and use in source and binary forms, with or
13
without modification, are permitted provided that the following
16
- Redistributions of source code must retain the above copyright
17
notice, this list of conditions and the following disclaimer.
19
- Redistributions in binary form must reproduce the above
20
copyright notice, this list of conditions and the following
21
disclaimer in the documentation and/or other materials provided
22
with the distribution.
24
- Neither the name of the Eclipse Foundation, Inc. nor the
25
names of its contributors may be used to endorse or promote
26
products derived from this software without specific prior
29
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
30
CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
31
INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
32
OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
33
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
34
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
35
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
36
NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
37
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
38
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
39
STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
40
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
41
ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
51
/// <summary>Index structure of lines/blocks in one file.</summary>
53
/// Index structure of lines/blocks in one file.
55
/// This structure can be used to compute an approximation of the similarity
56
/// between two files. The index is used by
57
/// <see cref="SimilarityRenameDetector">SimilarityRenameDetector</see>
59
/// compute scores between files.
61
/// To save space in memory, this index uses a space efficient encoding which
62
/// will not exceed 1 MiB per instance. The index starts out at a smaller size
63
/// (closer to 2 KiB), but may grow as more distinct blocks within the scanned
64
/// file are discovered.
66
internal class SimilarityIndex
70
/// <see cref="TableFullException">TableFullException</see>
71
/// used in place of OutOfMemoryError.
73
private static readonly SimilarityIndex.TableFullException TABLE_FULL_OUT_OF_MEMORY
74
= new SimilarityIndex.TableFullException();
76
/// <summary>Shift to apply before storing a key.</summary>
78
/// Shift to apply before storing a key.
80
/// Within the 64 bit table record space, we leave the highest bit unset so
81
/// all values are positive. The lower 32 bits to count bytes.
83
private const int KEY_SHIFT = 32;
85
/// <summary>Maximum value of the count field, also mask to extract the count.</summary>
86
/// <remarks>Maximum value of the count field, also mask to extract the count.</remarks>
87
private const long MAX_COUNT = (1L << KEY_SHIFT) - 1;
89
/// <summary>Total size of the file we hashed into the structure.</summary>
90
/// <remarks>Total size of the file we hashed into the structure.</remarks>
91
private long fileSize;
94
/// Number of non-zero entries in
95
/// <see cref="idHash">idHash</see>
101
/// <see cref="idSize">idSize</see>
103
/// <see cref="idHash">idHash</see>
104
/// to double in size.
106
private int idGrowAt;
108
/// <summary>Pairings of content keys and counters.</summary>
110
/// Pairings of content keys and counters.
112
/// Slots in the table are actually two ints wedged into a single long. The
113
/// upper 32 bits stores the content key, and the remaining lower bits stores
114
/// the number of bytes associated with that key. Empty slots are denoted by
115
/// 0, which cannot occur because the count cannot be 0. Values can only be
116
/// positive, which we enforce during key addition.
118
private long[] idHash;
121
/// <code>idHash.length == 1 << idHashBits</code>
124
private int idHashBits;
126
public SimilarityIndex()
129
idHash = new long[1 << idHashBits];
130
idGrowAt = GrowAt(idHashBits);
133
internal virtual long GetFileSize()
138
internal virtual void SetFileSize(long size)
143
/// <exception cref="NGit.Errors.MissingObjectException"></exception>
144
/// <exception cref="System.IO.IOException"></exception>
145
/// <exception cref="NGit.Diff.SimilarityIndex.TableFullException"></exception>
146
internal virtual void Hash(ObjectLoader obj)
150
ObjectStream @in = obj.OpenStream();
153
SetFileSize(@in.GetSize());
163
byte[] raw = obj.GetCachedBytes();
164
SetFileSize(raw.Length);
165
Hash(raw, 0, raw.Length);
169
/// <exception cref="NGit.Diff.SimilarityIndex.TableFullException"></exception>
170
internal virtual void Hash(byte[] raw, int ptr, int end)
178
// Hash one line, or one block, whichever occurs first.
179
int c = raw[ptr++] & unchecked((int)(0xff));
184
hash = (hash << 5) + hash + c;
186
while (ptr < end && ptr - start < 64);
187
Add(hash, ptr - start);
191
/// <exception cref="System.IO.IOException"></exception>
192
/// <exception cref="NGit.Diff.SimilarityIndex.TableFullException"></exception>
193
internal virtual void Hash(InputStream @in, long remaining)
195
byte[] buf = new byte[4096];
198
while (0 < remaining)
201
// Hash one line, or one block, whichever occurs first.
208
cnt = @in.Read(buf, 0, buf.Length);
211
throw new EOFException();
215
int c = buf[ptr++] & unchecked((int)(0xff));
220
hash = (hash << 5) + hash + c;
222
while (n < 64 && n < remaining);
228
/// <summary>Sort the internal table so it can be used for efficient scoring.</summary>
230
/// Sort the internal table so it can be used for efficient scoring.
232
/// Once sorted, additional lines/blocks cannot be added to the index.
234
internal virtual void Sort()
236
// Sort the array. All of the empty space will wind up at the front,
237
// because we forced all of the keys to always be positive. Later
238
// we only work with the back half of the array.
243
internal virtual int Score(NGit.Diff.SimilarityIndex dst, int maxScore)
245
long max = Math.Max(fileSize, dst.fileSize);
250
return (int)((Common(dst) * maxScore) / max);
253
internal virtual long Common(NGit.Diff.SimilarityIndex dst)
255
return Common(this, dst);
258
private static long Common(NGit.Diff.SimilarityIndex src, NGit.Diff.SimilarityIndex
261
int srcIdx = src.PackedIndex(0);
262
int dstIdx = dst.PackedIndex(0);
263
long[] srcHash = src.idHash;
264
long[] dstHash = dst.idHash;
265
return Common(srcHash, srcIdx, dstHash, dstIdx);
268
private static long Common(long[] srcHash, int srcIdx, long[] dstHash, int dstIdx
272
if (srcIdx == srcHash.Length || dstIdx == dstHash.Length)
277
int srcKey = KeyOf(srcHash[srcIdx]);
278
int dstKey = KeyOf(dstHash[dstIdx]);
281
if (srcKey == dstKey)
283
common += Math.Min(CountOf(srcHash[srcIdx]), CountOf(dstHash[dstIdx]));
284
if (++srcIdx == srcHash.Length)
288
srcKey = KeyOf(srcHash[srcIdx]);
289
if (++dstIdx == dstHash.Length)
293
dstKey = KeyOf(dstHash[dstIdx]);
299
// Regions of src which do not appear in dst.
300
if (++srcIdx == srcHash.Length)
304
srcKey = KeyOf(srcHash[srcIdx]);
308
// Regions of dst which do not appear in src.
309
if (++dstIdx == dstHash.Length)
313
dstKey = KeyOf(dstHash[dstIdx]);
321
internal virtual int Size()
327
internal virtual int Key(int idx)
329
return KeyOf(idHash[PackedIndex(idx)]);
333
internal virtual long Count(int idx)
335
return CountOf(idHash[PackedIndex(idx)]);
338
// Brute force approach only for testing.
339
internal virtual int FindIndex(int key)
341
for (int i = 0; i < idSize; i++)
351
private int PackedIndex(int idx)
353
return (idHash.Length - idSize) + idx;
356
/// <exception cref="NGit.Diff.SimilarityIndex.TableFullException"></exception>
357
internal virtual void Add(int key, int cnt)
359
key = (int)(((uint)(key * unchecked((int)(0x9e370001)))) >> 1);
360
// Mix bits and ensure not negative.
367
// Empty slot in the table, store here.
368
if (idGrowAt <= idSize)
374
idHash[j] = Pair(key, cnt);
382
// Same key, increment the counter. If it overflows, fail
383
// indexing to prevent the key from being impacted.
385
idHash[j] = Pair(key, CountOf(v) + cnt);
390
if (++j >= idHash.Length)
399
/// <exception cref="NGit.Diff.SimilarityIndex.TableFullException"></exception>
400
private static long Pair(int key, long cnt)
404
throw new SimilarityIndex.TableFullException();
406
return (((long)key) << KEY_SHIFT) | cnt;
409
private int Slot(int key)
411
// We use 31 - idHashBits because the upper bit was already forced
412
// to be 0 and we want the remaining high bits to be used as the
415
return (int)(((uint)key) >> (31 - idHashBits));
418
private static int GrowAt(int idHashBits)
420
return (1 << idHashBits) * (idHashBits - 3) / idHashBits;
423
/// <exception cref="NGit.Diff.SimilarityIndex.TableFullException"></exception>
426
if (idHashBits == 30)
428
throw new SimilarityIndex.TableFullException();
430
long[] oldHash = idHash;
431
int oldSize = idHash.Length;
433
idGrowAt = GrowAt(idHashBits);
436
idHash = new long[1 << idHashBits];
438
catch (OutOfMemoryException)
440
throw TABLE_FULL_OUT_OF_MEMORY;
442
for (int i = 0; i < oldSize; i++)
447
int j = Slot(KeyOf(v));
448
while (idHash[j] != 0)
450
if (++j >= idHash.Length)
460
private static int KeyOf(long v)
462
return (int)((long)(((ulong)v) >> KEY_SHIFT));
465
private static long CountOf(long v)
467
return v & MAX_COUNT;
470
[System.Serializable]
471
internal class TableFullException : Exception
473
private const long serialVersionUID = 1L;