1
package com.fasterxml.jackson.core.sym;
3
import java.util.Arrays;
5
import com.fasterxml.jackson.core.util.InternCache;
8
* This class is a kind of specialized type-safe Map, from char array to
9
* String value. Specialization means that in addition to type-safety
10
* and specific access patterns (key char array, Value optionally interned
11
* String; values added on access if necessary), and that instances are
12
* meant to be used concurrently, but by using well-defined mechanisms
13
* to obtain such concurrently usable instances. Main use for the class
14
* is to store symbol table information for things like compilers and
15
* parsers; especially when number of symbols (keywords) is limited.
17
* For optimal performance, usage pattern should be one where matches
18
* should be very common (especially after "warm-up"), and as with most hash-based
19
* maps/sets, that hash codes are uniformly distributed. Also, collisions
20
* are slightly more expensive than with HashMap or HashSet, since hash codes
21
* are not used in resolving collisions; that is, equals() comparison is
22
* done with all symbols in same bucket index.<br />
23
* Finally, rehashing is also more expensive, as hash codes are not
24
* stored; rehashing requires all entries' hash codes to be recalculated.
25
* Reason for not storing hash codes is reduced memory usage, hoping
26
* for better memory locality.
28
* Usual usage pattern is to create a single "master" instance, and either
29
* use that instance in sequential fashion, or to create derived "child"
30
* instances, which after use, are asked to return possible symbol additions
31
* to master instance. In either case benefit is that symbol table gets
32
* initialized so that further uses are more efficient, as eventually all
33
* symbols needed will already be in symbol table. At that point no more
34
* Symbol String allocations are needed, nor changes to symbol table itself.
36
* Note that while individual SymbolTable instances are NOT thread-safe
37
* (much like generic collection classes), concurrently used "child"
38
* instances can be freely used without synchronization. However, using
39
* master table concurrently with child instances can only be done if
40
* access to master instance is read-only (i.e. no modifications done).
42
public final class CharsToNameCanonicalizer
44
/* If we use "multiply-add" based hash algorithm, this is the multiplier
47
public final static int HASH_MULT = 33;
50
* Default initial table size. Shouldn't be miniscule (as there's
51
* cost to both array realloc and rehashing), but let's keep
52
* it reasonably small nonetheless. For systems that properly
53
* reuse factories it doesn't matter either way; but when
54
* recreating factories often, initial overhead may dominate.
56
protected static final int DEFAULT_TABLE_SIZE = 64;
59
* Let's not expand symbol tables past some maximum size;
60
* this should protected against OOMEs caused by large documents
61
* with uniquer (~= random) names.
63
protected static final int MAX_TABLE_SIZE = 0x10000; // 64k entries == 256k mem
66
* Let's only share reasonably sized symbol tables. Max size set to 3/4 of 16k;
67
* this corresponds to 64k main hash index. This should allow for enough distinct
68
* names for almost any case.
70
final static int MAX_ENTRIES_FOR_REUSE = 12000;
73
* Also: to thwart attacks based on hash collisions (which may or may not
74
* be cheap to calculate), we will need to detect "too long"
75
* collision chains. Let's start with static value of 255 entries
76
* for the longest legal chain.
78
* Note: longest chain we have been able to produce without malicious
79
* intent has been 38 (with "com.fasterxml.jackson.core.main.TestWithTonsaSymbols");
80
* our setting should be reasonable here.
84
final static int MAX_COLL_CHAIN_LENGTH = 255;
87
* And to support reduce likelihood of accidental collisons causing
88
* exceptions, let's prevent reuse of tables with long collision
89
* overflow lists as well.
93
final static int MAX_COLL_CHAIN_FOR_REUSE = 63;
95
final static CharsToNameCanonicalizer sBootstrapSymbolTable;
97
sBootstrapSymbolTable = new CharsToNameCanonicalizer();
101
/**********************************************************
103
/**********************************************************
107
* Sharing of learnt symbols is done by optional linking of symbol
108
* table instances with their parents. When parent linkage is
109
* defined, and child instance is released (call to <code>release</code>),
110
* parent's shared tables may be updated from the child instance.
112
protected CharsToNameCanonicalizer _parent;
115
* Seed value we use as the base to make hash codes non-static between
116
* different runs, but still stable for lifetime of a single symbol table
118
* This is done for security reasons, to avoid potential DoS attack via
123
final private int _hashSeed;
126
* Whether canonical symbol Strings are to be intern()ed before added
127
* to the table or not
129
final protected boolean _intern;
132
* Whether any canonicalization should be attempted (whether using
135
final protected boolean _canonicalize;
138
/**********************************************************
139
/* Actual symbol table data
140
/**********************************************************
144
* Primary matching symbols; it's expected most match occur from
147
protected String[] _symbols;
150
* Overflow buckets; if primary doesn't match, lookup is done
153
* Note: Number of buckets is half of number of symbol entries, on
154
* assumption there's less need for buckets.
156
protected Bucket[] _buckets;
159
* Current size (number of entries); needed to know if and when
165
* Limit that indicates maximum size this instance can hold before
166
* it needs to be expanded and rehashed. Calculated using fill
167
* factor passed in to constructor.
169
protected int _sizeThreshold;
172
* Mask used to get index from hash values; equal to
173
* <code>_buckets.length - 1</code>, when _buckets.length is
176
protected int _indexMask;
179
* We need to keep track of the longest collision list; this is needed
180
* both to indicate problems with attacks and to allow flushing for
185
protected int _longestCollisionList;
188
/**********************************************************
189
/* State regarding shared arrays
190
/**********************************************************
194
* Flag that indicates if any changes have been made to the data;
195
* used to both determine if bucket array needs to be copied when
196
* (first) change is made, and potentially if updated bucket list
197
* is to be resync'ed back to master instance.
199
protected boolean _dirty;
202
/**********************************************************
204
/**********************************************************
208
* Method called to create root canonicalizer for a {@link com.fasterxml.jackson.core.JsonFactory}
209
* instance. Root instance is never used directly; its main use is for
210
* storing and sharing underlying symbol arrays as needed.
212
public static CharsToNameCanonicalizer createRoot()
214
/* [Issue-21]: Need to use a variable seed, to thwart hash-collision
217
long now = System.currentTimeMillis();
218
// ensure it's not 0; and might as well require to be odd so:
219
int seed = (((int) now) + ((int) (now >>> 32))) | 1;
220
return createRoot(seed);
223
protected static CharsToNameCanonicalizer createRoot(int hashSeed) {
224
return sBootstrapSymbolTable.makeOrphan(hashSeed);
228
* Main method for constructing a master symbol table instance.
230
* @param initialSize Minimum initial size for bucket array; internally
231
* will always use a power of two equal to or bigger than this value.
233
private CharsToNameCanonicalizer()
235
// these settings don't really matter for the bootstrap instance
236
_canonicalize = true;
238
// And we'll also set flags so no copying of buckets is needed:
241
_longestCollisionList = 0;
242
initTables(DEFAULT_TABLE_SIZE);
245
private void initTables(int initialSize)
247
_symbols = new String[initialSize];
248
_buckets = new Bucket[initialSize >> 1];
249
// Mask is easy to calc for powers of two.
250
_indexMask = initialSize - 1;
252
_longestCollisionList = 0;
253
// Hard-coded fill factor is 75%
254
_sizeThreshold = _thresholdSize(initialSize);
257
private static int _thresholdSize(int hashAreaSize) {
258
return hashAreaSize - (hashAreaSize >> 2);
262
* Internal constructor used when creating child instances.
264
private CharsToNameCanonicalizer(CharsToNameCanonicalizer parent,
265
boolean canonicalize, boolean intern,
266
String[] symbols, Bucket[] buckets, int size,
267
int hashSeed, int longestColl)
270
_canonicalize = canonicalize;
276
_hashSeed = hashSeed;
277
// Hard-coded fill factor, 75%
278
int arrayLen = (symbols.length);
279
_sizeThreshold = _thresholdSize(arrayLen);
280
_indexMask = (arrayLen - 1);
281
_longestCollisionList = longestColl;
283
// Need to make copies of arrays, if/when adding new entries
288
* "Factory" method; will create a new child instance of this symbol
289
* table. It will be a copy-on-write instance, ie. it will only use
290
* read-only copy of parent's data, but when changes are needed, a
291
* copy will be created.
293
* Note: while this method is synchronized, it is generally not
294
* safe to both use makeChild/mergeChild, AND to use instance
295
* actively. Instead, a separate 'root' instance should be used
296
* on which only makeChild/mergeChild are called, but instance itself
297
* is not used as a symbol table.
299
public CharsToNameCanonicalizer makeChild(final boolean canonicalize,
300
final boolean intern)
302
/* 24-Jul-2012, tatu: Trying to reduce scope of synchronization, assuming
303
* that synchronizing construction is the (potentially) expensive part,
304
* and not so much short copy-the-variables thing.
306
final String[] symbols;
307
final Bucket[] buckets;
310
final int longestCollisionList;
312
synchronized (this) {
316
hashSeed = _hashSeed;
317
longestCollisionList = _longestCollisionList;
320
return new CharsToNameCanonicalizer(this, canonicalize, intern,
321
symbols, buckets, size, hashSeed, longestCollisionList);
324
private CharsToNameCanonicalizer makeOrphan(int seed)
326
return new CharsToNameCanonicalizer(null, true, true,
327
_symbols, _buckets, _size, seed, _longestCollisionList);
331
* Method that allows contents of child table to potentially be
332
* "merged in" with contents of this symbol table.
334
* Note that caller has to make sure symbol table passed in is
335
* really a child or sibling of this symbol table.
337
private void mergeChild(CharsToNameCanonicalizer child)
339
/* One caveat: let's try to avoid problems with
340
* degenerate cases of documents with generated "random"
341
* names: for these, symbol tables would bloat indefinitely.
342
* One way to do this is to just purge tables if they grow
343
* too large, and that's what we'll do here.
345
if (child.size() > MAX_ENTRIES_FOR_REUSE
346
|| child._longestCollisionList > MAX_COLL_CHAIN_FOR_REUSE) {
347
// Should there be a way to get notified about this event, to log it or such?
348
// (as it's somewhat abnormal thing to happen)
349
// At any rate, need to clean up the tables, then:
350
synchronized (this) {
351
initTables(DEFAULT_TABLE_SIZE);
352
// Dirty flag... well, let's just clear it. Shouldn't really matter for master tables
353
// (which this is, given something is merged to it)
357
// Otherwise, we'll merge changed stuff in, if there are more entries (which
358
// may not be the case if one of siblings has added symbols first or such)
359
if (child.size() <= size()) { // nothing to add
362
// Okie dokie, let's get the data in!
363
synchronized (this) {
364
_symbols = child._symbols;
365
_buckets = child._buckets;
367
_sizeThreshold = child._sizeThreshold;
368
_indexMask = child._indexMask;
369
_longestCollisionList = child._longestCollisionList;
370
// Dirty flag... well, let's just clear it. Shouldn't really matter for master tables
371
// (which this is, given something is merged to it)
377
public void release()
379
// If nothing has been added, nothing to do
383
if (_parent != null) {
384
_parent.mergeChild(this);
385
/* Let's also mark this instance as dirty, so that just in
386
* case release was too early, there's no corruption
387
* of possibly shared data.
394
/**********************************************************
395
/* Public API, generic accessors:
396
/**********************************************************
399
public int size() { return _size; }
402
* Method for checking number of primary hash buckets this symbol
407
public int bucketCount() {
408
return _symbols.length; }
410
public boolean maybeDirty() { return _dirty; }
412
public int hashSeed() { return _hashSeed; }
415
* Method mostly needed by unit tests; calculates number of
416
* entries that are in collision list. Value can be at most
417
* ({@link #size} - 1), but should usually be much lower, ideally 0.
421
public int collisionCount()
425
for (Bucket bucket : _buckets) {
426
if (bucket != null) {
427
count += bucket.length();
434
* Method mostly needed by unit tests; calculates length of the
435
* longest collision chain. This should typically be a low number,
436
* but may be up to {@link #size} - 1 in the pathological case
440
public int maxCollisionLength()
442
return _longestCollisionList;
446
/**********************************************************
447
/* Public API, accessing symbols:
448
/**********************************************************
451
public String findSymbol(char[] buffer, int start, int len, int h)
453
if (len < 1) { // empty Strings are simplest to handle up front
456
if (!_canonicalize) { // [JACKSON-259]
457
return new String(buffer, start, len);
460
/* Related to problems with sub-standard hashing (somewhat
461
* relevant for collision attacks too), let's try little
462
* bit of shuffling to improve hash codes.
463
* (note, however, that this can't help with full collisions)
465
int index = _hashToIndex(h);
466
String sym = _symbols[index];
468
// Optimal case; checking existing primary symbol for hash index:
470
// Let's inline primary String equality checking:
471
if (sym.length() == len) {
474
if (sym.charAt(i) != buffer[start+i]) {
478
// Optimal case; primary match found
483
// How about collision bucket?
484
Bucket b = _buckets[index >> 1];
486
sym = b.find(buffer, start, len);
493
if (!_dirty) { //need to do copy-on-write?
496
} else if (_size >= _sizeThreshold) { // Need to expand?
498
/* Need to recalc hash; rare occurence (index mask has been
499
* recalculated as part of rehash)
501
index = _hashToIndex(calcHash(buffer, start, len));
504
String newSymbol = new String(buffer, start, len);
506
newSymbol = InternCache.instance.intern(newSymbol);
509
// Ok; do we need to add primary entry, or a bucket?
510
if (_symbols[index] == null) {
511
_symbols[index] = newSymbol;
513
int bix = (index >> 1);
514
Bucket newB = new Bucket(newSymbol, _buckets[bix]);
515
_buckets[bix] = newB;
516
_longestCollisionList = Math.max(newB.length(), _longestCollisionList);
517
if (_longestCollisionList > MAX_COLL_CHAIN_LENGTH) {
518
reportTooManyCollisions(MAX_COLL_CHAIN_LENGTH);
526
* Helper method that takes in a "raw" hash value, shuffles it as necessary,
527
* and truncates to be used as the index.
529
public int _hashToIndex(int rawHash)
531
rawHash += (rawHash >>> 15); // this seems to help quite a bit, at least for our tests
532
return (rawHash & _indexMask);
536
* Implementation of a hashing method for variable length
537
* Strings. Most of the time intention is that this calculation
538
* is done by caller during parsing, not here; however, sometimes
539
* it needs to be done for parsed "String" too.
541
* @param len Length of String; has to be at least 1 (caller guarantees
542
* this pre-condition)
544
public int calcHash(char[] buffer, int start, int len)
546
int hash = _hashSeed;
547
for (int i = 0; i < len; ++i) {
548
hash = (hash * HASH_MULT) + (int) buffer[i];
550
// NOTE: shuffling, if any, is done in 'findSymbol()', not here:
551
return (hash == 0) ? 1 : hash;
554
public int calcHash(String key)
556
final int len = key.length();
558
int hash = _hashSeed;
559
for (int i = 0; i < len; ++i) {
560
hash = (hash * HASH_MULT) + (int) key.charAt(i);
562
// NOTE: shuffling, if any, is done in 'findSymbol()', not here:
563
return (hash == 0) ? 1 : hash;
567
/**********************************************************
569
/**********************************************************
573
* Method called when copy-on-write is needed; generally when first
574
* change is made to a derived symbol table.
576
private void copyArrays() {
577
String[] oldSyms = _symbols;
578
int size = oldSyms.length;
579
_symbols = new String[size];
580
System.arraycopy(oldSyms, 0, _symbols, 0, size);
581
Bucket[] oldBuckets = _buckets;
582
size = oldBuckets.length;
583
_buckets = new Bucket[size];
584
System.arraycopy(oldBuckets, 0, _buckets, 0, size);
588
* Method called when size (number of entries) of symbol table grows
589
* so big that load factor is exceeded. Since size has to remain
590
* power of two, arrays will then always be doubled. Main work
591
* is really redistributing old entries into new String/Bucket
594
private void rehash()
596
int size = _symbols.length;
597
int newSize = size + size;
599
/* 12-Mar-2010, tatu: Let's actually limit maximum size we are
600
* prepared to use, to guard against OOME in case of unbounded
601
* name sets (unique [non-repeating] names)
603
if (newSize > MAX_TABLE_SIZE) {
604
/* If this happens, there's no point in either growing or
605
* shrinking hash areas. Rather, it's better to just clean
609
Arrays.fill(_symbols, null);
610
Arrays.fill(_buckets, null);
615
String[] oldSyms = _symbols;
616
Bucket[] oldBuckets = _buckets;
617
_symbols = new String[newSize];
618
_buckets = new Bucket[newSize >> 1];
619
// Let's update index mask, threshold, now (needed for rehashing)
620
_indexMask = newSize - 1;
621
_sizeThreshold = _thresholdSize(newSize);
623
int count = 0; // let's do sanity check
625
/* Need to do two loops, unfortunately, since spill-over area is
626
* only half the size:
629
for (int i = 0; i < size; ++i) {
630
String symbol = oldSyms[i];
631
if (symbol != null) {
633
int index = _hashToIndex(calcHash(symbol));
634
if (_symbols[index] == null) {
635
_symbols[index] = symbol;
637
int bix = (index >> 1);
638
Bucket newB = new Bucket(symbol, _buckets[bix]);
639
_buckets[bix] = newB;
640
maxColl = Math.max(maxColl, newB.length());
646
for (int i = 0; i < size; ++i) {
647
Bucket b = oldBuckets[i];
650
String symbol = b.getSymbol();
651
int index = _hashToIndex(calcHash(symbol));
652
if (_symbols[index] == null) {
653
_symbols[index] = symbol;
655
int bix = (index >> 1);
656
Bucket newB = new Bucket(symbol, _buckets[bix]);
657
_buckets[bix] = newB;
658
maxColl = Math.max(maxColl, newB.length());
663
_longestCollisionList = maxColl;
665
if (count != _size) {
666
throw new Error("Internal error on SymbolTable.rehash(): had "+_size+" entries; now have "+count+".");
673
protected void reportTooManyCollisions(int maxLen)
675
throw new IllegalStateException("Longest collision chain in symbol table (of size "+_size
676
+") now exceeds maximum, "+maxLen+" -- suspect a DoS attack based on hash collisions");
680
/**********************************************************
682
/**********************************************************
686
* This class is a symbol table entry. Each entry acts as a node
689
static final class Bucket
691
private final String _symbol;
692
private final Bucket _next;
693
private final int _length;
695
public Bucket(String symbol, Bucket next) {
698
_length = (next == null) ? 1 : next._length+1;
701
public String getSymbol() { return _symbol; }
702
public Bucket getNext() { return _next; }
703
public int length() { return _length; }
705
public String find(char[] buf, int start, int len) {
706
String sym = _symbol;
709
while (true) { // Inlined equality comparison:
710
if (sym.length() == len) {
713
if (sym.charAt(i) != buf[start+i]) {