1
/*-------------------------------------------------------------------------
4
* Support functions for hash access method.
6
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
7
* Portions Copyright (c) 1994, Regents of the University of California
14
* These functions are stored in pg_amproc. For each operator class
15
* defined for hash indexes, they compute the hash value of the argument.
17
* Additional hash functions appear in /utils/adt/ files for various
18
* specialized datatypes.
20
* It is expected that every bit of a hash function's 32-bit result is
21
* as random as every other; failure to ensure this is likely to lead
22
* to poor performance of hash joins, for example. In most cases a hash
23
* function should use hash_any() or its variant hash_uint32().
24
*-------------------------------------------------------------------------
29
#include "access/hash.h"
32
/* Note: this is used for both "char" and boolean datatypes */
34
hashchar(PG_FUNCTION_ARGS)
36
return hash_uint32((int32) PG_GETARG_CHAR(0));
40
hashint2(PG_FUNCTION_ARGS)
42
return hash_uint32((int32) PG_GETARG_INT16(0));
46
hashint4(PG_FUNCTION_ARGS)
48
return hash_uint32(PG_GETARG_INT32(0));
52
hashint8(PG_FUNCTION_ARGS)
55
* The idea here is to produce a hash value compatible with the values
56
* produced by hashint4 and hashint2 for logically equal inputs; this is
57
* necessary to support cross-type hash joins across these input types.
58
* Since all three types are signed, we can xor the high half of the int8
59
* value if the sign is positive, or the complement of the high half when
60
* the sign is negative.
62
#ifndef INT64_IS_BUSTED
63
int64 val = PG_GETARG_INT64(0);
64
uint32 lohalf = (uint32) val;
65
uint32 hihalf = (uint32) (val >> 32);
67
lohalf ^= (val >= 0) ? hihalf : ~hihalf;
69
return hash_uint32(lohalf);
71
/* here if we can't count on "x >> 32" to work sanely */
72
return hash_uint32((int32) PG_GETARG_INT64(0));
77
hashoid(PG_FUNCTION_ARGS)
79
return hash_uint32((uint32) PG_GETARG_OID(0));
83
hashenum(PG_FUNCTION_ARGS)
85
return hash_uint32((uint32) PG_GETARG_OID(0));
89
hashfloat4(PG_FUNCTION_ARGS)
91
float4 key = PG_GETARG_FLOAT4(0);
95
* On IEEE-float machines, minus zero and zero have different bit patterns
96
* but should compare as equal. We must ensure that they have the same
97
* hash value, which is most reliably done this way:
99
if (key == (float4) 0)
103
* To support cross-type hashing of float8 and float4, we want to return
104
* the same hash value hashfloat8 would produce for an equal float8 value.
105
* So, widen the value to float8 and hash that. (We must do this rather
106
* than have hashfloat8 try to narrow its value to float4; that could fail
111
return hash_any((unsigned char *) &key8, sizeof(key8));
115
hashfloat8(PG_FUNCTION_ARGS)
117
float8 key = PG_GETARG_FLOAT8(0);
120
* On IEEE-float machines, minus zero and zero have different bit patterns
121
* but should compare as equal. We must ensure that they have the same
122
* hash value, which is most reliably done this way:
124
if (key == (float8) 0)
127
return hash_any((unsigned char *) &key, sizeof(key));
131
hashoidvector(PG_FUNCTION_ARGS)
133
oidvector *key = (oidvector *) PG_GETARG_POINTER(0);
135
return hash_any((unsigned char *) key->values, key->dim1 * sizeof(Oid));
139
hashint2vector(PG_FUNCTION_ARGS)
141
int2vector *key = (int2vector *) PG_GETARG_POINTER(0);
143
return hash_any((unsigned char *) key->values, key->dim1 * sizeof(int2));
147
hashname(PG_FUNCTION_ARGS)
149
char *key = NameStr(*PG_GETARG_NAME(0));
150
int keylen = strlen(key);
152
Assert(keylen < NAMEDATALEN); /* else it's not truncated correctly */
154
return hash_any((unsigned char *) key, keylen);
158
hashtext(PG_FUNCTION_ARGS)
160
text *key = PG_GETARG_TEXT_PP(0);
164
* Note: this is currently identical in behavior to hashvarlena, but keep
165
* it as a separate function in case we someday want to do something
166
* different in non-C locales. (See also hashbpchar, if so.)
168
result = hash_any((unsigned char *) VARDATA_ANY(key),
169
VARSIZE_ANY_EXHDR(key));
171
/* Avoid leaking memory for toasted inputs */
172
PG_FREE_IF_COPY(key, 0);
178
* hashvarlena() can be used for any varlena datatype in which there are
179
* no non-significant bits, ie, distinct bitpatterns never compare as equal.
182
hashvarlena(PG_FUNCTION_ARGS)
184
struct varlena *key = PG_GETARG_VARLENA_PP(0);
187
result = hash_any((unsigned char *) VARDATA_ANY(key),
188
VARSIZE_ANY_EXHDR(key));
190
/* Avoid leaking memory for toasted inputs */
191
PG_FREE_IF_COPY(key, 0);
197
* This hash function was written by Bob Jenkins
198
* (bob_jenkins@burtleburtle.net), and superficially adapted
199
* for PostgreSQL by Neil Conway. For more information on this
200
* hash function, see http://burtleburtle.net/bob/hash/doobs.html,
201
* or Bob's article in Dr. Dobb's Journal, Sept. 1997.
203
* In the current code, we have adopted Bob's 2006 update of his hash
204
* function to fetch the data a word at a time when it is suitably aligned.
205
* This makes for a useful speedup, at the cost of having to maintain
206
* four code paths (aligned vs unaligned, and little-endian vs big-endian).
207
* It also uses two separate mixing functions mix() and final(), instead
208
* of a slower multi-purpose function.
211
/* Get a bit mask of the bits set in non-uint32 aligned addresses */
212
#define UINT32_ALIGN_MASK (sizeof(uint32) - 1)
214
/* Rotate a uint32 value left by k bits - note multiple evaluation! */
215
#define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k))))
218
* mix -- mix 3 32-bit values reversibly.
220
* This is reversible, so any information in (a,b,c) before mix() is
221
* still in (a,b,c) after mix().
223
* If four pairs of (a,b,c) inputs are run through mix(), or through
224
* mix() in reverse, there are at least 32 bits of the output that
225
* are sometimes the same for one pair and different for another pair.
226
* This was tested for:
227
* * pairs that differed by one bit, by two bits, in any combination
228
* of top bits of (a,b,c), or in any combination of bottom bits of
230
* * "differ" is defined as +, -, ^, or ~^. For + and -, I transformed
231
* the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
232
* is commonly produced by subtraction) look like a single 1-bit
234
* * the base values were pseudorandom, all zero but one bit set, or
235
* all zero plus a counter that starts at zero.
237
* This does not achieve avalanche. There are input bits of (a,b,c)
238
* that fail to affect some output bits of (a,b,c), especially of a. The
239
* most thoroughly mixed value is c, but it doesn't really even achieve
242
* This allows some parallelism. Read-after-writes are good at doubling
243
* the number of bits affected, so the goal of mixing pulls in the opposite
244
* direction from the goal of parallelism. I did what I could. Rotates
245
* seem to cost as much as shifts on every machine I could lay my hands on,
246
* and rotates are much kinder to the top and bottom bits, so I used rotates.
251
a -= c; a ^= rot(c, 4); c += b; \
252
b -= a; b ^= rot(a, 6); a += c; \
253
c -= b; c ^= rot(b, 8); b += a; \
254
a -= c; a ^= rot(c,16); c += b; \
255
b -= a; b ^= rot(a,19); a += c; \
256
c -= b; c ^= rot(b, 4); b += a; \
260
* final -- final mixing of 3 32-bit values (a,b,c) into c
262
* Pairs of (a,b,c) values differing in only a few bits will usually
263
* produce values of c that look totally different. This was tested for
264
* * pairs that differed by one bit, by two bits, in any combination
265
* of top bits of (a,b,c), or in any combination of bottom bits of
267
* * "differ" is defined as +, -, ^, or ~^. For + and -, I transformed
268
* the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
269
* is commonly produced by subtraction) look like a single 1-bit
271
* * the base values were pseudorandom, all zero but one bit set, or
272
* all zero plus a counter that starts at zero.
274
* The use of separate functions for mix() and final() allow for a
275
* substantial performance increase since final() does not need to
276
* do well in reverse, but is does need to affect all output bits.
277
* mix(), on the other hand, does not need to affect all output
278
* bits (affecting 32 bits is enough). The original hash function had
279
* a single mixing operation that had to satisfy both sets of requirements
280
* and was slower as a result.
283
#define final(a,b,c) \
285
c ^= b; c -= rot(b,14); \
286
a ^= c; a -= rot(c,11); \
287
b ^= a; b -= rot(a,25); \
288
c ^= b; c -= rot(b,16); \
289
a ^= c; a -= rot(c, 4); \
290
b ^= a; b -= rot(a,14); \
291
c ^= b; c -= rot(b,24); \
295
* hash_any() -- hash a variable-length key into a 32-bit value
296
* k : the key (the unaligned variable-length array of bytes)
297
* len : the length of the key, counting by bytes
299
* Returns a uint32 value. Every bit of the key affects every bit of
300
* the return value. Every 1-bit and 2-bit delta achieves avalanche.
301
* About 6*len+35 instructions. The best hash table sizes are powers
302
* of 2. There is no need to do mod a prime (mod is sooo slow!).
303
* If you need less than 32 bits, use a bitmask.
305
* Note: we could easily change this function to return a 64-bit hash value
306
* by using the final values of both b and c. b is perhaps a little less
307
* well mixed than c, however.
310
hash_any(register const unsigned char *k, register int keylen)
317
/* Set up the internal state */
319
a = b = c = 0x9e3779b9 + len + 3923095;
321
/* If the source pointer is word-aligned, we use word-wide fetches */
322
if (((long) k & UINT32_ALIGN_MASK) == 0)
324
/* Code path for aligned source data */
325
register const uint32 *ka = (const uint32 *) k;
327
/* handle most of the key */
338
/* handle the last 11 bytes */
339
k = (const unsigned char *) ka;
340
#ifdef WORDS_BIGENDIAN
344
c += ((uint32) k[10] << 8);
347
c += ((uint32) k[9] << 16);
350
c += ((uint32) k[8] << 24);
351
/* the lowest byte of c is reserved for the length */
358
b += ((uint32) k[6] << 8);
361
b += ((uint32) k[5] << 16);
364
b += ((uint32) k[4] << 24);
370
a += ((uint32) k[2] << 8);
373
a += ((uint32) k[1] << 16);
376
a += ((uint32) k[0] << 24);
377
/* case 0: nothing left to add */
379
#else /* !WORDS_BIGENDIAN */
383
c += ((uint32) k[10] << 24);
386
c += ((uint32) k[9] << 16);
389
c += ((uint32) k[8] << 8);
390
/* the lowest byte of c is reserved for the length */
397
b += ((uint32) k[6] << 16);
400
b += ((uint32) k[5] << 8);
409
a += ((uint32) k[2] << 16);
412
a += ((uint32) k[1] << 8);
416
/* case 0: nothing left to add */
418
#endif /* WORDS_BIGENDIAN */
422
/* Code path for non-aligned source data */
424
/* handle most of the key */
427
#ifdef WORDS_BIGENDIAN
428
a += (k[3] + ((uint32) k[2] << 8) + ((uint32) k[1] << 16) + ((uint32) k[0] << 24));
429
b += (k[7] + ((uint32) k[6] << 8) + ((uint32) k[5] << 16) + ((uint32) k[4] << 24));
430
c += (k[11] + ((uint32) k[10] << 8) + ((uint32) k[9] << 16) + ((uint32) k[8] << 24));
431
#else /* !WORDS_BIGENDIAN */
432
a += (k[0] + ((uint32) k[1] << 8) + ((uint32) k[2] << 16) + ((uint32) k[3] << 24));
433
b += (k[4] + ((uint32) k[5] << 8) + ((uint32) k[6] << 16) + ((uint32) k[7] << 24));
434
c += (k[8] + ((uint32) k[9] << 8) + ((uint32) k[10] << 16) + ((uint32) k[11] << 24));
435
#endif /* WORDS_BIGENDIAN */
441
/* handle the last 11 bytes */
442
#ifdef WORDS_BIGENDIAN
443
switch (len) /* all the case statements fall through */
446
c += ((uint32) k[10] << 8);
448
c += ((uint32) k[9] << 16);
450
c += ((uint32) k[8] << 24);
451
/* the lowest byte of c is reserved for the length */
455
b += ((uint32) k[6] << 8);
457
b += ((uint32) k[5] << 16);
459
b += ((uint32) k[4] << 24);
463
a += ((uint32) k[2] << 8);
465
a += ((uint32) k[1] << 16);
467
a += ((uint32) k[0] << 24);
468
/* case 0: nothing left to add */
470
#else /* !WORDS_BIGENDIAN */
471
switch (len) /* all the case statements fall through */
474
c += ((uint32) k[10] << 24);
476
c += ((uint32) k[9] << 16);
478
c += ((uint32) k[8] << 8);
479
/* the lowest byte of c is reserved for the length */
481
b += ((uint32) k[7] << 24);
483
b += ((uint32) k[6] << 16);
485
b += ((uint32) k[5] << 8);
489
a += ((uint32) k[3] << 24);
491
a += ((uint32) k[2] << 16);
493
a += ((uint32) k[1] << 8);
496
/* case 0: nothing left to add */
498
#endif /* WORDS_BIGENDIAN */
503
/* report the result */
504
return UInt32GetDatum(c);
508
* hash_uint32() -- hash a 32-bit value
510
* This has the same result as
511
* hash_any(&k, sizeof(uint32))
512
* but is faster and doesn't force the caller to store k into memory.
515
hash_uint32(uint32 k)
521
a = b = c = 0x9e3779b9 + (uint32) sizeof(uint32) + 3923095;
526
/* report the result */
527
return UInt32GetDatum(c);