2
* Integer hashing tests. These functions work with 32-bit integers, so are
3
* perfectly suited for IPv4 addresses. A few tests show that they may also
4
* be chained for larger keys (eg: IPv6), this way :
5
* f(x[0-3]) = f(f(f(f(x[0])^x[1])^x[2])^x[3])
7
* See also bob jenkin's site for more info on hashing, and check perfect
8
* hashing for constants (eg: header names).
13
#include <arpa/inet.h>
20
int counts_id[NSERV][NSERV];
21
uint32_t hash_id( uint32_t a)
26
/* Full-avalanche integer hashing function from Thomas Wang, suitable for use
27
* with a modulo. See below, worth a read !
28
* http://www.concentric.net/~Ttwang/tech/inthash.htm
30
* See also tests performed by Bob Jenkins (says it's faster than his) :
31
* http://burtleburtle.net/bob/hash/integer.html
33
* This function is small and fast. It does not seem as smooth as bj6 though.
34
* About 0x40 bytes, 6 shifts.
36
int counts_tw1[NSERV][NSERV];
37
uint32_t hash_tw1(uint32_t a)
48
/* Thomas Wang's mix function. The multiply is optimized away by the compiler
50
* It is about equivalent to the one above.
52
int counts_tw2[NSERV][NSERV];
53
uint32_t hash_tw2(uint32_t a)
64
/* Thomas Wang's multiplicative hash function. About 0x30 bytes, and it is
65
* extremely fast on recent processors with a fast multiply. However, it
66
* must not be used on low bits only, as multiples of 0x00100010 only return
69
int counts_tw3[NSERV][NSERV];
70
uint32_t hash_tw3(uint32_t a)
72
a = (a ^ 61) ^ (a >> 16);
81
/* Full-avalanche integer hashing function from Bob Jenkins, suitable for use
82
* with a modulo. It has a very smooth distribution.
83
* http://burtleburtle.net/bob/hash/integer.html
84
* About 0x50 bytes, 6 shifts.
86
int counts_bj6[NSERV][NSERV];
87
int counts_bj6x[NSERV][NSERV];
88
uint32_t hash_bj6(uint32_t a)
90
a = (a+0x7ed55d16) + (a<<12);
91
a = (a^0xc761c23c) ^ (a>>19);
92
a = (a+0x165667b1) + (a<<5);
93
a = (a+0xd3a2646c) ^ (a<<9);
94
a = (a+0xfd7046c5) + (a<<3);
95
a = (a^0xb55a4f09) ^ (a>>16);
99
/* Similar function with one more shift and no magic number. It is slightly
100
* slower but provides the overall smoothest distribution.
101
* About 0x40 bytes, 7 shifts.
103
int counts_bj7[NSERV][NSERV];
104
int counts_bj7x[NSERV][NSERV];
105
uint32_t hash_bj7(uint32_t a)
118
void count_hash_results(unsigned long hash, int counts[NSERV][NSERV]) {
121
for (nsrv = 0; nsrv < NSERV; nsrv++) {
122
srv = hash % (nsrv + 1);
127
void dump_hash_results(char *name, int counts[NSERV][NSERV]) {
129
double err, total_err, max_err;
131
printf("%s:\n", name);
132
for (nsrv = 0; nsrv < NSERV; nsrv++) {
135
printf("%02d srv: ", nsrv+1);
136
for (srv = 0; srv <= nsrv; srv++) {
137
err = 100.0*(counts[nsrv][srv] - (double)counts[0][0]/(nsrv+1)) / (double)counts[0][0];
138
//printf("%6d ", counts[nsrv][srv]);
139
printf("% 3.1f%%%c ", err,
140
counts[nsrv][srv]?' ':'*'); /* display '*' when a server is never selected */
146
total_err /= (double)(nsrv+1);
147
for (srv = nsrv+1; srv < NSERV; srv++)
149
printf(" avg_err=%3.1f, max_err=%3.1f\n", total_err, max_err);
156
unsigned int address = 0;
157
unsigned int mask = ~0;
159
memset(counts_id, 0, sizeof(counts_id));
160
memset(counts_tw1, 0, sizeof(counts_tw1));
161
memset(counts_tw2, 0, sizeof(counts_tw2));
162
memset(counts_tw3, 0, sizeof(counts_tw3));
163
memset(counts_bj6, 0, sizeof(counts_bj6));
164
memset(counts_bj7, 0, sizeof(counts_bj7));
166
address = 0x10000000;
167
mask = 0xffffff00; // user mask to apply to addresses
168
for (nr = 0; nr < 0x10; nr++) {
169
//address += ~nr; // semi-random addresses.
171
address += 0x00000100;
172
//address += 0x11111111;
178
//address += 0x00100010; // this increment kills tw3 !
179
count_hash_results(hash_id (address & mask), counts_id); // 0.69s / 100M
180
count_hash_results(hash_tw1(address & mask), counts_tw1); // 1.04s / 100M
181
count_hash_results(hash_tw2(address & mask), counts_tw2); // 1.13s / 100M
182
count_hash_results(hash_tw3(address & mask), counts_tw3); // 1.01s / 100M
183
count_hash_results(hash_bj6(address & mask), counts_bj6); // 1.07s / 100M
184
count_hash_results(hash_bj7(address & mask), counts_bj7); // 1.20s / 100M
185
/* adding the original address after the hash reduces the error
186
* rate in in presence of very small data sets (eg: 16 source
187
* addresses for 8 servers). In this case, bj7 is very good.
189
count_hash_results(hash_bj6(address & mask)+(address&mask), counts_bj6x); // 1.07s / 100M
190
count_hash_results(hash_bj7(address & mask)+(address&mask), counts_bj7x); // 1.20s / 100M
193
dump_hash_results("hash_id", counts_id);
194
dump_hash_results("hash_tw1", counts_tw1);
195
dump_hash_results("hash_tw2", counts_tw2);
196
dump_hash_results("hash_tw3", counts_tw3);
197
dump_hash_results("hash_bj6", counts_bj6);
198
dump_hash_results("hash_bj6x", counts_bj6x);
199
dump_hash_results("hash_bj7", counts_bj7);
200
dump_hash_results("hash_bj7x", counts_bj7x);