   file, but you are not obligated to do so.  If you do not wish to do
   so, delete this exception statement from your version.  */
/* With -DSTANDALONE, this file can be compiled outside Wget source
   tree.  To test, also use -DTEST.  */

#ifdef HAVE_CONFIG_H
# include <config.h>
#endif

/* If running without Autoconf, go ahead and assume presence of
   standard C89 headers.  */
#include <string.h>
#include <stdlib.h>
#include <assert.h>

#ifndef STANDALONE
/* Get Wget's utility headers. */
# include "wget.h"
# include "utils.h"
#else
/* Make do without them. */
# define xnew(x) xmalloc (sizeof (x))
# define xnew_array(type, x) xmalloc (sizeof (type) * (x))
# define xmalloc malloc         /* or something that exits
                                   if not enough memory */
# define xfree free
# define countof(x) (sizeof (x) / sizeof ((x)[0]))
# define TOLOWER(x) ('A' <= (x) && (x) <= 'Z' ? (x) - 32 : (x))
#endif
/* INTERFACE:

   Hash tables are a technique used to implement mapping between
   objects with near-constant-time access and storage.  The table
   associates keys to values, and a value can be very quickly
   retrieved by providing the key.  Fast lookup tables are typically
   implemented as hash tables.

   This file defines the following entry points:

     hash_table_new       -- creates the table.
     hash_table_destroy   -- destroys the table.
     hash_table_put       -- establishes or updates key->value mapping.
     hash_table_get       -- retrieves value of key.
     hash_table_get_pair  -- get key/value pair for key.
     hash_table_contains  -- test whether the table contains key.
     hash_table_remove    -- remove the key->value mapping for key.
     hash_table_map       -- iterate through table mappings.
     hash_table_clear     -- clear hash table contents.
     hash_table_count     -- return the number of entries in the table.

   The hash table grows internally as new entries are added and is not
   limited in size, except by available memory.  The table doubles
   with each resize, which ensures that the amortized time per
   operation remains constant.

   By default, tables created by hash_table_new consider the keys to
   be equal if their pointer values are the same.  You can use
   make_string_hash_table to create tables whose keys are considered
   equal if their string contents are the same.  In the general case,
   the criterion of equality used to compare keys is specified at
   table creation time with two callback functions, "hash" and "test".
   The hash function transforms the key into an arbitrary number that
   must be the same for two equal keys.  The test function accepts two
   keys and returns non-zero if they are to be considered equal.

   Note that neither keys nor values are copied when inserted into the
   hash table, so they must exist for the lifetime of the table.  This
   means that e.g. the use of static strings is OK, but objects with a
   shorter life-time need to be copied (with strdup or the like in the
   case of strings) before being inserted.  */
126
113
/* IMPLEMENTATION:
128
All the hash mappings (key-value pairs of pointers) are stored in a
129
contiguous array. The position of each mapping is determined by
130
the hash value of its key and the size of the table: location :=
131
hash(key) % size. If two different keys end up on the same
132
position (hash collision), the one that came second is placed at
133
the next empty position following the occupied place. This
134
collision resolution technique is called "linear probing".
136
There are more advanced collision resolution mechanisms (quadratic
115
The hash table is implemented as an open-addressed table with
116
linear probing collision resolution.
118
The above means that all the hash entries (pairs of pointers, key
119
and value) are stored in a contiguous array. The position of each
120
mapping is determined by the hash value of its key and the size of
121
the table: location := hash(key) % size. If two different keys end
122
up on the same position (collide), the one that came second is
123
placed at the next empty position following the occupied place.
124
This collision resolution technique is called "linear probing".
126
There are more advanced collision resolution methods (quadratic
137
127
probing, double hashing), but we don't use them because they incur
138
more non-sequential access to the array, which results in worse
128
more non-sequential access to the array, which results in worse CPU
139
129
cache behavior. Linear probing works well as long as the
140
fullness/size ratio is kept below 75%. We make sure to regrow or
141
rehash the hash table whenever this threshold is exceeded.
143
Collisions make deletion tricky because finding collisions again
144
relies on new empty spots not being created. That's why
145
hash_table_remove is careful to rehash the mappings that follow the
148
/* When hash table's fullness exceeds this threshold, the hash table
150
#define HASH_FULLNESS_THRESHOLD 0.75
152
/* The hash table size is multiplied by this factor with each resize.
153
This guarantees infrequent resizes. */
130
count/size ratio (fullness) is kept below 75%. We make sure to
131
grow and rehash the table whenever this threshold is exceeded.
133
Collisions make deletion tricky because clearing a position
134
followed by a colliding entry would make the position seem empty
135
and the colliding entry not found. One solution is to leave a
136
"tombstone" instead of clearing the entry, and another is to
137
carefully rehash the entries immediately following the deleted one.
138
We use the latter method because it results in less bookkeeping and
139
faster retrieval at the (slight) expense of deletion. */
141
/* Maximum allowed fullness: when hash table's fullness exceeds this
142
value, the table is resized. */
143
#define HASH_MAX_FULLNESS 0.75
145
/* The hash table size is multiplied by this factor (and then rounded
146
to the next prime) with each resize. This guarantees infrequent
154
148
#define HASH_RESIZE_FACTOR 2
typedef unsigned long (*hashfun_t) PARAMS ((const void *));
typedef int (*testfun_t) PARAMS ((const void *, const void *));

struct hash_table {
  hashfun_t hash_function;
  testfun_t test_function;

  struct mapping *mappings;     /* pointer to the table entries. */
  int size;                     /* size of the array. */

  int count;                    /* number of non-empty entries. */
  int resize_threshold;         /* after count exceeds this number of
                                   entries, resize the table. */
  int prime_offset;             /* the offset of the current prime in
                                   the prime table. */
};
/* We use the all-bits-set constant (INVALID_PTR) to mark a mapping as
   empty.  It is unaligned and therefore illegal as a pointer.
   INVALID_PTR_BYTE (0xff) is the one-byte value used to initialize
   the mappings array as empty.

   The all-bits-set value is a better choice than NULL because it
   allows the use of NULL/0 keys.  Since the keys are either integers
   or pointers, the only key that cannot be used is the integer value
   -1.  This is acceptable because it still allows the use of
   nonnegative integer keys.  */

#define INVALID_PTR ((void *) ~(unsigned long) 0)
#ifndef UCHAR_MAX
# define UCHAR_MAX 0xff
#endif
#define INVALID_PTR_BYTE UCHAR_MAX

#define NON_EMPTY(mp) ((mp)->key != INVALID_PTR)
#define MARK_AS_EMPTY(mp) ((mp)->key = INVALID_PTR)
/* "Next" mapping is the mapping after MP, but wrapping back to
   MAPPINGS when MP would reach MAPPINGS+SIZE.  */
#define NEXT_MAPPING(mp, mappings, size) \
  (mp != mappings + (size - 1) ? mp + 1 : mappings)

#define LOOP_NON_EMPTY(mp, mappings, size) \
  for (; NON_EMPTY (mp); mp = NEXT_MAPPING (mp, mappings, size))

/* #### We might want to multiply with the "golden ratio" here to get
   better randomness for keys that do not result from a good hash
   function.  This is currently not a problem in Wget because we only
   use the string hash tables.  */
/* Return the position of KEY in hash table SIZE large, hash function
   being HASHFUN.  */
#define HASH_POSITION(key, hashfun, size) ((hashfun) (key) % size)

/* Find a prime near, but greater than or equal to SIZE.  The primes
   are not calculated, but looked up from a table with a selection of
   primes convenient for this purpose.

   PRIME_OFFSET is a minor optimization: it specifies the start
   position for the search for a large enough prime.  The final offset
   is stored in the same variable.  That way the list of primes does
   not have to be scanned from the beginning each time around.  */
static int
prime_size (int size, int *prime_offset)
{
  static const int primes[] = {
    13, 19, 29, 41, 59, 79, 107, 149, 197, 263, 347, 457, 599, 787, 1031,
    1361, 1777, 2333, 3037, 3967, 5167, 6719, 8737, 11369, 14783,
    19219, 24989, 32491, 42257, 54941, 71429, 92861, 120721, 156941,
    10445899, 13579681, 17653589, 22949669, 29834603, 38784989,
    50420551, 65546729, 85210757, 110774011, 144006217, 187208107,
    243370577, 316381771, 411296309, 534685237, 695090819, 903618083,
    1174703521, 1527114613, 1837299131, 2147483647
  };
  int i;

  for (i = *prime_offset; i < countof (primes); i++)
    if (primes[i] >= size)
      {
        /* Set the offset to the next prime.  That is safe because,
           next time we are called, we will need a prime larger than
           this one anyway.  */
        *prime_offset = i + 1;
        return primes[i];
      }

  abort ();
  return -1;
}
struct hash_table *
hash_table_new (int items,
                unsigned long (*hash_function) (const void *),
                int (*test_function) (const void *, const void *))
{
  int size;
  struct hash_table *ht = xnew (struct hash_table);

  ht->hash_function = hash_function ? hash_function : hash_pointer;
  ht->test_function = test_function ? test_function : cmp_pointer;

  /* If the size of struct hash_table ever becomes a concern, this
     field can go.  (Wget doesn't create many hashes.)  */
  ht->prime_offset = 0;

  /* Calculate the size that ensures that the table will store at
     least ITEMS keys without the need to resize.  */
  size = 1 + items / HASH_MAX_FULLNESS;
  size = prime_size (size, &ht->prime_offset);
  ht->size = size;
  ht->resize_threshold = size * HASH_MAX_FULLNESS;
  /*assert (ht->resize_threshold >= items);*/

  ht->mappings = xnew_array (struct mapping, ht->size);
  /* Mark mappings as empty.  We use 0xff rather than 0 to mark empty
     keys because it allows us to use NULL/0 as keys.  */
  memset (ht->mappings, INVALID_PTR_BYTE, size * sizeof (struct mapping));

  ht->count = 0;

  return ht;
}
  ht->size = newsize;
  ht->resize_threshold = newsize * HASH_MAX_FULLNESS;

  mappings = xnew_array (struct mapping, newsize);
  memset (mappings, INVALID_PTR_BYTE, newsize * sizeof (struct mapping));
  ht->mappings = mappings;

  for (mp = old_mappings; mp < old_end; mp++)
    if (NON_EMPTY (mp))
      {
        struct mapping *new_mp;
        /* We don't need to test for uniqueness of keys because they
           come from the hash table and are therefore known to be
           unique.  */
        new_mp = mappings + HASH_POSITION (mp->key, hasher, newsize);
        LOOP_NON_EMPTY (new_mp, mappings, newsize)
          ;
        *new_mp = *mp;
      }
  int size = ht->size;
  struct mapping *mappings = ht->mappings;
  hashfun_t hasher = ht->hash_function;

      /* Rehash all the entries following MP.  The alternative
         approach is to mark the entry as deleted, i.e. create a
         "tombstone".  That speeds up removal, but leaves a lot of
         garbage and slows down hash_table_get and hash_table_put.  */

      mp = NEXT_MAPPING (mp, mappings, size);
      LOOP_NON_EMPTY (mp, mappings, size)
        {
          const void *key2 = mp->key;
          struct mapping *mp_new;

          /* Find the new location for the key. */
          mp_new = mappings + HASH_POSITION (key2, hasher, size);
          LOOP_NON_EMPTY (mp_new, mappings, size)
            if (key2 == mp_new->key)
              /* The mapping MP (key2) is already where we want it (in
                 MP_NEW's "chain" of mappings).  */
              goto next_rehash;

          *mp_new = *mp;
          MARK_AS_EMPTY (mp);

        next_rehash:
          ;
        }
   don't strictly belong to this file.  However, this is as good a
   place for them as any.  */
/* Guidelines for creating custom hash and test functions:

   - The test function returns non-zero for keys that are considered
     "equal", zero otherwise.

   - The hash function returns a number that represents the
     "distinctness" of the object.  In more precise terms, it means
     that for any two objects that test "equal" under the test
     function, the hash function MUST produce the same result.

     This does not mean that all different objects must produce
     different values (that would be "perfect" hashing), only that
     non-distinct objects must produce the same values!  For instance,
     a hash function that returns 0 for any given object is a
     perfectly valid (albeit extremely bad) hash function.  A hash
     function that hashes a string by adding up all its characters is
     another example of a valid (but quite bad) hash function.

     It is not hard to make hash and test functions agree about
     equality.  For example, if the test function compares strings
     case-insensitively, the hash function can lower-case the
     characters when calculating the hash value.  That ensures that
     two strings differing only in case will hash the same.

   - If you care about performance, choose a hash function with as
     good "spreading" as possible.  A good hash function will use all
     the bits of the input when calculating the hash, and will react
     to even small changes in input with a completely different
     output.  Finally, don't make the hash function itself overly
     slow, because you'll be incurring a non-negligible overhead to
     all hash table operations.  */
/* Support for hash tables whose keys are strings.  */