2
Copyright (c) 2011 Giorgio Gonnella <gonnella@zbh.uni-hamburg.de>
3
Copyright (c) 2011 Center for Bioinformatics, University of Hamburg
5
Permission to use, copy, modify, and distribute this software for any
6
purpose with or without fee is hereby granted, provided that the above
7
copyright notice and this permission notice appear in all copies.
9
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12
ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
25
#include "core/safearith.h"
26
#include "extended/md5set.h"
27
#include "extended/md5set_primes_table.h"
28
#include "extended/reverse.h"
34
#define GT_MD5_T_EQUAL(A,B) \
35
((A).l == (B).l && (A).h == (B).h)
37
#define GT_MD5_T_IS_EMPTY(A) \
38
((A).l == 0 && (A).h == 0)
40
#define GT_MD5SET_TOO_LARGE \
41
"fatal: no prime number larger than %llu in lookup table\n" \
42
"developers: modify scripts/makeprimestable.sh to create a larger table\n"
45
/* if n is in lookup table return it;
46
* otherwise return first element in lookup table larger than n */
47
static unsigned long gt_md5set_get_size(unsigned long n)
49
unsigned long long u, l, i, k, k_i;
51
k = (unsigned long long)n;
52
if (k > GT_MD5SET_LARGEST_PRIME)
54
fprintf(stderr, GT_MD5SET_TOO_LARGE, k);
57
if (k < gt_md5set_primes[0])
58
return (unsigned long)gt_md5set_primes[0];
60
u = GT_MD5SET_NOFPRIMES - 1;
66
return (gt_md5set_primes[l] == k)
68
: (unsigned long)gt_md5set_primes[u];
70
k_i = gt_md5set_primes[i];
81
return (unsigned long)k_i;
94
unsigned long maxfill;
95
/* string temp buffer */
97
unsigned long bufsize;
100
#define GT_MD5SET_PREPARE_INSERTION(SET) \
102
if ((SET)->fill > (SET)->maxfill) \
103
gt_md5set_alloc_table((SET), \
104
/* using alloc + 1, the next value in the lookup table is returned */ \
105
gt_md5set_get_size((SET)->alloc + 1));\
106
gt_assert((SET)->fill <= (SET)->maxfill)
108
static void gt_md5set_alloc_table(GtMD5Set *set, unsigned long newsize);
110
enum GtMD5SetSearchResult
117
static inline enum GtMD5SetSearchResult gt_md5set_search_pos(GtMD5Set *set,
118
gt_md5_t k, bool insert_if_not_found, unsigned long i)
120
if (GT_MD5_T_IS_EMPTY(set->table[i]))
122
if (insert_if_not_found)
124
GT_MD5SET_PREPARE_INSERTION(set);
127
return GT_MD5SET_EMPTY;
129
return (GT_MD5_T_EQUAL(k, set->table[i]))
130
? GT_MD5SET_KEY_FOUND
131
: GT_MD5SET_COLLISION;
134
#define GT_MD5SET_H1(MD5, TABLE_SIZE) \
135
((MD5).l % (TABLE_SIZE))
137
/* h2 result must be relatively prime to table_size;
138
any value 1 < x <= table_size - 1 is it, because table_size is prime */
139
#define GT_MD5SET_H2(MD5, TABLE_SIZE) \
140
(((MD5).h % ((TABLE_SIZE) - 1)) + 1)
142
static bool gt_md5set_search(GtMD5Set *set, gt_md5_t k,
143
bool insert_if_not_found)
147
unsigned long first_i;
149
enum GtMD5SetSearchResult retval;
151
i = (unsigned long)GT_MD5SET_H1(k, set->alloc);
152
retval = gt_md5set_search_pos(set, k, insert_if_not_found, i);
153
if (retval != GT_MD5SET_COLLISION)
154
return retval == GT_MD5SET_EMPTY ? false : true;
156
/* open addressing by double hashing */
157
c = (unsigned long)GT_MD5SET_H2(k, set->alloc);
164
i = (i + c) % set->alloc;
165
gt_assert(i != first_i);
166
retval = gt_md5set_search_pos(set, k, insert_if_not_found, i);
167
if (retval != GT_MD5SET_COLLISION)
168
return retval == GT_MD5SET_EMPTY ? false : true;
172
static void gt_md5set_rehash(GtMD5Set *set, gt_md5_t *oldtable,
173
unsigned long oldsize)
177
for (i = 0; i < oldsize; i++)
178
if (!GT_MD5_T_IS_EMPTY(oldtable[i]))
179
(void)gt_md5set_search(set, oldtable[i], true);
182
#define GT_MD5SET_MAX_LOAD_FACTOR 0.8
184
static void gt_md5set_alloc_table(GtMD5Set *set, unsigned long newsize)
187
unsigned long oldsize;
189
oldsize = set->alloc;
190
oldtable = set->table;
191
set->alloc = newsize;
193
(unsigned long)((double)newsize * GT_MD5SET_MAX_LOAD_FACTOR);
194
set->table = gt_calloc((size_t)newsize, sizeof (gt_md5_t));
195
if (oldtable != NULL)
197
gt_log_log("rehashing %lu elements; old size: %lu, new size: %lu\n",
198
set->fill, oldsize, newsize);
199
gt_md5set_rehash(set, oldtable, oldsize);
204
GtMD5Set *gt_md5set_new(unsigned long nof_elements)
207
md5set = gt_malloc(sizeof (GtMD5Set));
210
md5set->table = NULL;
211
gt_md5set_alloc_table(md5set, gt_md5set_get_size(nof_elements +
212
(nof_elements >> 2)));
213
gt_assert(nof_elements < md5set->maxfill);
214
md5set->buffer = NULL;
219
void gt_md5set_delete(GtMD5Set *set)
224
gt_free(set->buffer);
229
static void gt_md5set_prepare_buffer(GtMD5Set *md5set, unsigned long bufsize)
231
gt_assert(md5set != NULL);
233
if (md5set->buffer == NULL)
235
md5set->buffer = gt_malloc(sizeof (char) * bufsize);
236
md5set->bufsize = bufsize;
238
else if (md5set->bufsize < bufsize)
240
md5set->buffer = gt_realloc(md5set->buffer, sizeof (char) * bufsize);
241
md5set->bufsize = bufsize;
245
#define GT_MD5SET_HASH_STRING(BUF, LEN, MD5) \
246
md5((BUF), gt_safe_cast2long(LEN), (char*)&(MD5))
248
GtMD5SetStatus gt_md5set_add_sequence(GtMD5Set *set, const char* seq,
249
unsigned long seqlen, bool both_strands,
252
gt_md5_t md5sum, md5sum_rc;
257
gt_assert(set != NULL);
258
gt_assert(set->table != NULL);
260
gt_md5set_prepare_buffer(set, seqlen);
261
for (i = 0; i < seqlen; i++)
262
set->buffer[i] = toupper(seq[i]);
264
GT_MD5SET_HASH_STRING(set->buffer, seqlen, md5sum);
265
found = gt_md5set_search(set, md5sum, true);
267
return GT_MD5SET_FOUND;
271
retval = gt_reverse_complement(set->buffer, seqlen, err);
274
gt_assert(retval < 0);
275
return GT_MD5SET_ERROR;
278
GT_MD5SET_HASH_STRING(set->buffer, seqlen, md5sum_rc);
279
found = gt_md5set_search(set, md5sum_rc, false);
281
return GT_MD5SET_RC_FOUND;
284
return GT_MD5SET_NOT_FOUND;