2
Copyright (c) 2008-2010 Sascha Steinbiss <steinbiss@zbh.uni-hamburg.de>
3
Copyright (c) 2008-2010 Center for Bioinformatics, University of Hamburg
5
Permission to use, copy, modify, and distribute this software for any
6
purpose with or without fee is hereby granted, provided that the above
7
copyright notice and this permission notice appear in all copies.
9
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12
ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
19
#include "core/assert_api.h"
20
#include "core/class_alloc.h"
21
#include "core/array.h"
22
#include "core/ensure.h"
24
#include "core/thread.h"
25
#include "core/unused_api.h"
26
#include "annotationsketch/feature_index_rep.h"
27
#include "annotationsketch/feature_visitor.h"
28
#include "extended/feature_node.h"
29
#ifdef GT_THREADS_ENABLED
30
#include "extended/genome_node.h"
32
#include "extended/genome_node.h"
33
#include "extended/gff3_in_stream.h"
35
struct GtFeatureIndexClass {
37
GtFeatureIndexAddRegionNodeFunc add_region_node;
38
GtFeatureIndexAddFeatureNodeFunc add_feature_node;
39
GtFeatureIndexGetFeatsForSeqidFunc get_features_for_seqid;
40
GtFeatureIndexGetFeatsForRangeFunc get_features_for_range;
41
GtFeatureIndexGetFirstSeqidFunc get_first_seqid;
42
GtFeatureIndexGetSeqidsFunc get_seqids;
43
GtFeatureIndexGetRangeForSeqidFunc get_range_for_seqid;
44
GtFeatureIndexHasSeqidFunc has_seqid;
45
GtFeatureIndexFreeFunc free;
48
struct GtFeatureIndexMembers {
49
unsigned int reference_count;
53
const GtFeatureIndexClass* gt_feature_index_class_new(size_t size,
54
GtFeatureIndexAddRegionNodeFunc
56
GtFeatureIndexAddFeatureNodeFunc
58
GtFeatureIndexGetFeatsForSeqidFunc
59
get_features_for_seqid,
60
GtFeatureIndexGetFeatsForRangeFunc
61
get_features_for_range,
62
GtFeatureIndexGetFirstSeqidFunc
64
GtFeatureIndexGetSeqidsFunc
66
GtFeatureIndexGetRangeForSeqidFunc
68
GtFeatureIndexHasSeqidFunc
70
GtFeatureIndexFreeFunc
73
GtFeatureIndexClass *c_class = gt_class_alloc(sizeof *c_class);
75
c_class->add_region_node = add_region_node;
76
c_class->add_feature_node = add_feature_node;
77
c_class->get_features_for_seqid = get_features_for_seqid;
78
c_class->get_features_for_range = get_features_for_range;
79
c_class->get_first_seqid = get_first_seqid;
80
c_class->get_seqids = get_seqids;
81
c_class->get_range_for_seqid = get_range_for_seqid;
82
c_class->has_seqid = has_seqid;
87
GtFeatureIndex* gt_feature_index_create(const GtFeatureIndexClass *fic)
90
gt_assert(fic && fic->size);
91
fi = gt_calloc(1, fic->size);
93
fi->pvt = gt_calloc(1, sizeof (GtFeatureIndexMembers));
94
fi->pvt->lock = gt_rwlock_new();
98
GtFeatureIndex* gt_feature_index_ref(GtFeatureIndex *fi)
101
gt_rwlock_wrlock(fi->pvt->lock);
102
fi->pvt->reference_count++;
103
gt_rwlock_unlock(fi->pvt->lock);
107
void gt_feature_index_delete(GtFeatureIndex *fi)
110
gt_rwlock_wrlock(fi->pvt->lock);
111
if (fi->pvt->reference_count) {
112
fi->pvt->reference_count--;
113
gt_rwlock_unlock(fi->pvt->lock);
116
gt_assert(fi->c_class);
117
if (fi->c_class->free)
118
fi->c_class->free(fi);
119
gt_rwlock_unlock(fi->pvt->lock);
120
gt_rwlock_delete(fi->pvt->lock);
125
void gt_feature_index_add_region_node(GtFeatureIndex *feature_index,
126
GtRegionNode *region_node)
128
gt_assert(feature_index && feature_index->c_class && region_node);
129
gt_rwlock_wrlock(feature_index->pvt->lock);
130
feature_index->c_class->add_region_node(feature_index, region_node);
131
gt_rwlock_unlock(feature_index->pvt->lock);
134
void gt_feature_index_add_feature_node(GtFeatureIndex *feature_index,
135
GtFeatureNode *feature_node)
137
gt_assert(feature_index && feature_index->c_class && feature_node);
138
gt_rwlock_wrlock(feature_index->pvt->lock);
139
feature_index->c_class->add_feature_node(feature_index, feature_node);
140
gt_rwlock_unlock(feature_index->pvt->lock);
143
int gt_feature_index_add_gff3file(GtFeatureIndex *feature_index,
144
const char *gff3file, GtError *err)
146
GtNodeStream *gff3_in_stream;
152
gt_assert(feature_index && gff3file);
153
tmp = gt_array_new(sizeof (GtGenomeNode*));
154
gff3_in_stream = gt_gff3_in_stream_new_unsorted(1, &gff3file);
155
while (!(had_err = gt_node_stream_next(gff3_in_stream, &gn, err)) && gn)
156
gt_array_add(tmp, gn);
158
GtNodeVisitor *feature_visitor = gt_feature_visitor_new(feature_index);
159
for (i=0;i<gt_array_size(tmp);i++) {
160
gn = *(GtGenomeNode**) gt_array_get(tmp, i);
161
/* no need to lock, add_*_node() is synchronized */
162
had_err = gt_genome_node_accept(gn, feature_visitor, NULL);
163
gt_assert(!had_err); /* cannot happen */
165
gt_node_visitor_delete(feature_visitor);
167
gt_node_stream_delete(gff3_in_stream);
168
for (i=0;i<gt_array_size(tmp);i++)
169
gt_genome_node_delete(*(GtGenomeNode**) gt_array_get(tmp, i));
170
gt_array_delete(tmp);
174
GtArray* gt_feature_index_get_features_for_seqid(GtFeatureIndex *fi,
178
gt_assert(fi && fi->c_class && seqid);
179
gt_rwlock_rdlock(fi->pvt->lock);
180
arr = fi->c_class->get_features_for_seqid(fi, seqid);
181
gt_rwlock_unlock(fi->pvt->lock);
185
int gt_feature_index_get_features_for_range(GtFeatureIndex *feature_index,
188
const GtRange *range, GtError *err)
191
gt_assert(feature_index && feature_index->c_class && results && seqid &&
193
gt_assert(gt_range_length(range) > 0);
194
gt_rwlock_rdlock(feature_index->pvt->lock);
195
ret = feature_index->c_class->get_features_for_range(feature_index, results,
197
gt_rwlock_unlock(feature_index->pvt->lock);
201
const char* gt_feature_index_get_first_seqid(const GtFeatureIndex
205
gt_assert(feature_index && feature_index->c_class);
206
gt_rwlock_rdlock(feature_index->pvt->lock);
207
str = feature_index->c_class->get_first_seqid(feature_index);
208
gt_rwlock_unlock(feature_index->pvt->lock);
212
GtStrArray* gt_feature_index_get_seqids(const GtFeatureIndex *feature_index)
215
gt_assert(feature_index && feature_index->c_class);
216
gt_rwlock_rdlock(feature_index->pvt->lock);
217
strarr = feature_index->c_class->get_seqids(feature_index);
218
gt_rwlock_unlock(feature_index->pvt->lock);
222
void gt_feature_index_get_range_for_seqid(GtFeatureIndex *feature_index,
223
GtRange *range, const char *seqid)
225
gt_assert(feature_index && feature_index->c_class && range && seqid);
226
gt_rwlock_rdlock(feature_index->pvt->lock);
227
feature_index->c_class->get_range_for_seqid(feature_index, range, seqid);
228
gt_rwlock_unlock(feature_index->pvt->lock);
231
bool gt_feature_index_has_seqid(const GtFeatureIndex *feature_index,
235
gt_assert(feature_index && feature_index->c_class && seqid);
236
gt_rwlock_rdlock(feature_index->pvt->lock);
237
ret = feature_index->c_class->has_seqid(feature_index, seqid);
238
gt_rwlock_unlock(feature_index->pvt->lock);
242
void* gt_feature_index_cast(GT_UNUSED const GtFeatureIndexClass *fic,
245
gt_assert(fic && fi && fi->c_class == fic);
249
#define GT_FI_TEST_FEATURES_PER_THREAD 1000
250
#define GT_FI_TEST_START 1
251
#define GT_FI_TEST_END 10000000
252
#define GT_FI_TEST_FEATURE_WIDTH 2000
253
#define GT_FI_TEST_QUERY_WIDTH 50000
254
#define GT_FI_TEST_SEQID "testseqid"
261
unsigned long next_node_idx,
263
} GtFeatureIndexTestShared;
265
static void* gt_feature_index_unit_test_add(void *data)
267
GtFeatureNode *feature;
268
GtFeatureIndexTestShared *shm = (GtFeatureIndexTestShared*) data;
271
gt_mutex_lock(shm->mutex);
272
if (shm->next_node_idx == GT_FI_TEST_FEATURES_PER_THREAD * gt_jobs) {
273
gt_mutex_unlock(shm->mutex);
276
gt_assert(shm->next_node_idx < gt_array_size(shm->nodes));
277
feature = *(GtFeatureNode**) gt_array_get(shm->nodes, shm->next_node_idx++);
278
gt_mutex_unlock(shm->mutex);
279
gt_feature_index_add_feature_node(shm->fi, feature);
280
gt_genome_node_delete((GtGenomeNode*) feature);
285
static int cmp_range_start(const void *v1, const void *v2)
287
return gt_genome_node_compare((GtGenomeNode**) v1, (GtGenomeNode**) v2);
290
static void* gt_feature_index_unit_test_query(void *data)
292
GtFeatureIndexTestShared *shm = (GtFeatureIndexTestShared*) data;
294
GtError *err = shm->err;
297
GtArray *arr, *arr_ref;
299
gt_mutex_lock(shm->mutex);
300
if (gt_error_is_set(shm->err)) {
301
gt_mutex_unlock(shm->mutex);
304
gt_mutex_unlock(shm->mutex);
306
arr = gt_array_new(sizeof (GtFeatureNode*));
307
arr_ref = gt_array_new(sizeof (GtFeatureNode*));
308
rng.start = random() % (GT_FI_TEST_END - GT_FI_TEST_QUERY_WIDTH);
309
rng.end = rng.start + random() % (GT_FI_TEST_QUERY_WIDTH);
311
/* get reference set by linear search */
312
gt_mutex_lock(shm->mutex);
313
for (i=0;i<GT_FI_TEST_FEATURES_PER_THREAD * gt_jobs;i++) {
316
fn = *(GtFeatureNode**) gt_array_get(shm->nodes, i);
317
rng2 = gt_genome_node_get_range((GtGenomeNode*) fn);
318
if (gt_range_overlap(&rng, &rng2)) {
319
gt_array_add(arr_ref, fn);
322
gt_mutex_unlock(shm->mutex);
324
/* query feature index */
325
gt_feature_index_get_features_for_range(shm->fi, arr, GT_FI_TEST_SEQID, &rng,
328
/* result size must be equal */
329
if (gt_array_size(arr) != gt_array_size(arr_ref))
332
/* nodes must be the same (note that we should not rely on ptr equality) */
334
gt_array_sort(arr_ref, cmp_range_start);
335
gt_array_sort(arr , cmp_range_start);
337
for (i=0;i<gt_array_size(arr);i++) {
340
if (!gt_feature_node_is_similar(*(GtFeatureNode**) gt_array_get(arr, i),
342
gt_array_get(arr_ref, i))) {
349
gt_mutex_lock(shm->mutex);
351
gt_mutex_unlock(shm->mutex);
354
gt_array_delete(arr);
355
gt_array_delete(arr_ref);
359
/* to be called from implementing class! */
360
int gt_feature_index_unit_test(GtFeatureIndex *fi, GtError *err)
363
GtFeatureIndexTestShared sh;
370
sh.mutex = gt_mutex_new();
371
sh.nodes = gt_array_new(sizeof (GtFeatureNode*));
373
sh.next_node_idx = 0;
378
seqid = gt_str_new_cstr(GT_FI_TEST_SEQID);
379
rn = (GtRegionNode*) gt_region_node_new(seqid, GT_FI_TEST_START,
382
/* test seqid is not supposed to exist */
383
gt_ensure(had_err, !gt_feature_index_has_seqid(sh.fi, GT_FI_TEST_SEQID));
385
/* add a sequence region directly and check if it has been added */
386
gt_feature_index_add_region_node(sh.fi, rn);
387
gt_genome_node_delete((GtGenomeNode*) rn);
388
gt_ensure(had_err, gt_feature_index_has_seqid(sh.fi, GT_FI_TEST_SEQID));
390
gt_feature_index_get_range_for_seqid(sh.fi, &check_range, GT_FI_TEST_SEQID);
391
gt_ensure(had_err, check_range.start == GT_FI_TEST_START
392
&& check_range.end == GT_FI_TEST_END);
394
/* set up nodes to store */
395
for (i=0;i<GT_FI_TEST_FEATURES_PER_THREAD*gt_jobs;i++) {
396
unsigned long start, end;
398
start = random() % (GT_FI_TEST_END - GT_FI_TEST_FEATURE_WIDTH);
399
end = start + random() % (GT_FI_TEST_FEATURE_WIDTH);
400
fn = gt_feature_node_cast(gt_feature_node_new(seqid, "gene", start, end,
402
gt_array_add(sh.nodes, fn);
405
/* test parallel addition */
406
gt_multithread(gt_feature_index_unit_test_add, &sh, err);
407
seqids = gt_feature_index_get_seqids(fi);
408
gt_ensure(had_err, seqids);
409
gt_ensure(had_err, gt_feature_index_has_seqid(fi, GT_FI_TEST_SEQID));
410
gt_ensure(had_err, gt_str_array_size(seqids) == 1);
412
/* test parallel query */
414
gt_multithread(gt_feature_index_unit_test_query, &sh, err);
415
gt_ensure(had_err, sh.error_count == 0);
417
gt_mutex_delete(sh.mutex);
418
gt_str_array_delete(seqids);
419
gt_array_delete(sh.nodes);
420
gt_str_delete(seqid);