~ubuntu-branches/ubuntu/quantal/genometools/quantal-backports

« back to all changes in this revision

Viewing changes to src/annotationsketch/feature_index.c

  • Committer: Package Import Robot
  • Author(s): Sascha Steinbiss
  • Date: 2012-07-09 14:10:23 UTC
  • Revision ID: package-import@ubuntu.com-20120709141023-juuu4spm6chqsf9o
Tags: upstream-1.4.1
ImportĀ upstreamĀ versionĀ 1.4.1

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
/*
 
2
  Copyright (c) 2008-2010 Sascha Steinbiss <steinbiss@zbh.uni-hamburg.de>
 
3
  Copyright (c) 2008-2010 Center for Bioinformatics, University of Hamburg
 
4
 
 
5
  Permission to use, copy, modify, and distribute this software for any
 
6
  purpose with or without fee is hereby granted, provided that the above
 
7
  copyright notice and this permission notice appear in all copies.
 
8
 
 
9
  THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 
10
  WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 
11
  MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 
12
  ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 
13
  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 
14
  ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 
15
  OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 
16
*/
 
17
 
 
18
#include <string.h>
 
19
#include "core/assert_api.h"
 
20
#include "core/class_alloc.h"
 
21
#include "core/array.h"
 
22
#include "core/ensure.h"
 
23
#include "core/ma.h"
 
24
#include "core/thread.h"
 
25
#include "core/unused_api.h"
 
26
#include "annotationsketch/feature_index_rep.h"
 
27
#include "annotationsketch/feature_visitor.h"
 
28
#include "extended/feature_node.h"
 
29
#ifdef GT_THREADS_ENABLED
 
30
#include "extended/genome_node.h"
 
31
#endif
 
32
#include "extended/genome_node.h"
 
33
#include "extended/gff3_in_stream.h"
 
34
 
 
35
struct GtFeatureIndexClass {
 
36
  size_t size;
 
37
  GtFeatureIndexAddRegionNodeFunc add_region_node;
 
38
  GtFeatureIndexAddFeatureNodeFunc add_feature_node;
 
39
  GtFeatureIndexGetFeatsForSeqidFunc get_features_for_seqid;
 
40
  GtFeatureIndexGetFeatsForRangeFunc get_features_for_range;
 
41
  GtFeatureIndexGetFirstSeqidFunc get_first_seqid;
 
42
  GtFeatureIndexGetSeqidsFunc get_seqids;
 
43
  GtFeatureIndexGetRangeForSeqidFunc get_range_for_seqid;
 
44
  GtFeatureIndexHasSeqidFunc has_seqid;
 
45
  GtFeatureIndexFreeFunc free;
 
46
};
 
47
 
 
48
struct GtFeatureIndexMembers {
 
49
  unsigned int reference_count;
 
50
  GtRWLock *lock;
 
51
};
 
52
 
 
53
const GtFeatureIndexClass* gt_feature_index_class_new(size_t size,
 
54
                                         GtFeatureIndexAddRegionNodeFunc
 
55
                                                 add_region_node,
 
56
                                         GtFeatureIndexAddFeatureNodeFunc
 
57
                                                 add_feature_node,
 
58
                                         GtFeatureIndexGetFeatsForSeqidFunc
 
59
                                                 get_features_for_seqid,
 
60
                                         GtFeatureIndexGetFeatsForRangeFunc
 
61
                                                 get_features_for_range,
 
62
                                         GtFeatureIndexGetFirstSeqidFunc
 
63
                                                 get_first_seqid,
 
64
                                         GtFeatureIndexGetSeqidsFunc
 
65
                                                 get_seqids,
 
66
                                         GtFeatureIndexGetRangeForSeqidFunc
 
67
                                                 get_range_for_seqid,
 
68
                                         GtFeatureIndexHasSeqidFunc
 
69
                                                 has_seqid,
 
70
                                         GtFeatureIndexFreeFunc
 
71
                                                 free)
 
72
{
 
73
  GtFeatureIndexClass *c_class = gt_class_alloc(sizeof *c_class);
 
74
  c_class->size = size;
 
75
  c_class->add_region_node = add_region_node;
 
76
  c_class->add_feature_node = add_feature_node;
 
77
  c_class->get_features_for_seqid = get_features_for_seqid;
 
78
  c_class->get_features_for_range = get_features_for_range;
 
79
  c_class->get_first_seqid = get_first_seqid;
 
80
  c_class->get_seqids = get_seqids;
 
81
  c_class->get_range_for_seqid = get_range_for_seqid;
 
82
  c_class->has_seqid = has_seqid;
 
83
  c_class->free = free;
 
84
  return c_class;
 
85
}
 
86
 
 
87
GtFeatureIndex* gt_feature_index_create(const GtFeatureIndexClass *fic)
 
88
{
 
89
  GtFeatureIndex *fi;
 
90
  gt_assert(fic && fic->size);
 
91
  fi = gt_calloc(1, fic->size);
 
92
  fi->c_class = fic;
 
93
  fi->pvt = gt_calloc(1, sizeof (GtFeatureIndexMembers));
 
94
  fi->pvt->lock = gt_rwlock_new();
 
95
  return fi;
 
96
}
 
97
 
 
98
GtFeatureIndex* gt_feature_index_ref(GtFeatureIndex *fi)
 
99
{
 
100
  gt_assert(fi);
 
101
  gt_rwlock_wrlock(fi->pvt->lock);
 
102
  fi->pvt->reference_count++;
 
103
  gt_rwlock_unlock(fi->pvt->lock);
 
104
  return fi;
 
105
}
 
106
 
 
107
void gt_feature_index_delete(GtFeatureIndex *fi)
 
108
{
 
109
  if (!fi) return;
 
110
  gt_rwlock_wrlock(fi->pvt->lock);
 
111
  if (fi->pvt->reference_count) {
 
112
    fi->pvt->reference_count--;
 
113
    gt_rwlock_unlock(fi->pvt->lock);
 
114
    return;
 
115
  }
 
116
  gt_assert(fi->c_class);
 
117
  if (fi->c_class->free)
 
118
    fi->c_class->free(fi);
 
119
  gt_rwlock_unlock(fi->pvt->lock);
 
120
  gt_rwlock_delete(fi->pvt->lock);
 
121
  gt_free(fi->pvt);
 
122
  gt_free(fi);
 
123
}
 
124
 
 
125
void gt_feature_index_add_region_node(GtFeatureIndex *feature_index,
 
126
                                      GtRegionNode *region_node)
 
127
{
 
128
  gt_assert(feature_index && feature_index->c_class && region_node);
 
129
  gt_rwlock_wrlock(feature_index->pvt->lock);
 
130
  feature_index->c_class->add_region_node(feature_index, region_node);
 
131
  gt_rwlock_unlock(feature_index->pvt->lock);
 
132
}
 
133
 
 
134
void gt_feature_index_add_feature_node(GtFeatureIndex *feature_index,
 
135
                                       GtFeatureNode *feature_node)
 
136
{
 
137
  gt_assert(feature_index && feature_index->c_class && feature_node);
 
138
  gt_rwlock_wrlock(feature_index->pvt->lock);
 
139
  feature_index->c_class->add_feature_node(feature_index, feature_node);
 
140
  gt_rwlock_unlock(feature_index->pvt->lock);
 
141
}
 
142
 
 
143
int gt_feature_index_add_gff3file(GtFeatureIndex *feature_index,
 
144
                                  const char *gff3file, GtError *err)
 
145
{
 
146
  GtNodeStream *gff3_in_stream;
 
147
  GtGenomeNode *gn;
 
148
  GtArray *tmp;
 
149
  int had_err = 0;
 
150
  unsigned long i;
 
151
  gt_error_check(err);
 
152
  gt_assert(feature_index && gff3file);
 
153
  tmp = gt_array_new(sizeof (GtGenomeNode*));
 
154
  gff3_in_stream = gt_gff3_in_stream_new_unsorted(1, &gff3file);
 
155
  while (!(had_err = gt_node_stream_next(gff3_in_stream, &gn, err)) && gn)
 
156
    gt_array_add(tmp, gn);
 
157
  if (!had_err) {
 
158
    GtNodeVisitor *feature_visitor = gt_feature_visitor_new(feature_index);
 
159
    for (i=0;i<gt_array_size(tmp);i++) {
 
160
      gn = *(GtGenomeNode**) gt_array_get(tmp, i);
 
161
      /* no need to lock, add_*_node() is synchronized */
 
162
      had_err = gt_genome_node_accept(gn, feature_visitor, NULL);
 
163
      gt_assert(!had_err); /* cannot happen */
 
164
    }
 
165
    gt_node_visitor_delete(feature_visitor);
 
166
  }
 
167
  gt_node_stream_delete(gff3_in_stream);
 
168
  for (i=0;i<gt_array_size(tmp);i++)
 
169
    gt_genome_node_delete(*(GtGenomeNode**) gt_array_get(tmp, i));
 
170
  gt_array_delete(tmp);
 
171
  return had_err;
 
172
}
 
173
 
 
174
GtArray* gt_feature_index_get_features_for_seqid(GtFeatureIndex *fi,
 
175
                                                 const char *seqid)
 
176
{
 
177
  GtArray *arr;
 
178
  gt_assert(fi && fi->c_class && seqid);
 
179
  gt_rwlock_rdlock(fi->pvt->lock);
 
180
  arr = fi->c_class->get_features_for_seqid(fi, seqid);
 
181
  gt_rwlock_unlock(fi->pvt->lock);
 
182
  return arr;
 
183
}
 
184
 
 
185
int gt_feature_index_get_features_for_range(GtFeatureIndex *feature_index,
 
186
                                            GtArray *results,
 
187
                                            const char *seqid,
 
188
                                            const GtRange *range, GtError *err)
 
189
{
 
190
  int ret;
 
191
  gt_assert(feature_index && feature_index->c_class && results && seqid &&
 
192
            range);
 
193
  gt_assert(gt_range_length(range) > 0);
 
194
  gt_rwlock_rdlock(feature_index->pvt->lock);
 
195
  ret = feature_index->c_class->get_features_for_range(feature_index, results,
 
196
                                                       seqid, range, err);
 
197
  gt_rwlock_unlock(feature_index->pvt->lock);
 
198
  return ret;
 
199
}
 
200
 
 
201
const char* gt_feature_index_get_first_seqid(const GtFeatureIndex
 
202
                                              *feature_index)
 
203
{
 
204
  const char *str;
 
205
  gt_assert(feature_index && feature_index->c_class);
 
206
  gt_rwlock_rdlock(feature_index->pvt->lock);
 
207
  str = feature_index->c_class->get_first_seqid(feature_index);
 
208
  gt_rwlock_unlock(feature_index->pvt->lock);
 
209
  return str;
 
210
}
 
211
 
 
212
GtStrArray* gt_feature_index_get_seqids(const GtFeatureIndex *feature_index)
 
213
{
 
214
  GtStrArray *strarr;
 
215
  gt_assert(feature_index && feature_index->c_class);
 
216
  gt_rwlock_rdlock(feature_index->pvt->lock);
 
217
  strarr = feature_index->c_class->get_seqids(feature_index);
 
218
  gt_rwlock_unlock(feature_index->pvt->lock);
 
219
  return strarr;
 
220
}
 
221
 
 
222
void gt_feature_index_get_range_for_seqid(GtFeatureIndex *feature_index,
 
223
                                          GtRange *range, const char *seqid)
 
224
{
 
225
  gt_assert(feature_index && feature_index->c_class && range && seqid);
 
226
  gt_rwlock_rdlock(feature_index->pvt->lock);
 
227
  feature_index->c_class->get_range_for_seqid(feature_index, range, seqid);
 
228
  gt_rwlock_unlock(feature_index->pvt->lock);
 
229
}
 
230
 
 
231
bool gt_feature_index_has_seqid(const GtFeatureIndex *feature_index,
 
232
                                const char *seqid)
 
233
{
 
234
  bool ret;
 
235
  gt_assert(feature_index && feature_index->c_class && seqid);
 
236
  gt_rwlock_rdlock(feature_index->pvt->lock);
 
237
  ret = feature_index->c_class->has_seqid(feature_index, seqid);
 
238
  gt_rwlock_unlock(feature_index->pvt->lock);
 
239
  return ret;
 
240
}
 
241
 
 
242
void* gt_feature_index_cast(GT_UNUSED const GtFeatureIndexClass *fic,
 
243
                            GtFeatureIndex *fi)
 
244
{
 
245
  gt_assert(fic && fi && fi->c_class == fic);
 
246
  return fi;
 
247
}
 
248
 
 
249
#define GT_FI_TEST_FEATURES_PER_THREAD 1000
 
250
#define GT_FI_TEST_START 1
 
251
#define GT_FI_TEST_END 10000000
 
252
#define GT_FI_TEST_FEATURE_WIDTH 2000
 
253
#define GT_FI_TEST_QUERY_WIDTH 50000
 
254
#define GT_FI_TEST_SEQID "testseqid"
 
255
 
 
256
typedef struct {
 
257
  GtFeatureIndex *fi;
 
258
  GtError *err;
 
259
  GtArray *nodes;
 
260
  GtMutex *mutex;
 
261
  unsigned long next_node_idx,
 
262
                error_count;
 
263
} GtFeatureIndexTestShared;
 
264
 
 
265
static void* gt_feature_index_unit_test_add(void *data)
 
266
{
 
267
  GtFeatureNode *feature;
 
268
  GtFeatureIndexTestShared *shm = (GtFeatureIndexTestShared*) data;
 
269
 
 
270
  while (true) {
 
271
    gt_mutex_lock(shm->mutex);
 
272
    if (shm->next_node_idx == GT_FI_TEST_FEATURES_PER_THREAD * gt_jobs) {
 
273
      gt_mutex_unlock(shm->mutex);
 
274
      return NULL;
 
275
    }
 
276
    gt_assert(shm->next_node_idx < gt_array_size(shm->nodes));
 
277
    feature = *(GtFeatureNode**) gt_array_get(shm->nodes, shm->next_node_idx++);
 
278
    gt_mutex_unlock(shm->mutex);
 
279
    gt_feature_index_add_feature_node(shm->fi, feature);
 
280
    gt_genome_node_delete((GtGenomeNode*) feature);
 
281
  }
 
282
  return NULL;
 
283
}
 
284
 
 
285
static int cmp_range_start(const void *v1, const void *v2)
 
286
{
 
287
  return gt_genome_node_compare((GtGenomeNode**) v1, (GtGenomeNode**) v2);
 
288
}
 
289
 
 
290
static void* gt_feature_index_unit_test_query(void *data)
 
291
{
 
292
  GtFeatureIndexTestShared *shm = (GtFeatureIndexTestShared*) data;
 
293
  GtRange rng;
 
294
  GtError *err = shm->err;
 
295
  unsigned long i;
 
296
  int had_err = 0;
 
297
  GtArray *arr, *arr_ref;
 
298
 
 
299
  gt_mutex_lock(shm->mutex);
 
300
  if (gt_error_is_set(shm->err)) {
 
301
    gt_mutex_unlock(shm->mutex);
 
302
    return NULL;
 
303
  }
 
304
  gt_mutex_unlock(shm->mutex);
 
305
 
 
306
  arr = gt_array_new(sizeof (GtFeatureNode*));
 
307
  arr_ref = gt_array_new(sizeof (GtFeatureNode*));
 
308
  rng.start = random() % (GT_FI_TEST_END - GT_FI_TEST_QUERY_WIDTH);
 
309
  rng.end = rng.start + random() % (GT_FI_TEST_QUERY_WIDTH);
 
310
 
 
311
  /* get reference set by linear search */
 
312
  gt_mutex_lock(shm->mutex);
 
313
  for (i=0;i<GT_FI_TEST_FEATURES_PER_THREAD * gt_jobs;i++) {
 
314
    GtRange rng2;
 
315
    GtFeatureNode *fn;
 
316
    fn = *(GtFeatureNode**) gt_array_get(shm->nodes, i);
 
317
    rng2 = gt_genome_node_get_range((GtGenomeNode*) fn);
 
318
    if (gt_range_overlap(&rng, &rng2)) {
 
319
      gt_array_add(arr_ref, fn);
 
320
    }
 
321
  }
 
322
  gt_mutex_unlock(shm->mutex);
 
323
 
 
324
  /* query feature index */
 
325
  gt_feature_index_get_features_for_range(shm->fi, arr, GT_FI_TEST_SEQID, &rng,
 
326
                                          err);
 
327
 
 
328
  /* result size must be equal */
 
329
  if (gt_array_size(arr) != gt_array_size(arr_ref))
 
330
    had_err = -1;
 
331
 
 
332
  /* nodes must be the same (note that we should not rely on ptr equality) */
 
333
  if (!had_err) {
 
334
    gt_array_sort(arr_ref, cmp_range_start);
 
335
    gt_array_sort(arr    , cmp_range_start);
 
336
 
 
337
    for (i=0;i<gt_array_size(arr);i++) {
 
338
      if (had_err)
 
339
        break;
 
340
      if (!gt_feature_node_is_similar(*(GtFeatureNode**) gt_array_get(arr, i),
 
341
                                      *(GtFeatureNode**)
 
342
                                      gt_array_get(arr_ref, i))) {
 
343
        had_err = -1;
 
344
      }
 
345
    }
 
346
  }
 
347
 
 
348
  if (had_err) {
 
349
    gt_mutex_lock(shm->mutex);
 
350
    shm->error_count++;
 
351
    gt_mutex_unlock(shm->mutex);
 
352
  }
 
353
 
 
354
  gt_array_delete(arr);
 
355
  gt_array_delete(arr_ref);
 
356
  return NULL;
 
357
}
 
358
 
 
359
/* to be called from implementing class! */
 
360
int gt_feature_index_unit_test(GtFeatureIndex *fi, GtError *err)
 
361
{
 
362
  int had_err = 0, i;
 
363
  GtFeatureIndexTestShared sh;
 
364
  GtStrArray *seqids;
 
365
  GtStr *seqid;
 
366
  GtRange check_range;
 
367
  GtRegionNode *rn;
 
368
  gt_error_check(err);
 
369
 
 
370
  sh.mutex = gt_mutex_new();
 
371
  sh.nodes = gt_array_new(sizeof (GtFeatureNode*));
 
372
  sh.error_count = 0;
 
373
  sh.next_node_idx = 0;
 
374
  sh.fi = fi;
 
375
  sh.err = err;
 
376
 
 
377
  /* create region */
 
378
  seqid = gt_str_new_cstr(GT_FI_TEST_SEQID);
 
379
  rn = (GtRegionNode*) gt_region_node_new(seqid, GT_FI_TEST_START,
 
380
                                          GT_FI_TEST_END);
 
381
 
 
382
  /* test seqid is not supposed to exist */
 
383
  gt_ensure(had_err, !gt_feature_index_has_seqid(sh.fi, GT_FI_TEST_SEQID));
 
384
 
 
385
  /* add a sequence region directly and check if it has been added */
 
386
  gt_feature_index_add_region_node(sh.fi, rn);
 
387
  gt_genome_node_delete((GtGenomeNode*) rn);
 
388
  gt_ensure(had_err, gt_feature_index_has_seqid(sh.fi, GT_FI_TEST_SEQID));
 
389
 
 
390
  gt_feature_index_get_range_for_seqid(sh.fi, &check_range, GT_FI_TEST_SEQID);
 
391
  gt_ensure(had_err, check_range.start == GT_FI_TEST_START
 
392
                    && check_range.end == GT_FI_TEST_END);
 
393
 
 
394
  /* set up nodes to store */
 
395
  for (i=0;i<GT_FI_TEST_FEATURES_PER_THREAD*gt_jobs;i++) {
 
396
    unsigned long start, end;
 
397
    GtFeatureNode *fn;
 
398
    start = random() % (GT_FI_TEST_END - GT_FI_TEST_FEATURE_WIDTH);
 
399
    end = start + random() % (GT_FI_TEST_FEATURE_WIDTH);
 
400
    fn = gt_feature_node_cast(gt_feature_node_new(seqid, "gene", start, end,
 
401
                                                  GT_STRAND_FORWARD));
 
402
    gt_array_add(sh.nodes, fn);
 
403
  }
 
404
 
 
405
  /* test parallel addition */
 
406
  gt_multithread(gt_feature_index_unit_test_add, &sh, err);
 
407
  seqids = gt_feature_index_get_seqids(fi);
 
408
  gt_ensure(had_err, seqids);
 
409
  gt_ensure(had_err, gt_feature_index_has_seqid(fi, GT_FI_TEST_SEQID));
 
410
  gt_ensure(had_err, gt_str_array_size(seqids) == 1);
 
411
 
 
412
  /* test parallel query */
 
413
  if (!had_err)
 
414
    gt_multithread(gt_feature_index_unit_test_query, &sh, err);
 
415
  gt_ensure(had_err, sh.error_count == 0);
 
416
 
 
417
  gt_mutex_delete(sh.mutex);
 
418
  gt_str_array_delete(seqids);
 
419
  gt_array_delete(sh.nodes);
 
420
  gt_str_delete(seqid);
 
421
  return had_err;
 
422
}