~ubuntu-branches/ubuntu/raring/samtools/raring

« back to all changes in this revision

Viewing changes to bam_rmdupse.c

  • Committer: Bazaar Package Importer
  • Author(s): Charles Plessy
  • Date: 2009-11-17 21:38:24 UTC
  • Revision ID: james.westby@ubuntu.com-20091117213824-dfouynpy3r7ismpj
Tags: 0.1.7a~dfsg-1
* New upstream release: new script sam2vcf.pl, and many other changes.
* Package converted to the format ‘3.0 (quilt)’ (debian/source/format).
* Wrote a manual page for razip (debian/razip.1).
* Better clean the example directory to make the source package
  buildable twice in a row (debian/rules).

Show diffs side-by-side

added added

removed removed

Lines of Context:
1
1
#include <math.h>
2
2
#include "sam.h"
3
3
#include "khash.h"
4
 
 
5
 
typedef struct {
6
 
        int n, m;
7
 
        int *a;
8
 
} listelem_t;
9
 
 
10
 
KHASH_MAP_INIT_INT(32, listelem_t)
11
 
 
12
 
#define BLOCK_SIZE 65536
13
 
 
14
 
typedef struct {
15
 
        bam1_t *b;
16
 
        int rpos, score;
17
 
} elem_t;
18
 
 
19
 
typedef struct {
20
 
        int n, max, x;
21
 
        elem_t *buf;
22
 
} buffer_t;
23
 
 
24
 
static int fill_buf(samfile_t *in, buffer_t *buf)
25
 
{
26
 
        int i, ret, last_tid, min_rpos = 0x7fffffff, capacity;
27
 
        bam1_t *b = bam_init1();
28
 
        bam1_core_t *c = &b->core;
29
 
        // squeeze out the empty cells at the beginning
30
 
        for (i = 0; i < buf->n; ++i)
31
 
                if (buf->buf[i].b) break;
32
 
        if (i < buf->n) { // squeeze
33
 
                if (i > 0) {
34
 
                        memmove(buf->buf, buf->buf + i, sizeof(elem_t) * (buf->n - i));
35
 
                        buf->n = buf->n - i;
36
 
                }
37
 
        } else buf->n = 0;
38
 
        // calculate min_rpos
39
 
        for (i = 0; i < buf->n; ++i) {
40
 
                elem_t *e = buf->buf + i;
41
 
                if (e->b && e->rpos >= 0 && e->rpos < min_rpos)
42
 
                        min_rpos = buf->buf[i].rpos;
43
 
        }
44
 
        // fill the buffer
45
 
        buf->x = -1;
46
 
        last_tid = buf->n? buf->buf[0].b->core.tid : -1;
47
 
        capacity = buf->n + BLOCK_SIZE;
48
 
        while ((ret = samread(in, b)) >= 0) {
49
 
                elem_t *e;
50
 
                uint8_t *qual = bam1_qual(b);
51
 
                int is_mapped;
52
 
                if (last_tid < 0) last_tid = c->tid;
53
 
                if (c->tid != last_tid) {
54
 
                        if (buf->x < 0) buf->x = buf->n;
55
 
                }
56
 
                if (buf->n >= buf->max) { // enlarge
57
 
                        buf->max = buf->max? buf->max<<1 : 8;
58
 
                        buf->buf = (elem_t*)realloc(buf->buf, sizeof(elem_t) * buf->max);
59
 
                }
60
 
                e = &buf->buf[buf->n++];
61
 
                e->b = bam_dup1(b);
62
 
                e->rpos = -1; e->score = 0;
63
 
                for (i = 0; i < c->l_qseq; ++i) e->score += qual[i] + 1;
64
 
                e->score = (double)e->score / sqrt(c->l_qseq + 1);
65
 
                is_mapped = (c->tid < 0 || c->tid >= in->header->n_targets || (c->flag&BAM_FUNMAP))? 0 : 1;
66
 
                if (!is_mapped) e->score = -1;
67
 
                if (is_mapped && (c->flag & BAM_FREVERSE)) {
68
 
                        e->rpos = b->core.pos + bam_calend(&b->core, bam1_cigar(b));
69
 
                        if (min_rpos > e->rpos) min_rpos = e->rpos;
70
 
                }
71
 
                if (buf->n >= capacity) {
72
 
                        if (is_mapped && c->pos <= min_rpos) capacity += BLOCK_SIZE;
73
 
                        else break;
74
 
                }
75
 
        }
76
 
        if (ret >= 0 && buf->x < 0) buf->x = buf->n;
 
4
#include "klist.h"
 
5
 
 
6
#define QUEUE_CLEAR_SIZE 0x100000
 
7
#define MAX_POS 0x7fffffff
 
8
 
 
9
typedef struct {
 
10
        int endpos;
 
11
        uint32_t score:31, discarded:1;
 
12
        bam1_t *b;
 
13
} elem_t, *elem_p;
 
14
#define __free_elem(p) bam_destroy1((p)->data.b)
 
15
KLIST_INIT(q, elem_t, __free_elem)
 
16
typedef klist_t(q) queue_t;
 
17
 
 
18
KHASH_MAP_INIT_INT(best, elem_p)
 
19
typedef khash_t(best) besthash_t;
 
20
 
 
21
typedef struct {
 
22
        uint64_t n_checked, n_removed;
 
23
        besthash_t *left, *rght;
 
24
} lib_aux_t;
 
25
KHASH_MAP_INIT_STR(lib, lib_aux_t)
 
26
 
 
27
static lib_aux_t *get_aux(khash_t(lib) *aux, const char *lib)
 
28
{
 
29
        khint_t k = kh_get(lib, aux, lib);
 
30
        if (k == kh_end(aux)) {
 
31
                int ret;
 
32
                char *p = strdup(lib);
 
33
                lib_aux_t *q;
 
34
                k = kh_put(lib, aux, p, &ret);
 
35
                q = &kh_val(aux, k);
 
36
                q->left = kh_init(best);
 
37
                q->rght = kh_init(best);
 
38
                q->n_checked = q->n_removed = 0;
 
39
                return q;
 
40
        } else return &kh_val(aux, k);
 
41
}
 
42
 
 
43
static inline int sum_qual(const bam1_t *b)
 
44
{
 
45
        int i, q;
 
46
        uint8_t *qual = bam1_qual(b);
 
47
        for (i = q = 0; i < b->core.l_qseq; ++i) q += qual[i];
 
48
        return q;
 
49
}
 
50
 
 
51
static inline elem_t *push_queue(queue_t *queue, const bam1_t *b, int endpos, int score)
 
52
{
 
53
        elem_t *p = kl_pushp(q, queue);
 
54
        p->discarded = 0;
 
55
        p->endpos = endpos; p->score = score;
 
56
        if (p->b == 0) p->b = bam_init1();
 
57
        bam_copy1(p->b, b);
 
58
        return p;
 
59
}
 
60
 
 
61
static void clear_besthash(besthash_t *h, int32_t pos)
 
62
{
 
63
        khint_t k;
 
64
        for (k = kh_begin(h); k != kh_end(h); ++k)
 
65
                if (kh_exist(h, k) && kh_val(h, k)->endpos <= pos)
 
66
                        kh_del(best, h, k);
 
67
}
 
68
 
 
69
static void dump_alignment(samfile_t *out, queue_t *queue, int32_t pos, khash_t(lib) *h)
 
70
{
 
71
        if (queue->size > QUEUE_CLEAR_SIZE || pos == MAX_POS) {
 
72
                khint_t k;
 
73
                while (1) {
 
74
                        elem_t *q;
 
75
                        if (queue->head == queue->tail) break;
 
76
                        q = &kl_val(queue->head);
 
77
                        if (q->discarded) {
 
78
                                q->b->data_len = 0;
 
79
                                kl_shift(q, queue, 0);
 
80
                                continue;
 
81
                        }
 
82
                        if ((q->b->core.flag&BAM_FREVERSE) && q->endpos > pos) break;
 
83
                        samwrite(out, q->b);
 
84
                        q->b->data_len = 0;
 
85
                        kl_shift(q, queue, 0);
 
86
                }
 
87
                for (k = kh_begin(h); k != kh_end(h); ++k) {
 
88
                        if (kh_exist(h, k)) {
 
89
                                clear_besthash(kh_val(h, k).left, pos);
 
90
                                clear_besthash(kh_val(h, k).rght, pos);
 
91
                        }
 
92
                }
 
93
        }
 
94
}
 
95
 
 
96
void bam_rmdupse_core(samfile_t *in, samfile_t *out, int force_se)
 
97
{
 
98
        bam1_t *b;
 
99
        queue_t *queue;
 
100
        khint_t k;
 
101
        int last_tid = -2;
 
102
        khash_t(lib) *aux;
 
103
 
 
104
        aux = kh_init(lib);
 
105
        b = bam_init1();
 
106
        queue = kl_init(q);
 
107
        while (samread(in, b) >= 0) {
 
108
                bam1_core_t *c = &b->core;
 
109
                int endpos = bam_calend(c, bam1_cigar(b));
 
110
                int score = sum_qual(b);
 
111
                
 
112
                if (last_tid != c->tid) {
 
113
                        if (last_tid >= 0) dump_alignment(out, queue, MAX_POS, aux);
 
114
                        last_tid = c->tid;
 
115
                } else dump_alignment(out, queue, c->pos, aux);
 
116
                if ((c->flag&BAM_FUNMAP) || ((c->flag&BAM_FPAIRED) && !force_se)) {
 
117
                        push_queue(queue, b, endpos, score);
 
118
                } else {
 
119
                        const char *lib;
 
120
                        lib_aux_t *q;
 
121
                        besthash_t *h;
 
122
                        uint32_t key;
 
123
                        int ret;
 
124
                        lib = bam_get_library(in->header, b);
 
125
                        q = lib? get_aux(aux, lib) : get_aux(aux, "\t");
 
126
                        ++q->n_checked;
 
127
                        h = (c->flag&BAM_FREVERSE)? q->rght : q->left;
 
128
                        key = (c->flag&BAM_FREVERSE)? endpos : c->pos;
 
129
                        k = kh_put(best, h, key, &ret);
 
130
                        if (ret == 0) { // in the hash table
 
131
                                elem_t *p = kh_val(h, k);
 
132
                                ++q->n_removed;
 
133
                                if (p->score < score) {
 
134
                                        if (c->flag&BAM_FREVERSE) { // mark "discarded" and push the queue
 
135
                                                p->discarded = 1;
 
136
                                                kh_val(h, k) = push_queue(queue, b, endpos, score);
 
137
                                        } else { // replace
 
138
                                                p->score = score; p->endpos = endpos;
 
139
                                                bam_copy1(p->b, b);
 
140
                                        }
 
141
                                } // otherwise, discard the alignment
 
142
                        } else kh_val(h, k) = push_queue(queue, b, endpos, score);
 
143
                }
 
144
        }
 
145
        dump_alignment(out, queue, MAX_POS, aux);
 
146
 
 
147
        for (k = kh_begin(aux); k != kh_end(aux); ++k) {
 
148
                if (kh_exist(aux, k)) {
 
149
                        lib_aux_t *q = &kh_val(aux, k);
 
150
                        fprintf(stderr, "[bam_rmdupse_core] %lld / %lld = %.4lf in library '%s'\n", (long long)q->n_removed,
 
151
                                        (long long)q->n_checked, (double)q->n_removed/q->n_checked, kh_key(aux, k));
 
152
                        kh_destroy(best, q->left); kh_destroy(best, q->rght);
 
153
                        free((char*)kh_key(aux, k));
 
154
                }
 
155
        }
 
156
        kh_destroy(lib, aux);
77
157
        bam_destroy1(b);
78
 
        return buf->n;
79
 
}
80
 
 
81
 
static void rmdupse_buf(buffer_t *buf)
82
 
{
83
 
        khash_t(32) *h;
84
 
        uint32_t key;
85
 
        khint_t k;
86
 
        int mpos, i, upper;
87
 
        listelem_t *p;
88
 
        mpos = 0x7fffffff;
89
 
        mpos = (buf->x == buf->n)? buf->buf[buf->x-1].b->core.pos : 0x7fffffff;
90
 
        upper = (buf->x < 0)? buf->n : buf->x;
91
 
        // fill the hash table
92
 
        h = kh_init(32);
93
 
        for (i = 0; i < upper; ++i) {
94
 
                elem_t *e = buf->buf + i;
95
 
                int ret;
96
 
                if (e->score < 0) continue;
97
 
                if (e->rpos >= 0) {
98
 
                        if (e->rpos <= mpos) key = (uint32_t)e->rpos<<1 | 1;
99
 
                        else continue;
100
 
                } else {
101
 
                        if (e->b->core.pos < mpos) key = (uint32_t)e->b->core.pos<<1;
102
 
                        else continue;
103
 
                }
104
 
                k = kh_put(32, h, key, &ret);
105
 
                p = &kh_val(h, k);
106
 
                if (ret == 0) { // present in the hash table
107
 
                        if (p->n == p->m) {
108
 
                                p->m <<= 1;
109
 
                                p->a = (int*)realloc(p->a, p->m * sizeof(int));
110
 
                        }
111
 
                        p->a[p->n++] = i;
112
 
                } else {
113
 
                        p->m = p->n = 1;
114
 
                        p->a = (int*)calloc(p->m, sizeof(int));
115
 
                        p->a[0] = i;
116
 
                }
117
 
        }
118
 
        // rmdup
119
 
        for (k = kh_begin(h); k < kh_end(h); ++k) {
120
 
                if (kh_exist(h, k)) {
121
 
                        int max, maxi;
122
 
                        p = &kh_val(h, k);
123
 
                        // get the max
124
 
                        for (i = max = 0, maxi = -1; i < p->n; ++i) {
125
 
                                if (buf->buf[p->a[i]].score > max) {
126
 
                                        max = buf->buf[p->a[i]].score;
127
 
                                        maxi = i;
128
 
                                }
129
 
                        }
130
 
                        // mark the elements
131
 
                        for (i = 0; i < p->n; ++i) {
132
 
                                buf->buf[p->a[i]].score = -1;
133
 
                                if (i != maxi) {
134
 
                                        bam_destroy1(buf->buf[p->a[i]].b);
135
 
                                        buf->buf[p->a[i]].b = 0;
136
 
                                }
137
 
                        }
138
 
                        // free
139
 
                        free(p->a);
140
 
                }
141
 
        }
142
 
        kh_destroy(32, h);
143
 
}
144
 
 
145
 
static void dump_buf(buffer_t *buf, samfile_t *out)
146
 
{
147
 
        int i;
148
 
        for (i = 0; i < buf->n; ++i) {
149
 
                elem_t *e = buf->buf + i;
150
 
                if (e->score != -1) break;
151
 
                if (e->b) {
152
 
                        samwrite(out, e->b);
153
 
                        bam_destroy1(e->b);
154
 
                        e->b = 0;
155
 
                }
156
 
        }
157
 
}
158
 
 
159
 
int bam_rmdupse(int argc, char *argv[])
160
 
{
161
 
        samfile_t *in, *out;
162
 
        buffer_t *buf;
163
 
        if (argc < 3) {
164
 
                fprintf(stderr, "Usage: samtools rmdupse <in.bam> <out.bam>\n\n");
165
 
                fprintf(stderr, "Note: Picard is recommended for this task.\n");
166
 
                return 1;
167
 
        }
168
 
        buf = calloc(1, sizeof(buffer_t));
169
 
        in = samopen(argv[1], "rb", 0);
170
 
        out = samopen(argv[2], "wb", in->header);
171
 
        while (fill_buf(in, buf)) {
172
 
                rmdupse_buf(buf);
173
 
                dump_buf(buf, out);
174
 
        }
175
 
        samclose(in); samclose(out);
176
 
        free(buf->buf); free(buf);
177
 
        return 0;
 
158
        kl_destroy(q, queue);
178
159
}