10
KHASH_MAP_INIT_INT(32, listelem_t)
12
#define BLOCK_SIZE 65536
24
static int fill_buf(samfile_t *in, buffer_t *buf)
26
int i, ret, last_tid, min_rpos = 0x7fffffff, capacity;
27
bam1_t *b = bam_init1();
28
bam1_core_t *c = &b->core;
29
// squeeze out the empty cells at the beginning
30
for (i = 0; i < buf->n; ++i)
31
if (buf->buf[i].b) break;
32
if (i < buf->n) { // squeeze
34
memmove(buf->buf, buf->buf + i, sizeof(elem_t) * (buf->n - i));
39
for (i = 0; i < buf->n; ++i) {
40
elem_t *e = buf->buf + i;
41
if (e->b && e->rpos >= 0 && e->rpos < min_rpos)
42
min_rpos = buf->buf[i].rpos;
46
last_tid = buf->n? buf->buf[0].b->core.tid : -1;
47
capacity = buf->n + BLOCK_SIZE;
48
while ((ret = samread(in, b)) >= 0) {
50
uint8_t *qual = bam1_qual(b);
52
if (last_tid < 0) last_tid = c->tid;
53
if (c->tid != last_tid) {
54
if (buf->x < 0) buf->x = buf->n;
56
if (buf->n >= buf->max) { // enlarge
57
buf->max = buf->max? buf->max<<1 : 8;
58
buf->buf = (elem_t*)realloc(buf->buf, sizeof(elem_t) * buf->max);
60
e = &buf->buf[buf->n++];
62
e->rpos = -1; e->score = 0;
63
for (i = 0; i < c->l_qseq; ++i) e->score += qual[i] + 1;
64
e->score = (double)e->score / sqrt(c->l_qseq + 1);
65
is_mapped = (c->tid < 0 || c->tid >= in->header->n_targets || (c->flag&BAM_FUNMAP))? 0 : 1;
66
if (!is_mapped) e->score = -1;
67
if (is_mapped && (c->flag & BAM_FREVERSE)) {
68
e->rpos = b->core.pos + bam_calend(&b->core, bam1_cigar(b));
69
if (min_rpos > e->rpos) min_rpos = e->rpos;
71
if (buf->n >= capacity) {
72
if (is_mapped && c->pos <= min_rpos) capacity += BLOCK_SIZE;
76
if (ret >= 0 && buf->x < 0) buf->x = buf->n;
6
#define QUEUE_CLEAR_SIZE 0x100000
7
#define MAX_POS 0x7fffffff
11
uint32_t score:31, discarded:1;
14
#define __free_elem(p) bam_destroy1((p)->data.b)
15
KLIST_INIT(q, elem_t, __free_elem)
16
typedef klist_t(q) queue_t;
18
KHASH_MAP_INIT_INT(best, elem_p)
19
typedef khash_t(best) besthash_t;
22
uint64_t n_checked, n_removed;
23
besthash_t *left, *rght;
25
KHASH_MAP_INIT_STR(lib, lib_aux_t)
27
static lib_aux_t *get_aux(khash_t(lib) *aux, const char *lib)
29
khint_t k = kh_get(lib, aux, lib);
30
if (k == kh_end(aux)) {
32
char *p = strdup(lib);
34
k = kh_put(lib, aux, p, &ret);
36
q->left = kh_init(best);
37
q->rght = kh_init(best);
38
q->n_checked = q->n_removed = 0;
40
} else return &kh_val(aux, k);
43
static inline int sum_qual(const bam1_t *b)
46
uint8_t *qual = bam1_qual(b);
47
for (i = q = 0; i < b->core.l_qseq; ++i) q += qual[i];
51
static inline elem_t *push_queue(queue_t *queue, const bam1_t *b, int endpos, int score)
53
elem_t *p = kl_pushp(q, queue);
55
p->endpos = endpos; p->score = score;
56
if (p->b == 0) p->b = bam_init1();
61
static void clear_besthash(besthash_t *h, int32_t pos)
64
for (k = kh_begin(h); k != kh_end(h); ++k)
65
if (kh_exist(h, k) && kh_val(h, k)->endpos <= pos)
69
static void dump_alignment(samfile_t *out, queue_t *queue, int32_t pos, khash_t(lib) *h)
71
if (queue->size > QUEUE_CLEAR_SIZE || pos == MAX_POS) {
75
if (queue->head == queue->tail) break;
76
q = &kl_val(queue->head);
79
kl_shift(q, queue, 0);
82
if ((q->b->core.flag&BAM_FREVERSE) && q->endpos > pos) break;
85
kl_shift(q, queue, 0);
87
for (k = kh_begin(h); k != kh_end(h); ++k) {
89
clear_besthash(kh_val(h, k).left, pos);
90
clear_besthash(kh_val(h, k).rght, pos);
96
void bam_rmdupse_core(samfile_t *in, samfile_t *out, int force_se)
107
while (samread(in, b) >= 0) {
108
bam1_core_t *c = &b->core;
109
int endpos = bam_calend(c, bam1_cigar(b));
110
int score = sum_qual(b);
112
if (last_tid != c->tid) {
113
if (last_tid >= 0) dump_alignment(out, queue, MAX_POS, aux);
115
} else dump_alignment(out, queue, c->pos, aux);
116
if ((c->flag&BAM_FUNMAP) || ((c->flag&BAM_FPAIRED) && !force_se)) {
117
push_queue(queue, b, endpos, score);
124
lib = bam_get_library(in->header, b);
125
q = lib? get_aux(aux, lib) : get_aux(aux, "\t");
127
h = (c->flag&BAM_FREVERSE)? q->rght : q->left;
128
key = (c->flag&BAM_FREVERSE)? endpos : c->pos;
129
k = kh_put(best, h, key, &ret);
130
if (ret == 0) { // in the hash table
131
elem_t *p = kh_val(h, k);
133
if (p->score < score) {
134
if (c->flag&BAM_FREVERSE) { // mark "discarded" and push the queue
136
kh_val(h, k) = push_queue(queue, b, endpos, score);
138
p->score = score; p->endpos = endpos;
141
} // otherwise, discard the alignment
142
} else kh_val(h, k) = push_queue(queue, b, endpos, score);
145
dump_alignment(out, queue, MAX_POS, aux);
147
for (k = kh_begin(aux); k != kh_end(aux); ++k) {
148
if (kh_exist(aux, k)) {
149
lib_aux_t *q = &kh_val(aux, k);
150
fprintf(stderr, "[bam_rmdupse_core] %lld / %lld = %.4lf in library '%s'\n", (long long)q->n_removed,
151
(long long)q->n_checked, (double)q->n_removed/q->n_checked, kh_key(aux, k));
152
kh_destroy(best, q->left); kh_destroy(best, q->rght);
153
free((char*)kh_key(aux, k));
156
kh_destroy(lib, aux);
81
static void rmdupse_buf(buffer_t *buf)
89
mpos = (buf->x == buf->n)? buf->buf[buf->x-1].b->core.pos : 0x7fffffff;
90
upper = (buf->x < 0)? buf->n : buf->x;
91
// fill the hash table
93
for (i = 0; i < upper; ++i) {
94
elem_t *e = buf->buf + i;
96
if (e->score < 0) continue;
98
if (e->rpos <= mpos) key = (uint32_t)e->rpos<<1 | 1;
101
if (e->b->core.pos < mpos) key = (uint32_t)e->b->core.pos<<1;
104
k = kh_put(32, h, key, &ret);
106
if (ret == 0) { // present in the hash table
109
p->a = (int*)realloc(p->a, p->m * sizeof(int));
114
p->a = (int*)calloc(p->m, sizeof(int));
119
for (k = kh_begin(h); k < kh_end(h); ++k) {
120
if (kh_exist(h, k)) {
124
for (i = max = 0, maxi = -1; i < p->n; ++i) {
125
if (buf->buf[p->a[i]].score > max) {
126
max = buf->buf[p->a[i]].score;
131
for (i = 0; i < p->n; ++i) {
132
buf->buf[p->a[i]].score = -1;
134
bam_destroy1(buf->buf[p->a[i]].b);
135
buf->buf[p->a[i]].b = 0;
145
static void dump_buf(buffer_t *buf, samfile_t *out)
148
for (i = 0; i < buf->n; ++i) {
149
elem_t *e = buf->buf + i;
150
if (e->score != -1) break;
159
int bam_rmdupse(int argc, char *argv[])
164
fprintf(stderr, "Usage: samtools rmdupse <in.bam> <out.bam>\n\n");
165
fprintf(stderr, "Note: Picard is recommended for this task.\n");
168
buf = calloc(1, sizeof(buffer_t));
169
in = samopen(argv[1], "rb", 0);
170
out = samopen(argv[2], "wb", in->header);
171
while (fill_buf(in, buf)) {
175
samclose(in); samclose(out);
176
free(buf->buf); free(buf);
158
kl_destroy(q, queue);