1
#include "GenomicRanges.h"
2
#include "IRanges_interface.h"
4
#include <ctype.h> /* for isdigit() */
6
static char errmsg_buf[200];
8
/* Return the number of chars that was read, or 0 if there is no more char
9
to read (i.e. cig0[offset] is '\0'), or -1 in case of a parse error.
10
Zero-length operations are ignored. */
11
static int get_next_cigar_OP(const char *cig0, int offset,
23
while (isdigit(c = cig0[offset])) {
29
if (!(*OP = cig0[offset])) {
30
snprintf(errmsg_buf, sizeof(errmsg_buf),
31
"unexpected CIGAR end at char %d",
38
return offset - offset0;
41
/* Return the number of chars that was read, or 0 if there is no more char
42
to read (i.e. offset is 0), or -1 in case of a parse error.
43
Zero-length operations are ignored. */
44
static int get_prev_cigar_OP(const char *cig0, int offset,
48
int offset0, opl, powof10;
59
snprintf(errmsg_buf, sizeof(errmsg_buf),
60
"no CIGAR operation length at char %d",
67
while (offset >= 0 && isdigit(c = cig0[offset])) {
68
opl += (c - '0') * powof10;
75
return offset0 - offset;
78
static const char *cigar_string_op_table(SEXP cigar_string, const char *allOPs,
79
int *table_row, int table_nrow)
81
const char *cig0, *tmp;
82
int offset, n, OPL /* Operation Length */;
83
char OP /* Operation */;
85
if (cigar_string == NA_STRING)
86
return "CIGAR string is NA";
87
if (LENGTH(cigar_string) == 0)
88
return "CIGAR string is empty";
89
cig0 = CHAR(cigar_string);
91
while ((n = get_next_cigar_OP(cig0, offset, &OPL, &OP))) {
94
tmp = strchr(allOPs, (int) OP);
96
snprintf(errmsg_buf, sizeof(errmsg_buf),
97
"unknown CIGAR operation '%c' at char %d",
101
*(table_row + (tmp - allOPs) * table_nrow) += OPL;
107
static const char *cigar_string_to_qwidth(SEXP cigar_string, int clip_reads,
111
int offset, n, OPL /* Operation Length */;
112
char OP /* Operation */;
114
if (cigar_string == NA_STRING)
115
return "CIGAR string is NA";
116
if (LENGTH(cigar_string) == 0)
117
return "CIGAR string is empty";
118
cig0 = CHAR(cigar_string);
119
*qwidth = offset = 0;
120
while ((n = get_next_cigar_OP(cig0, offset, &OPL, &OP))) {
124
/* Alignment match (can be a sequence match or mismatch) */
125
case 'M': case '=': case 'X': *qwidth += OPL; break;
126
/* Insertion to the reference */
127
case 'I': *qwidth += OPL; break;
128
/* Deletion (or skipped region) from the reference,
129
or silent deletion from the padded reference */
130
case 'D': case 'N': case 'P': break;
131
/* Soft clip on the read */
132
case 'S': *qwidth += OPL; break;
133
/* Hard clip on the read */
134
case 'H': if (!clip_reads) *qwidth += OPL; break;
136
snprintf(errmsg_buf, sizeof(errmsg_buf),
137
"unknown CIGAR operation '%c' at char %d",
146
static const char *cigar_string_to_width(SEXP cigar_string, int *width)
149
int offset, n, OPL /* Operation Length */;
150
char OP /* Operation */;
152
if (cigar_string == NA_STRING)
153
return "CIGAR string is NA";
154
if (LENGTH(cigar_string) == 0)
155
return "CIGAR string is empty";
156
cig0 = CHAR(cigar_string);
158
while ((n = get_next_cigar_OP(cig0, offset, &OPL, &OP))) {
162
/* Alignment match (can be a sequence match or mismatch) */
163
case 'M': case '=': case 'X': *width += OPL; break;
164
/* Insertion to the reference */
166
/* Deletion (or skipped region) from the reference */
167
case 'D': case 'N': *width += OPL; break;
168
/* Soft/Hard clip on the read */
169
case 'S': case 'H': break;
170
/* Silent deletion from the padded reference */
173
snprintf(errmsg_buf, sizeof(errmsg_buf),
174
"unknown CIGAR operation '%c' at char %d",
183
static const char *Lqnarrow_cigar_string(SEXP cigar_string,
184
int *Lqwidth, int *Loffset, int *rshift)
187
int offset, n, OPL /* Operation Length */;
188
char OP /* Operation */;
190
if (cigar_string == NA_STRING)
191
return "CIGAR string is NA";
192
if (LENGTH(cigar_string) == 0)
193
return "CIGAR string is empty";
194
cig0 = CHAR(cigar_string);
195
*rshift = offset = 0;
196
while ((n = get_next_cigar_OP(cig0, offset, &OPL, &OP))) {
200
/* Alignment match (can be a sequence match or mismatch) */
201
case 'M': case '=': case 'X':
202
if (*Lqwidth < OPL) {
210
/* Insertion to the reference or soft/hard clip on the read */
211
case 'I': case 'S': case 'H':
212
if (*Lqwidth < OPL) {
218
/* Deletion (or skipped region) from the reference */
222
/* Silent deletion from the padded reference */
225
snprintf(errmsg_buf, sizeof(errmsg_buf),
226
"unknown CIGAR operation '%c' at char %d",
232
snprintf(errmsg_buf, sizeof(errmsg_buf),
233
"CIGAR is empty after qnarrowing");
237
static const char *Rqnarrow_cigar_string(SEXP cigar_string,
238
int *Rqwidth, int *Roffset)
241
int offset, n, OPL /* Operation Length */;
242
char OP /* Operation */;
244
if (cigar_string == NA_STRING)
245
return "CIGAR string is NA";
246
if (LENGTH(cigar_string) == 0)
247
return "CIGAR string is empty";
248
cig0 = CHAR(cigar_string);
249
offset = LENGTH(cigar_string);
250
while ((n = get_prev_cigar_OP(cig0, offset, &OPL, &OP))) {
255
/* M, =, X, I, S, H */
256
case 'M': case '=': case 'X': case 'I': case 'S': case 'H':
257
if (*Rqwidth < OPL) {
263
/* Deletion (or skipped region) from the reference,
264
or silent deletion from the padded reference */
265
case 'D': case 'N': case 'P':
268
snprintf(errmsg_buf, sizeof(errmsg_buf),
269
"unknown CIGAR operation '%c' at char %d",
274
snprintf(errmsg_buf, sizeof(errmsg_buf),
275
"CIGAR is empty after qnarrowing");
279
/* FIXME: 'cigar_buf' is under the risk of a buffer overflow! */
280
static const char *qnarrow_cigar_string(SEXP cigar_string,
281
int Lqwidth, int Rqwidth, char *cigar_buf, int *rshift)
283
int Loffset, Roffset, buf_offset;
285
int offset, n, OPL /* Operation Length */;
286
char OP /* Operation */;
289
//Rprintf("qnarrow_cigar_string():\n");
290
errmsg = Lqnarrow_cigar_string(cigar_string, &Lqwidth, &Loffset,
294
//Rprintf(" Lqwidth=%d Loffset=%d *rshift=%d\n",
295
// Lqwidth, Loffset, *rshift);
296
errmsg = Rqnarrow_cigar_string(cigar_string, &Rqwidth, &Roffset);
299
//Rprintf(" Rqwidth=%d Roffset=%d\n", Rqwidth, Roffset);
300
if (Roffset < Loffset) {
301
snprintf(errmsg_buf, sizeof(errmsg_buf),
302
"CIGAR is empty after qnarrowing");
306
cig0 = CHAR(cigar_string);
307
for (offset = Loffset; offset <= Roffset; offset += n) {
308
n = get_next_cigar_OP(cig0, offset, &OPL, &OP);
309
if (offset == Loffset)
311
if (offset == Roffset)
314
snprintf(errmsg_buf, sizeof(errmsg_buf),
315
"CIGAR is empty after qnarrowing");
318
buf_offset += sprintf(cigar_buf + buf_offset,
324
static const char *Lnarrow_cigar_string(SEXP cigar_string,
325
int *Lwidth, int *Loffset, int *rshift)
328
int offset, n, OPL /* Operation Length */;
329
char OP /* Operation */;
331
if (cigar_string == NA_STRING)
332
return "CIGAR string is NA";
333
if (LENGTH(cigar_string) == 0)
334
return "CIGAR string is empty";
335
cig0 = CHAR(cigar_string);
336
*rshift = offset = 0;
337
while ((n = get_next_cigar_OP(cig0, offset, &OPL, &OP))) {
341
/* Alignment match (can be a sequence match or mismatch) */
342
case 'M': case '=': case 'X':
351
/* Insertion to the reference or soft/hard clip on the read */
352
case 'I': case 'S': case 'H':
354
/* Deletion (or skipped region) from the reference */
362
/* Silent deletion from the padded reference */
365
snprintf(errmsg_buf, sizeof(errmsg_buf),
366
"unknown CIGAR operation '%c' at char %d",
372
snprintf(errmsg_buf, sizeof(errmsg_buf),
373
"CIGAR is empty after narrowing");
377
static const char *Rnarrow_cigar_string(SEXP cigar_string,
378
int *Rwidth, int *Roffset)
381
int offset, n, OPL /* Operation Length */;
382
char OP /* Operation */;
384
if (cigar_string == NA_STRING)
385
return "CIGAR string is NA";
386
if (LENGTH(cigar_string) == 0)
387
return "CIGAR string is empty";
388
cig0 = CHAR(cigar_string);
389
offset = LENGTH(cigar_string);
390
while ((n = get_prev_cigar_OP(cig0, offset, &OPL, &OP))) {
395
/* Alignment match (can be a sequence match or mismatch) */
396
case 'M': case '=': case 'X':
403
/* Insertion to the reference or soft/hard clip on the read */
404
case 'I': case 'S': case 'H':
406
/* Deletion (or skipped region) from the reference */
413
/* Silent deletion from the padded reference */
416
snprintf(errmsg_buf, sizeof(errmsg_buf),
417
"unknown CIGAR operation '%c' at char %d",
422
snprintf(errmsg_buf, sizeof(errmsg_buf),
423
"CIGAR is empty after narrowing");
427
/* FIXME: 'cigar_buf' is under the risk of a buffer overflow! */
428
static const char *narrow_cigar_string(SEXP cigar_string,
429
int Lwidth, int Rwidth, char *cigar_buf, int *rshift)
431
int Loffset, Roffset, buf_offset;
433
int offset, n, OPL /* Operation Length */;
434
char OP /* Operation */;
437
//Rprintf("narrow_cigar_string():\n");
438
errmsg = Lnarrow_cigar_string(cigar_string, &Lwidth, &Loffset,
442
//Rprintf(" Lwidth=%d Loffset=%d *rshift=%d\n",
443
// Lwidth, Loffset, *rshift);
444
errmsg = Rnarrow_cigar_string(cigar_string, &Rwidth, &Roffset);
447
//Rprintf(" Rwidth=%d Roffset=%d\n", Rwidth, Roffset);
448
if (Roffset < Loffset) {
449
snprintf(errmsg_buf, sizeof(errmsg_buf),
450
"CIGAR is empty after narrowing");
454
cig0 = CHAR(cigar_string);
455
for (offset = Loffset; offset <= Roffset; offset += n) {
456
n = get_next_cigar_OP(cig0, offset, &OPL, &OP);
457
if (offset == Loffset)
459
if (offset == Roffset)
462
snprintf(errmsg_buf, sizeof(errmsg_buf),
463
"CIGAR is empty after narrowing");
466
buf_offset += sprintf(cigar_buf + buf_offset,
472
static const char *split_cigar_string(SEXP cigar_string,
473
CharAE *OPbuf, IntAE *OPLbuf)
476
int offset, n, OPL /* Operation Length */;
477
char OP /* Operation */;
479
cig0 = CHAR(cigar_string);
481
while ((n = get_next_cigar_OP(cig0, offset, &OPL, &OP))) {
484
CharAE_insert_at(OPbuf, CharAE_get_nelt(OPbuf), OP);
485
IntAE_insert_at(OPLbuf, IntAE_get_nelt(OPLbuf), OPL);
491
static void drop_append_or_merge_range(RangeAE *range_ae, int start, int width,
492
int drop_empty_range, int merge_range, int nelt0)
494
int nelt, prev_end_plus_1;
496
if (drop_empty_range && width == 0)
498
if (merge_range && (nelt = RangeAE_get_nelt(range_ae)) > nelt0) {
499
/* The incoming range should never overlap with the previous
500
incoming range i.e. 'start' should always be > the end of
501
the previous incoming range. */
503
prev_end_plus_1 = range_ae->start.elts[nelt] +
504
range_ae->width.elts[nelt];
505
if (start == prev_end_plus_1) {
506
range_ae->width.elts[nelt] += width;
510
RangeAE_insert_at(range_ae, RangeAE_get_nelt(range_ae), start, width);
515
* Only the M, =, X, I, and D CIGAR operations produce ranges (1 range per
516
* operation). The I operation always produces an empty range. The D operation
517
* only produces a range if Ds_as_Ns is FALSE.
519
static const char *cigar_string_to_ranges(SEXP cigar_string, int pos_elt,
520
int Ds_as_Ns, int drop_empty_ranges, int reduce_ranges,
524
int out_nelt0, offset, n, OPL /* Operation Length */, start, width;
525
char OP /* Operation */;
527
cig0 = CHAR(cigar_string);
528
out_nelt0 = RangeAE_get_nelt(out);
531
while ((n = get_next_cigar_OP(cig0, offset, &OPL, &OP))) {
536
/* Alignment match (can be a sequence match or mismatch) */
537
case 'M': case '=': case 'X': width = OPL; break;
538
/* Insertion to the reference */
539
case 'I': width = 0; break;
540
/* Deletion from the reference */
547
/* Skipped region from the reference */
548
case 'N': start += OPL; break;
549
/* Soft/hard clip on the read */
550
case 'S': case 'H': break;
551
/* Silent deletion from the padded reference */
554
snprintf(errmsg_buf, sizeof(errmsg_buf),
555
"unknown CIGAR operation '%c' at char %d",
560
drop_append_or_merge_range(out, start, width,
562
reduce_ranges, out_nelt0);
571
/****************************************************************************
572
* --- .Call ENTRY POINT ---
574
* cigar: character vector containing the extended CIGAR string for each
576
* ans_type: a single integer specifying the type of answer to return:
577
* 0: 'ans' is a string describing the first validity failure or NULL;
578
* 1: 'ans' is logical vector with TRUE values for valid elements
581
SEXP valid_cigar(SEXP cigar, SEXP ans_type)
583
SEXP ans, cigar_string;
584
int cigar_length, ans_type0, i, qwidth;
586
char string_buf[200];
588
cigar_length = LENGTH(cigar);
589
ans_type0 = INTEGER(ans_type)[0];
591
PROTECT(ans = NEW_LOGICAL(cigar_length));
594
for (i = 0; i < cigar_length; i++) {
595
cigar_string = STRING_ELT(cigar, i);
596
/* we use cigar_string_to_qwidth() here just for its ability
597
to parse and detect ill-formed CIGAR strings */
598
errmsg = cigar_string_to_qwidth(cigar_string, 1, &qwidth);
599
if (ans_type0 == 1) {
600
LOGICAL(ans)[i] = errmsg == NULL;
603
if (errmsg != NULL) {
604
snprintf(string_buf, sizeof(string_buf),
605
"element %d is invalid (%s)", i + 1, errmsg);
606
return mkString(string_buf);
615
/****************************************************************************
616
* --- .Call ENTRY POINT ---
618
* cigar: character vector containing the extended CIGAR string for each
620
* Return a list of the same length as 'cigar' where each element is itself
621
* a list with 2 elements of the same lengths, the 1st one being a raw
622
* vector containing the CIGAR operations and the 2nd one being an integer
623
* vector containing the lengths of the CIGAR operations.
625
SEXP split_cigar(SEXP cigar)
627
SEXP ans, cigar_string, ans_elt, ans_elt_elt0, ans_elt_elt1;
633
cigar_length = LENGTH(cigar);
634
PROTECT(ans = NEW_LIST(cigar_length));
635
OPbuf = new_CharAE(0);
636
OPLbuf = new_IntAE(0, 0, 0);
637
for (i = 0; i < cigar_length; i++) {
638
cigar_string = STRING_ELT(cigar, i);
639
if (cigar_string == NA_STRING) {
641
error("'cigar' contains NAs");
643
CharAE_set_nelt(&OPbuf, 0);
644
IntAE_set_nelt(&OPLbuf, 0);
645
errmsg = split_cigar_string(cigar_string, &OPbuf, &OPLbuf);
646
if (errmsg != NULL) {
648
error("in 'cigar' element %d: %s", i + 1, errmsg);
650
PROTECT(ans_elt = NEW_LIST(2));
651
PROTECT(ans_elt_elt0 = new_RAW_from_CharAE(&OPbuf));
652
PROTECT(ans_elt_elt1 = new_INTEGER_from_IntAE(&OPLbuf));
653
SET_VECTOR_ELT(ans_elt, 0, ans_elt_elt0);
654
SET_VECTOR_ELT(ans_elt, 1, ans_elt_elt1);
655
SET_VECTOR_ELT(ans, i, ans_elt);
663
/****************************************************************************
664
* --- .Call ENTRY POINT ---
666
* cigar: character vector containing the extended CIGAR string for each
668
* Return an integer matrix with the number of rows equal to the length of
669
* 'cigar' and 7 columns, one for each extended CIGAR operation containing
670
* a frequency count for the operations for each element of 'cigar'.
672
SEXP cigar_op_table(SEXP cigar)
674
SEXP cigar_string, ans, ans_dimnames, ans_colnames;
675
int cigar_length, allOPs_length, i, j, *ans_row;
676
const char *allOPs = "MIDNSHP", *errmsg;
679
cigar_length = LENGTH(cigar);
680
allOPs_length = strlen(allOPs);
681
PROTECT(ans = allocMatrix(INTSXP, cigar_length, allOPs_length));
682
memset(INTEGER(ans), 0, LENGTH(ans) * sizeof(int));
683
ans_row = INTEGER(ans);
684
for (i = 0, ans_row = INTEGER(ans); i < cigar_length; i++, ans_row++) {
685
cigar_string = STRING_ELT(cigar, i);
686
if (cigar_string == NA_STRING) {
687
INTEGER(ans)[i] = NA_INTEGER;
690
errmsg = cigar_string_op_table(cigar_string, allOPs,
691
ans_row, cigar_length);
692
if (errmsg != NULL) {
694
error("in 'cigar' element %d: %s", i + 1, errmsg);
698
PROTECT(ans_colnames = NEW_CHARACTER(7));
700
for (j = 0; j < allOPs_length; j++) {
701
OPstrbuf[0] = allOPs[j];
702
SET_STRING_ELT(ans_colnames, j, mkChar(OPstrbuf));
704
PROTECT(ans_dimnames = NEW_LIST(2));
705
SET_ELEMENT(ans_dimnames, 0, R_NilValue);
706
SET_ELEMENT(ans_dimnames, 1, ans_colnames);
707
SET_DIMNAMES(ans, ans_dimnames);
713
/****************************************************************************
714
* --- .Call ENTRY POINT ---
716
* cigar: character vector containing the extended CIGAR string for each
718
* before_hard_clipping: TRUE or FALSE indicating whether the returned
719
* widths should be those of the reads before or after "hard
721
* Return an integer vector of the same length as 'cigar' containing the
722
* lengths of the query sequences as inferred from the cigar information.
724
SEXP cigar_to_qwidth(SEXP cigar, SEXP before_hard_clipping)
726
SEXP ans, cigar_string;
727
int clip_reads, cigar_length, i, qwidth;
730
clip_reads = !LOGICAL(before_hard_clipping)[0];
731
cigar_length = LENGTH(cigar);
732
PROTECT(ans = NEW_INTEGER(cigar_length));
733
for (i = 0; i < cigar_length; i++) {
734
cigar_string = STRING_ELT(cigar, i);
735
if (cigar_string == NA_STRING) {
736
INTEGER(ans)[i] = NA_INTEGER;
739
errmsg = cigar_string_to_qwidth(cigar_string, clip_reads,
741
if (errmsg != NULL) {
743
error("in 'cigar' element %d: %s", i + 1, errmsg);
745
INTEGER(ans)[i] = qwidth;
752
/****************************************************************************
753
* --- .Call ENTRY POINT ---
755
* cigar: character vector containing the extended CIGAR string for each
757
* Return an integer vector of the same length as 'cigar' containing the
758
* widths of the alignments as inferred from the cigar information.
760
SEXP cigar_to_width(SEXP cigar)
762
SEXP ans, cigar_string;
763
int cigar_length, i, width;
766
cigar_length = LENGTH(cigar);
767
PROTECT(ans = NEW_INTEGER(cigar_length));
768
for (i = 0; i < cigar_length; i++) {
769
cigar_string = STRING_ELT(cigar, i);
770
if (cigar_string == NA_STRING) {
771
INTEGER(ans)[i] = NA_INTEGER;
774
errmsg = cigar_string_to_width(cigar_string, &width);
775
if (errmsg != NULL) {
777
error("in 'cigar' element %d: %s", i + 1, errmsg);
779
INTEGER(ans)[i] = width;
786
/****************************************************************************
787
* --- .Call ENTRY POINT ---
788
* Return a list of 2 elements: 1st elt is the narrowed cigar vector, 2nd elt
789
* is the 'rshift' vector i.e. the integer vector of the same length as 'cigar'
790
* that would need to be added to the 'pos' field of a SAM/BAM file as a
791
* consequence of this narrowing.
793
SEXP cigar_qnarrow(SEXP cigar, SEXP left_qwidth, SEXP right_qwidth)
795
SEXP ans, ans_cigar, ans_cigar_string, ans_rshift, cigar_string;
797
static char cigar_buf[1024];
800
cigar_length = LENGTH(cigar);
801
PROTECT(ans_cigar = NEW_CHARACTER(cigar_length));
802
PROTECT(ans_rshift = NEW_INTEGER(cigar_length));
803
for (i = 0; i < cigar_length; i++) {
804
cigar_string = STRING_ELT(cigar, i);
805
if (cigar_string == NA_STRING) {
806
SET_STRING_ELT(ans_cigar, i, NA_STRING);
807
INTEGER(ans_rshift)[i] = NA_INTEGER;
810
errmsg = qnarrow_cigar_string(cigar_string,
811
INTEGER(left_qwidth)[i],
812
INTEGER(right_qwidth)[i],
813
cigar_buf, INTEGER(ans_rshift) + i);
814
if (errmsg != NULL) {
816
error("in 'cigar' element %d: %s", i + 1, errmsg);
818
PROTECT(ans_cigar_string = mkChar(cigar_buf));
819
SET_STRING_ELT(ans_cigar, i, ans_cigar_string);
823
PROTECT(ans = NEW_LIST(2));
824
SET_VECTOR_ELT(ans, 0, ans_cigar);
825
SET_VECTOR_ELT(ans, 1, ans_rshift);
831
/****************************************************************************
832
* --- .Call ENTRY POINT ---
834
SEXP cigar_narrow(SEXP cigar, SEXP left_width, SEXP right_width)
836
SEXP ans, ans_cigar, ans_cigar_string, ans_rshift, cigar_string;
838
static char cigar_buf[1024];
841
cigar_length = LENGTH(cigar);
842
PROTECT(ans_cigar = NEW_CHARACTER(cigar_length));
843
PROTECT(ans_rshift = NEW_INTEGER(cigar_length));
844
for (i = 0; i < cigar_length; i++) {
845
cigar_string = STRING_ELT(cigar, i);
846
if (cigar_string == NA_STRING) {
847
SET_STRING_ELT(ans_cigar, i, NA_STRING);
848
INTEGER(ans_rshift)[i] = NA_INTEGER;
851
errmsg = narrow_cigar_string(cigar_string,
852
INTEGER(left_width)[i],
853
INTEGER(right_width)[i],
854
cigar_buf, INTEGER(ans_rshift) + i);
855
if (errmsg != NULL) {
857
error("in 'cigar' element %d: %s", i + 1, errmsg);
859
PROTECT(ans_cigar_string = mkChar(cigar_buf));
860
SET_STRING_ELT(ans_cigar, i, ans_cigar_string);
864
PROTECT(ans = NEW_LIST(2));
865
SET_VECTOR_ELT(ans, 0, ans_cigar);
866
SET_VECTOR_ELT(ans, 1, ans_rshift);
872
/****************************************************************************
873
* --- .Call ENTRY POINT ---
875
* cigar: character string containing the extended CIGAR;
876
* drop_D_ranges: TRUE or FALSE indicating whether Ds should be treated
878
* reduce_ranges: TRUE or FALSE indicating whether adjacent ranges coming
879
* from the same cigar should be merged or not.
880
* Return an IRanges object describing the alignment.
882
SEXP cigar_to_IRanges(SEXP cigar,
883
SEXP drop_D_ranges, SEXP drop_empty_ranges, SEXP reduce_ranges)
887
int Ds_as_Ns, drop_empty_ranges0, reduce_ranges0;
890
cigar_string = STRING_ELT(cigar, 0);
891
if (cigar_string == NA_STRING)
892
error("'cigar' is NA");
893
Ds_as_Ns = LOGICAL(drop_D_ranges)[0];
894
drop_empty_ranges0 = LOGICAL(drop_empty_ranges)[0];
895
reduce_ranges0 = LOGICAL(reduce_ranges)[0];
896
range_ae = new_RangeAE(0, 0);
897
errmsg = cigar_string_to_ranges(cigar_string, 1,
898
Ds_as_Ns, drop_empty_ranges0, reduce_ranges0,
902
return new_IRanges_from_RangeAE("IRanges", &range_ae);
906
/****************************************************************************
907
* --- .Call ENTRY POINT ---
909
* cigar: character vector containing the extended CIGAR string for each
911
* pos: integer vector containing the 1-based leftmost position/coordinate
912
* of the clipped read sequence;
913
* flag: NULL or an integer vector containing the SAM flag for each
915
* drop_D_ranges: TRUE or FALSE indicating whether Ds should be treated
917
* 'cigar', 'pos' and 'flag' (when not NULL) are assumed to have the same
918
* length (which is the number of aligned reads).
920
* Returns a CompressedIRangesList object of the same length as the input.
921
* NOTE: See note for cigar_to_list_of_IRanges_by_rname() below about the
923
* TODO: Support character factor 'cigar' in addition to current character
926
SEXP cigar_to_list_of_IRanges_by_alignment(SEXP cigar, SEXP pos, SEXP flag,
927
SEXP drop_D_ranges, SEXP drop_empty_ranges, SEXP reduce_ranges)
930
SEXP ans, ans_unlistData, ans_partitioning, ans_partitioning_end;
931
int cigar_length, Ds_as_Ns, drop_empty_ranges0, reduce_ranges0,
932
i, pos_elt, flag_elt;
936
cigar_length = LENGTH(cigar);
937
Ds_as_Ns = LOGICAL(drop_D_ranges)[0];
938
drop_empty_ranges0 = LOGICAL(drop_empty_ranges)[0];
939
reduce_ranges0 = LOGICAL(reduce_ranges)[0];
940
/* We will generate at least 'cigar_length' ranges. */
941
range_ae = new_RangeAE(cigar_length, 0);
942
PROTECT(ans_partitioning_end = NEW_INTEGER(cigar_length));
943
for (i = 0; i < cigar_length; i++) {
944
if (flag != R_NilValue) {
945
flag_elt = INTEGER(flag)[i];
946
if (flag_elt == NA_INTEGER) {
948
error("'flag' contains NAs");
950
if (flag_elt & 0x004)
953
cigar_string = STRING_ELT(cigar, i);
954
if (cigar_string == NA_STRING) {
956
error("'cigar' contains %sNAs",
957
flag != R_NilValue ? "unexpected " : "");
959
pos_elt = INTEGER(pos)[i];
960
if (pos_elt == NA_INTEGER) {
962
error("'pos' contains %sNAs",
963
flag != R_NilValue ? "unexpected " : "");
965
errmsg = cigar_string_to_ranges(cigar_string, pos_elt,
966
Ds_as_Ns, drop_empty_ranges0, reduce_ranges0,
968
if (errmsg != NULL) {
970
error("in 'cigar' element %d: %s", i + 1, errmsg);
972
INTEGER(ans_partitioning_end)[i] = RangeAE_get_nelt(&range_ae);
974
PROTECT(ans_unlistData = new_IRanges_from_RangeAE(
975
"IRanges", &range_ae));
976
PROTECT(ans_partitioning = new_PartitioningByEnd(
978
ans_partitioning_end, NULL));
979
PROTECT(ans = new_CompressedList(
980
"CompressedIRangesList",
981
ans_unlistData, ans_partitioning));
987
/****************************************************************************
988
* --- .Call ENTRY POINT ---
990
* cigar: character vector containing the extended CIGAR string for each
992
* rname: character factor containing the name of the reference sequence
993
* associated with each read (i.e. the name of the sequence the
994
* read has been aligned to);
995
* pos: integer vector containing the 1-based leftmost position/coordinate
996
* of the clipped read sequence;
997
* flag: NULL or an integer vector containing the SAM flag for each
999
* drop_D_ranges: TRUE or FALSE indicating whether Ds should be treated
1001
* reduce_ranges: TRUE or FALSE indicating whether adjacent ranges coming
1002
* from the same cigar should be merged or not.
1003
* 'cigar', 'rname', 'pos' and 'flag' (when not NULL) are assumed to have
1004
* the same length (which is the number of aligned reads).
1006
* Return a list of IRanges objects named with the factor levels in 'rname'.
1008
* NOTE: According to the SAM Format Specification (0.1.2-draft 20090820),
1009
* the CIGAR (and the read sequence) stored in the SAM file are represented
1010
* on the + strand of the reference sequence. This means that, for a read
1011
* that aligns to the - strand, the bases have been reverse complemented
1012
* from the unmapped read sequence, and that the corresponding CIGAR has
1013
* been reversed. So it seems that, for now, we don't need to deal with the
1014
* strand information at all (as long as we are only interested in
1015
* returning a list of IRanges objects that is suitable for coverage
1019
* - Support 'rname' of length 1.
1020
* - Support character factor 'cigar' in addition to current character vector
1023
SEXP cigar_to_list_of_IRanges_by_rname(SEXP cigar, SEXP rname, SEXP pos,
1025
SEXP drop_D_ranges, SEXP drop_empty_ranges, SEXP reduce_ranges)
1027
SEXP rname_levels, cigar_string, ans, ans_names;
1028
int ans_length, nreads, Ds_as_Ns, drop_empty_ranges0, reduce_ranges0,
1029
i, level, pos_elt, flag_elt;
1030
RangeAEAE range_aeae;
1033
rname_levels = GET_LEVELS(rname);
1034
ans_length = LENGTH(rname_levels);
1035
range_aeae = new_RangeAEAE(ans_length, ans_length);
1036
nreads = LENGTH(pos);
1037
Ds_as_Ns = LOGICAL(drop_D_ranges)[0];
1038
drop_empty_ranges0 = LOGICAL(drop_empty_ranges)[0];
1039
reduce_ranges0 = LOGICAL(reduce_ranges)[0];
1040
for (i = 0; i < nreads; i++) {
1041
if (flag != R_NilValue) {
1042
flag_elt = INTEGER(flag)[i];
1043
if (flag_elt == NA_INTEGER)
1044
error("'flag' contains NAs");
1045
if (flag_elt & 0x004)
1048
cigar_string = STRING_ELT(cigar, i);
1049
if (cigar_string == NA_STRING)
1050
error("'cigar' contains %sNAs",
1051
flag != R_NilValue ? "unexpected " : "");
1052
level = INTEGER(rname)[i];
1053
if (level == NA_INTEGER)
1054
error("'rname' contains %sNAs",
1055
flag != R_NilValue ? "unexpected " : "");
1056
pos_elt = INTEGER(pos)[i];
1057
if (pos_elt == NA_INTEGER)
1058
error("'pos' contains %sNAs",
1059
flag != R_NilValue ? "unexpected " : "");
1060
errmsg = cigar_string_to_ranges(cigar_string, pos_elt,
1061
Ds_as_Ns, drop_empty_ranges0, reduce_ranges0,
1062
range_aeae.elts + level - 1);
1064
error("in 'cigar' element %d: %s", i + 1, errmsg);
1066
PROTECT(ans = new_list_of_IRanges_from_RangeAEAE(
1067
"IRanges", &range_aeae));
1068
PROTECT(ans_names = duplicate(rname_levels));
1069
SET_NAMES(ans, ans_names);
1075
/****************************************************************************
1076
* --- .Call ENTRY POINT ---
1078
* ref_locs: global positions in the reference that we will map
1079
* cigar: character string containing the extended CIGAR;
1080
* pos: reference position at which the query alignment begins
1082
* narrow_left: whether to narrow to the left (or right) side of a gap
1083
* Returns the local query positions. This assumes that the reference
1084
* positions actually occur in the read alignment region, outside of
1085
* any deletions or insertions.
1087
SEXP ref_locs_to_query_locs(SEXP ref_locs, SEXP cigar, SEXP pos,
1092
Rboolean _narrow_left = asLogical(narrow_left);
1094
nlocs = LENGTH(ref_locs);
1095
PROTECT(query_locs = allocVector(INTSXP, nlocs));
1097
for (i = 0; i < nlocs; i++) {
1098
int query_loc = INTEGER(ref_locs)[i] - INTEGER(pos)[i] + 1;
1099
int n, offset = 0, OPL, query_consumed = 0;
1101
const char *cig0 = CHAR(STRING_ELT(cigar, i));
1102
while (query_consumed < query_loc &&
1103
(n = get_next_cigar_OP(cig0, offset, &OPL, &OP)))
1106
/* Alignment match (can be a sequence match or mismatch) */
1107
case 'M': case '=': case 'X':
1108
query_consumed += OPL;
1110
/* Insertion to the reference */
1112
/* Soft clip on the read */
1115
query_consumed += OPL;
1117
/* Deletion from the reference */
1119
case 'N': /* Skipped region from reference; narrow to query */
1121
Rboolean query_loc_past_gap = query_loc - query_consumed > OPL;
1122
if (query_loc_past_gap) {
1126
query_loc = query_consumed;
1128
query_loc = query_consumed + 1;
1133
/* Hard clip on the read */
1136
/* Silent deletion from the padded reference */
1145
error("hit end of cigar string %d: %s", i + 1, cig0);
1146
INTEGER(query_locs)[i] = query_loc;
1154
/****************************************************************************
1155
* --- .Call ENTRY POINT ---
1157
* query_locs: local positions in the read that we will map
1158
* cigar: character string containing the extended CIGAR;
1159
* pos: reference position at which the query alignment begins
1161
* narrow_left: whether to narrow to the left (or right) side of a gap
1162
* Returns the local query positions. This assumes that the reference
1163
* positions actually occur in the read alignment region, outside of
1164
* any deletions or insertions.
1166
SEXP query_locs_to_ref_locs(SEXP query_locs, SEXP cigar, SEXP pos,
1171
Rboolean _narrow_left = asLogical(narrow_left);
1173
nlocs = LENGTH(query_locs);
1174
PROTECT(ref_locs = allocVector(INTSXP, nlocs));
1176
for (i = 0; i < nlocs; i++) {
1177
int query_loc_i = INTEGER(query_locs)[i];
1178
int ref_loc = query_loc_i + INTEGER(pos)[i] - 1;
1179
int n, offset = 0, OPL, query_consumed = 0;
1181
const char *cig0 = CHAR(STRING_ELT(cigar, i));
1182
while (query_consumed < query_loc_i &&
1183
(n = get_next_cigar_OP(cig0, offset, &OPL, &OP)))
1186
/* Alignment match (can be a sequence match or mismatch) */
1187
case 'M': case '=': case 'X':
1188
query_consumed += OPL;
1190
/* Insertion to the reference */
1192
/* Soft clip on the read */
1193
int width_from_insertion_start = query_loc_i - query_consumed;
1194
Rboolean query_loc_past_insertion = width_from_insertion_start > OPL;
1195
if (query_loc_past_insertion) {
1198
ref_loc -= width_from_insertion_start;
1199
if (!_narrow_left) {
1203
query_consumed += OPL;
1207
query_consumed += OPL;
1210
/* Deletion from the reference */
1212
case 'N': /* Skipped region from reference; narrow to query */
1215
/* Hard clip on the read */
1218
/* Silent deletion from the padded reference */
1227
error("hit end of cigar string %d: %s", i + 1, cig0);
1228
INTEGER(ref_locs)[i] = ref_loc;