1
### =========================================================================
2
### The GenomicRanges interface
3
### -------------------------------------------------------------------------
6
### TODO: The 'constraint' slot could be moved to the Vector class (or to the
7
### Annotated class) so any Vector object could be constrained.
8
setClass("GenomicRanges",
12
#No more constraint slot for now...
13
#constraint="ConstraintORNULL"
17
setClassUnion("GenomicRangesORmissing", c("GenomicRanges", "missing"))
19
### The code in this file will work out-of-the-box on 'x' as long as
20
### seqnames(x), ranges(x), strand(x), seqlengths(x), seqinfo(),
21
### update(x) and clone(x) are defined.
24
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
25
### 2 non-exported low-level helper functions.
27
### For the 2 functions below, 'x' must be a GenomicRanges object.
28
### They both return a named integer vector where the names are guaranteed
29
### to be 'seqlevels(x)'.
32
minStartPerGRangesSequence <- function(x)
34
cil <- splitAsList(start(x), seqnames(x)) # CompressedIntegerList object
35
v <- Views(cil@unlistData, cil@partitioning) # XIntegerViews object
37
ans[width(v) == 0L] <- NA_integer_
38
names(ans) <- names(v)
42
maxEndPerGRangesSequence <- function(x)
44
cil <- splitAsList(end(x), seqnames(x)) # CompressedIntegerList object
45
v <- Views(cil@unlistData, cil@partitioning) # XIntegerViews object
47
ans[width(v) == 0L] <- NA_integer_
48
names(ans) <- names(v)
53
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
57
setMethod("length", "GenomicRanges", function(x) length(seqnames(x)))
59
setMethod("names", "GenomicRanges", function(x) names(ranges(x)))
61
#setMethod("constraint", "GenomicRanges", function(x) x@constraint)
64
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
67
### TODO: Should we enforce ranges(x) to be an IRanges *instance* (i.e.
68
### class(ranges(x) == "IRanges")) instead of just an IRanges *object* (i.e.
69
### is(ranges(x), "IRanges"))? Right now I can create a GRanges object where
70
### the ranges are a Views object on a very long DNAString subject with
71
### something like 'GRanges("chr1", Views(subject, start=1:2, end=5))'.
72
### Sounds cool but there are also some potential complications with this...
74
.valid.GenomicRanges.length <- function(x)
76
n <- length(seqnames(x))
77
if ((length(ranges(x)) != n)
78
|| (length(strand(x)) != n)
79
|| (nrow(mcols(x)) != n))
80
return("slot lengths are not all equal")
84
.valid.GenomicRanges.seqnames <- function(x)
86
if (!is.factor(runValue(seqnames(x))))
87
return("'seqnames' should be a 'factor' Rle")
88
if (IRanges:::anyMissing(runValue(seqnames(x))))
89
return("'seqnames' contains missing values")
93
.valid.GenomicRanges.ranges <- function(x)
95
if (class(ranges(x)) != "IRanges")
96
return("'ranges(x)' must be an IRanges instance")
100
.valid.GenomicRanges.strand <- function(x)
102
if (!is.factor(runValue(strand(x))) ||
103
!identical(levels(runValue(strand(x))), levels(strand())))
105
msg <- c("'strand' should be a 'factor' Rle with levels c(",
106
paste0('"', levels(strand()), '"', collapse=", "),
108
return(paste(msg, collapse=""))
110
if (IRanges:::anyMissing(runValue(strand(x))))
111
return("'strand' contains missing values")
115
### NOTE: This list is also included in the man page for GRanges objects.
116
### Keep the 2 lists in sync!
117
### We don't put "genome" in that list in order to facilitate import of GFF3
118
### files as GRanges objects (see ?import.gff3 in rtracklayer).
119
INVALID.GR.COLNAMES <- c("seqnames", "ranges", "strand",
120
"seqlevels", "seqlengths", "isCircular",
122
"start", "end", "width", "element")
124
.valid.GenomicRanges.mcols <- function(x)
126
if (any(INVALID.GR.COLNAMES %in% colnames(mcols(x)))) {
127
msg <- c("names of metadata columns cannot be one of ",
128
paste0("\"", INVALID.GR.COLNAMES, "\"", collapse=", "))
129
return(paste(msg, collapse=" "))
134
### Also used by the validity method for GappedAlignments objects.
135
valid.GenomicRanges.seqinfo <- function(x)
137
x_seqinfo <- seqinfo(x)
138
if (!identical(seqlevels(x_seqinfo), levels(seqnames(x)))) {
139
msg <- c("'seqlevels(seqinfo(x))' and 'levels(seqnames(x))'",
141
return(paste(msg, collapse=" "))
143
x_seqlengths <- seqlengths(x_seqinfo)
144
seqs_with_known_length <- names(x_seqlengths)[!is.na(x_seqlengths)]
145
if (length(seqs_with_known_length) == 0L)
147
if (any(x_seqlengths < 0L, na.rm=TRUE))
148
return("'seqlengths(x)' contains negative values")
149
## We check only the ranges that are on a non-circular sequence with
151
x_isCircular <- isCircular(x_seqinfo)
152
non_circ_seqs <- names(x_isCircular)[!(x_isCircular %in% TRUE)]
153
ncswkl <- intersect(non_circ_seqs, seqs_with_known_length)
154
if (length(ncswkl) == 0L)
156
x_seqnames <- seqnames(x)
157
runValue(x_seqnames) <- runValue(x_seqnames)[drop=TRUE]
158
minStarts <- minStartPerGRangesSequence(x)
159
maxEnds <- maxEndPerGRangesSequence(x)
160
if (any(minStarts[ncswkl] < 1L, na.rm=TRUE)
161
|| any(maxEnds[ncswkl] >
162
seqlengths(x)[ncswkl], na.rm=TRUE))
163
warning("'ranges' contains values outside of sequence bounds. ",
164
"See ?trim to subset ranges.")
168
.valid.GenomicRanges <- function(x)
170
c(.valid.GenomicRanges.length(x),
171
.valid.GenomicRanges.seqnames(x),
172
.valid.GenomicRanges.ranges(x),
173
.valid.GenomicRanges.strand(x),
174
.valid.GenomicRanges.mcols(x),
175
valid.GenomicRanges.seqinfo(x))
176
#checkConstraint(x, constraint(x)))
179
setValidity2("GenomicRanges", .valid.GenomicRanges)
182
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
186
setAs("GenomicRanges", "RangedData",
189
rd <- RangedData(ranges(from), strand=strand(from),
190
mcols(from), space=seqnames(from))
191
mcols(ranges(rd)) <- DataFrame(seqlengths=seqlengths(from),
192
isCircular=isCircular(from),
194
metadata(ranges(rd)) <- metadata(from)
195
metadata(ranges(rd))$seqinfo <- seqinfo(from)
200
setAs("GenomicRanges", "RangesList",
203
rl <- split(ranges(from), seqnames(from))
204
mcols_list <- split(DataFrame(strand=strand(from), mcols(from)),
206
rl <- mendoapply(function(ranges, metadata) {
207
mcols(ranges) <- metadata
210
mcols(rl) <- DataFrame(seqlengths=seqlengths(from),
211
isCircular=isCircular(from),
213
metadata(rl) <- metadata(from)
214
metadata(rl)$seqinfo <- seqinfo(from)
219
setMethod("as.data.frame", "GenomicRanges",
220
function(x, row.names=NULL, optional=FALSE, ...)
223
if (missing(row.names))
224
row.names <- names(x)
225
if (!is.null(names(x)))
227
data.frame(seqnames=as.factor(seqnames(x)),
231
strand=as.factor(strand(x)),
232
as.data.frame(mcols(x), ...),
234
stringsAsFactors=FALSE)
238
.fromSeqinfoToGRanges <- function(from)
240
if (any(is.na(seqlengths(from))))
241
stop("cannot create a GenomicRanges from a Seqinfo ",
242
"with NA seqlengths")
243
GRanges(seqnames(from),
244
IRanges(rep(1L, length(from)),
245
width=seqlengths(from),
246
names=seqnames(from)),
250
setAs("Seqinfo", "GenomicRanges", .fromSeqinfoToGRanges)
253
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
257
setReplaceMethod("names", "GenomicRanges",
260
names(ranges(x)) <- value
265
setReplaceMethod("seqnames", "GenomicRanges",
268
if (!is(value, "Rle"))
270
if (!is.factor(runValue(value)))
271
runValue(value) <- factor(runValue(value))
272
if (!identical(levels(value), seqlevels(x)))
273
stop("levels of supplied 'seqnames' must be ",
274
"identical to 'seqlevels(x)'")
278
if ((k == 0L) || (k > n) || (n %% k != 0L))
279
stop(k, " elements in value to replace ", n, " elements")
280
value <- rep(value, length.out=n)
282
update(x, seqnames=value)
286
setReplaceMethod("ranges", "GenomicRanges",
289
if (!is(value, "IRanges"))
290
value <- as(value, "IRanges")
294
if ((k == 0L) || (k > n) || (n %% k != 0L))
295
stop(k, " elements in value to replace ", n, "elements")
296
value <- rep(value, length.out=n)
298
update(x, ranges=value, check=FALSE)
302
normargGenomicRangesStrand <- function(strand, n)
304
if (!is(strand, "Rle"))
305
strand <- Rle(strand)
306
if (!is.factor(runValue(strand))
307
|| !identical(levels(runValue(strand)), levels(strand())))
308
runValue(strand) <- strand(runValue(strand))
311
if (k != 1L && (k == 0L || k > n || n %% k != 0L))
312
stop("supplied 'strand' has ", k, " elements (", n, " expected)")
313
strand <- rep(strand, length.out=n)
318
setReplaceMethod("strand", "GenomicRanges",
321
value <- normargGenomicRangesStrand(value, length(x))
322
x <- update(x, strand=value, check=FALSE)
323
msg <- .valid.GenomicRanges.strand(x)
330
setReplaceMethod("elementMetadata", "GenomicRanges",
331
function(x, ..., value)
333
value <- mk_elementMetadataReplacementValue(x, value)
334
x <- update(x, elementMetadata=value, check=FALSE)
335
msg <- .valid.GenomicRanges.mcols(x)
342
setReplaceMethod("seqinfo", "GenomicRanges",
343
function(x, new2old=NULL, force=FALSE, value)
345
if (!is(value, "Seqinfo"))
346
stop("the supplied 'seqinfo' must be a Seqinfo object")
347
dangling_seqlevels <- getDanglingSeqlevels(x,
348
new2old=new2old, force=force,
350
if (length(dangling_seqlevels) != 0L)
351
x <- x[!(seqnames(x) %in% dangling_seqlevels)]
352
old_seqinfo <- seqinfo(x)
353
new_seqnames <- makeNewSeqnames(x, new2old=new2old, seqlevels(value))
354
x <- update(x, seqnames=new_seqnames, seqinfo=value, check=FALSE)
355
geom_has_changed <- sequenceGeometryHasChanged(seqinfo(x), old_seqinfo,
357
if (any(geom_has_changed, na.rm=TRUE)) {
358
msg <- valid.GenomicRanges.seqinfo(x)
366
setMethod("score", "GenomicRanges", function(x) mcols(x)$score)
368
#setReplaceMethod("constraint", "GenomicRanges",
371
# if (isSingleString(value))
372
# value <- new(value)
373
# if (!is(value, "ConstraintORNULL"))
374
# stop("the supplied 'constraint' must be a ",
375
# "Constraint object, a single string, or NULL")
376
# x@constraint <- value
383
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
384
### Updating and cloning.
386
### An object is either 'update'd in place (usually with a replacement
387
### method) or 'clone'd (copied), with specified slots/fields overridden.
389
### For an object with a pure S4 slot representation, these both map to
390
### initialize. Reference classes will want to override 'update'. Other
391
### external representations need further customization.
393
setMethod("update", "GenomicRanges", # not exported
394
function(object, ..., check=TRUE)
396
initialize(object, ...)
400
setGeneric("clone", function(x, ...) standardGeneric("clone")) # not exported
402
setMethod("clone", "ANY", # not exported
413
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
417
setMethod("start", "GenomicRanges", function(x, ...) start(ranges(x)))
418
setMethod("end", "GenomicRanges", function(x, ...) end(ranges(x)))
419
setMethod("width", "GenomicRanges", function(x) width(ranges(x)))
421
setReplaceMethod("start", "GenomicRanges",
422
function(x, check=TRUE, value)
424
if (!is.integer(value))
425
value <- as.integer(value)
427
starts <- start(ranges)
429
## TODO: Revisit this to handle circularity (maybe).
430
if (!IRanges:::anyMissing(seqlengths(x))) {
431
if (IRanges:::anyMissingOrOutside(starts, 1L)) {
432
warning("trimmed start values to be positive")
433
starts[starts < 1L] <- 1L
436
start(ranges, check=check) <- starts
437
update(x, ranges=ranges, check=FALSE)
441
setReplaceMethod("end", "GenomicRanges",
442
function(x, check=TRUE, value)
444
if (!is.integer(value))
445
value <- as.integer(value)
449
seqlengths <- seqlengths(x)
450
## TODO: Revisit this to handle circularity.
451
if (!IRanges:::anyMissing(seqlengths)) {
452
seqlengths <- seqlengths[seqlevels(x)]
453
maxEnds <- seqlengths[as.integer(seqnames(x))]
454
trim <- which(ends > maxEnds)
455
if (length(trim) > 0L) {
456
warning("trimmed end values to be <= seqlengths")
457
ends[trim] <- maxEnds[trim]
460
end(ranges, check=check) <- ends
461
update(x, ranges=ranges, check=FALSE)
465
setReplaceMethod("width", "GenomicRanges",
466
function(x, check=TRUE, value)
468
if (!is.integer(value))
469
value <- as.integer(value)
470
if (!IRanges:::anyMissing(seqlengths(x))) {
471
end(x) <- start(x) + (value - 1L)
474
width(ranges, check=check) <- value
475
x <- update(x, ranges=ranges, check=FALSE)
482
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
483
### Subsetting and combining.
486
setMethod("[", "GenomicRanges",
487
function(x, i, j, ..., drop)
489
if (length(list(...)) > 0L)
490
stop("invalid subsetting")
491
if (missing(i) && missing(j))
493
x_mcols <- mcols(x, FALSE)
495
ans_mcols <- x_mcols[ , j, drop=FALSE]
496
return(clone(x, elementMetadata=ans_mcols))
498
i <- IRanges:::normalizeSingleBracketSubscript(i, x)
499
ans_seqnames <- seqnames(x)[i]
500
ans_ranges <- ranges(x)[i]
501
ans_strand <- strand(x)[i]
503
ans_mcols <- x_mcols[i, , drop=FALSE]
505
ans_mcols <- x_mcols[i, j, drop=FALSE]
507
clone(x, seqnames=ans_seqnames,
510
elementMetadata=ans_mcols)
514
setReplaceMethod("[", "GenomicRanges",
515
function(x, i, j, ..., value)
517
if (!is(value, "GenomicRanges"))
518
stop("replacement value must be a GenomicRanges object")
519
seqinfo(x) <- merge(seqinfo(x), seqinfo(value))
520
seqnames <- seqnames(x)
523
ans_mcols <- mcols(x, FALSE)
525
iInfo <- IRanges:::.bracket.Index(i, length(x), names(x))
526
if (!is.null(iInfo[["msg"]]))
529
if (missing(i) || !iInfo[["useIdx"]]) {
530
seqnames[] <- seqnames(value)
531
ranges[] <- ranges(value)
532
strand[] <- strand(value)
534
ans_mcols[ , ] <- mcols(value, FALSE)
536
ans_mcols[ , j] <- mcols(value, FALSE)
539
seqnames[i] <- seqnames(value)
540
ranges[i] <- ranges(value)
541
strand[i] <- strand(value)
543
ans_mcols[i, ] <- mcols(value, FALSE)
545
ans_mcols[i, j] <- mcols(value, FALSE)
547
update(x, seqnames=seqnames, ranges=ranges,
548
strand=strand, elementMetadata=ans_mcols)
552
### Not exported. 'x' *must* be an unnamed list of length >= 1 (not checked).
553
.unlist_list_of_GenomicRanges <- function(x, ignore.mcols=FALSE)
555
if (!isTRUEorFALSE(ignore.mcols))
556
stop("'ignore.mcols' must be TRUE or FALSE")
557
ans_class <- class(x[[1L]])
558
ans_seqinfo <- do.call(merge, lapply(x, seqinfo))
559
ans_seqnames <- do.call(c, lapply(x, seqnames))
560
ans_ranges <- do.call(c, lapply(x, ranges))
561
ans_strand <- do.call(c, lapply(x, strand))
563
ans_mcols <- new("DataFrame", nrows=length(ans_ranges))
565
ans_mcols <- do.call(rbind, lapply(x, mcols, FALSE))
567
new(ans_class, seqnames=ans_seqnames, ranges=ans_ranges, strand=ans_strand,
568
elementMetadata=ans_mcols, seqinfo=ans_seqinfo)
571
setMethod("c", "GenomicRanges",
572
function(x, ..., ignore.mcols=FALSE, .ignoreElementMetadata=FALSE,
575
if (!identical(recursive, FALSE))
576
stop("'recursive' argument not supported")
577
if (!isTRUEorFALSE(.ignoreElementMetadata))
578
stop("'.ignoreElementMetadata' must be TRUE or FALSE")
579
if (.ignoreElementMetadata) {
580
msg <- c("the '.ignoreElementMetadata' argument is defunct, ",
581
"please use 'ignore.mcols'\n instead")
585
args <- unname(list(...))
587
args <- unname(list(x, ...))
588
.unlist_list_of_GenomicRanges(args, ignore.mcols=ignore.mcols)
592
setMethod("seqselect", "GenomicRanges",
593
function(x, start=NULL, end=NULL, width=NULL)
595
if (!is.null(end) || !is.null(width))
596
start <- IRanges(start=start, end=end, width=width)
597
irInfo <- IRanges:::.bracket.Index(start, length(x), names(x),
599
if (!is.null(irInfo[["msg"]]))
600
stop(irInfo[["msg"]])
601
if (!irInfo[["useIdx"]])
603
ir <- irInfo[["idx"]]
604
ranges <- seqselect(ranges(x), ir)
606
seqnames=seqselect(seqnames(x), ir),
608
strand=seqselect(strand(x), ir),
609
elementMetadata=seqselect(elementMetadata(x, FALSE), ir))
613
setReplaceMethod("seqselect", "GenomicRanges",
614
function(x, start=NULL, end=NULL, width=NULL, value)
616
if (!is(value, "GenomicRanges"))
617
stop("replacement value must be a GenomicRanges object")
618
seqinfo(x) <- merge(seqinfo(x), seqinfo(value))
619
if (is.null(end) && is.null(width)) {
621
ir <- IRanges(start=1L, width=length(x))
622
else if (is(start, "Ranges"))
625
if (is.logical(start) && length(start) != length(x))
626
start <- rep(start, length.out=length(x))
627
ir <- as(start, "IRanges")
630
ir <- IRanges(start=start, end=end, width=width, names=NULL)
633
if (length(ir) == 0L) {
636
seqnames <- as.factor(seqnames(x))
638
strand <- as.factor(strand(x))
639
mcols <- mcols(x, FALSE)
640
seqselect(seqnames, ir) <- as.factor(seqnames(value))
641
seqselect(ranges, ir) <- ranges(value)
642
seqselect(strand, ir) <- as.factor(strand(value))
643
seqselect(mcols, ir) <- mcols(value)
644
update(x, seqnames=Rle(seqnames), ranges=ranges,
646
elementMetadata=mcols)
651
setMethod("window", "GenomicRanges",
652
function(x, start=NA, end=NA, width=NA,
653
frequency=NULL, delta=NULL, ...)
656
seqnames=window(seqnames(x), start=start, end=end,
657
width=width, frequency=frequency,
659
ranges=window(ranges(x), start=start, end=end,
660
width=width, frequency=frequency,
662
strand=window(strand(x), start=start, end=end,
663
width=width, frequency=frequency,
665
elementMetadata=window(elementMetadata(x, FALSE),
666
start=start, end=end,
667
width=width, frequency=frequency,
673
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
674
### $ and $<- methods
676
### Provided as a convenience, for GenomicRanges *only*, and as the result
677
### of strong popular demand.
678
### Note that those methods are not consistent with the other $ and $<-
679
### methods in the IRanges/GenomicRanges infrastructure, and might confuse
680
### some users by making them believe that a GenomicRanges object can be
681
### manipulated as a data.frame-like object.
682
### Therefore we recommend using them only interactively, and we discourage
683
### their use in scripts or packages. For the latter, use 'mcols(x)$name'
684
### instead of 'x$name'.
687
.DollarNames.GenomicRanges <- function(x, pattern)
688
grep(pattern, names(mcols(x)), value=TRUE)
690
setMethod("$", "GenomicRanges",
691
function(x, name) mcols(x)[[name]]
694
setReplaceMethod("$", "GenomicRanges",
695
function(x, name, value) {mcols(x)[[name]] <- value; x}
699
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
703
.makeNakedMatFromGenomicRanges <- function(x)
707
ans <- cbind(seqnames=as.character(seqnames(x)),
708
ranges=IRanges:::showAsCell(ranges(x)),
709
strand=as.character(strand(x)))
711
tmp <- do.call(data.frame, c(lapply(mcols(x),
712
IRanges:::showAsCell),
713
list(check.names=FALSE)))
714
ans <- cbind(ans, `|`=rep.int("|", lx), as.matrix(tmp))
719
showGenomicRanges <- function(x, margin="",
720
with.classinfo=FALSE, print.seqlengths=FALSE)
724
cat(class(x), " with ",
725
lx, " ", ifelse(lx == 1L, "range", "ranges"),
727
nc, " metadata ", ifelse(nc == 1L, "column", "columns"),
729
out <- makePrettyMatrixForCompactPrinting(x,
730
.makeNakedMatFromGenomicRanges)
731
if (with.classinfo) {
737
classinfo <- makeClassinfoRowForCompactPrinting(x, .COL2CLASS)
738
## A sanity check, but this should never happen!
739
stopifnot(identical(colnames(classinfo), colnames(out)))
740
out <- rbind(classinfo, out)
743
rownames(out) <- paste0(margin, rownames(out))
744
print(out, quote=FALSE, right=TRUE)
745
if (print.seqlengths) {
746
cat(margin, "---\n", sep="")
747
showSeqlengths(x, margin=margin)
751
setMethod("show", "GenomicRanges",
753
showGenomicRanges(object, margin=" ",
754
with.classinfo=TRUE, print.seqlengths=TRUE)