1
### =========================================================================
2
### fetchExtendedChromInfoFromUCSC()
3
### -------------------------------------------------------------------------
6
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
7
### Some low-level helpers.
10
### Use this in GenomicFeatures/R/makeTranscriptDbFromUCSC.R instead of
11
### internal utility .downloadChromInfoFromUCSC().
12
fetch_ChromInfo_from_UCSC <- function(genome,
13
goldenPath_url="http://hgdownload.cse.ucsc.edu/goldenPath")
15
url <- paste(goldenPath_url, genome, "database/chromInfo.txt.gz", sep="/")
16
destfile <- tempfile()
17
download.file(url, destfile, quiet=TRUE)
18
colnames <- c("chrom", "size", "fileName")
19
read.table(destfile, sep="\t", quote="",
20
col.names=colnames, comment.char="",
21
stringsAsFactors=FALSE)
25
fetch_GenBankAccn2seqlevel_from_NCBI <- function(assembly, AssemblyUnits=NULL)
27
assembly_report <- fetch_assembly_report(assembly,
28
AssemblyUnits=AssemblyUnits)
29
GenBank_accn <- assembly_report[["GenBankAccn"]]
30
if ("na" %in% GenBank_accn)
31
stop("GenBankAccn field in assembly report for ",
32
"\"", assembly, "\" contains \"na\"")
33
stopifnot(anyDuplicated(GenBank_accn) == 0L)
34
ans <- assembly_report[["SequenceName"]]
35
names(ans) <- GenBank_accn
40
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
41
### fetchExtendedChromInfoFromUCSC()
44
### 'NCBI_seqlevels' and 'NCBI_accns' must be parallel vectors.
45
.map_UCSC_seqlevels_to_NCBI_seqlevels <- function(UCSC_seqlevels,
48
special_renamings=NULL)
50
suppressMessages(require(IRanges, quietly=TRUE)) # for elementLengths()
51
ans <- rep.int(NA_integer_, length(UCSC_seqlevels))
53
## 1. Handle special renamings.
54
if (!is.null(special_renamings)) {
55
m1 <- match(names(special_renamings), UCSC_seqlevels)
57
stop("'special_renamings' has names not in 'UCSC_seqlevels'")
58
m2 <- match(special_renamings, NCBI_seqlevels)
60
stop("'special_renamings' has values not in 'NCBI_seqlevels'")
63
unmapped_idx <- which(is.na(ans))
64
if (length(unmapped_idx) == 0L)
67
## 2. We assign based on exact matching of the sequence names (after
68
## removal of the "chr" prefix).
69
nochr_prefix_seqlevels <- sub("^chr", "", UCSC_seqlevels[unmapped_idx])
70
m <- match(nochr_prefix_seqlevels, NCBI_seqlevels)
71
ok_idx <- which(!is.na(m))
72
ans[unmapped_idx[ok_idx]] <- m[ok_idx]
73
unmapped_idx <- which(is.na(ans))
74
if (length(unmapped_idx) == 0L)
77
## 3. We assign based on the number embedded in the UCSC chromosome name.
78
split_seqlevels <- strsplit(UCSC_seqlevels[unmapped_idx], "_")
79
nparts <- elementLengths(split_seqlevels)
80
idx2 <- which(nparts >= 2L)
81
if (length(idx2) != 0L) {
82
ans[unmapped_idx[idx2]] <- sapply(idx2,
84
part2 <- split_seqlevels[[i2]][2L]
85
part2 <- sub("v", ".", part2, fixed=TRUE)
86
m <- grep(part2, NCBI_accns, ignore.case=TRUE)
88
stop("cannot map ", UCSC_seqlevels[unmapped_idx[i2]],
89
" to a unique NCBI seqlevel")
98
.standard_fetch_extended_ChromInfo_from_UCSC <- function(genome,
105
chrominfo <- fetch_ChromInfo_from_UCSC(genome,
106
goldenPath_url=goldenPath_url)
107
assembly_report <- fetch_assembly_report(refseq_assembly_id,
108
AssemblyUnits=AssemblyUnits)
109
stopifnot(nrow(chrominfo) == nrow(assembly_report) + length(unmapped))
110
UCSC_seqlevels <- chrominfo$chrom
111
if (!is.null(unmapped)) {
112
unmapped_idx <- match(unmapped, UCSC_seqlevels)
113
stopifnot(!any(is.na(unmapped_idx)))
114
warning(paste(unmapped, collapse=", "),
115
": not mapped to an NCBI seqlevel")
117
NCBI_GenBankAccns <- assembly_report$GenBankAccn
118
NCBI_seqlevels <- assembly_report$SequenceName
119
m <- .map_UCSC_seqlevels_to_NCBI_seqlevels(UCSC_seqlevels,
122
special_renamings=special_renamings)
123
unexpectedly_unmapped_idx <-
124
which(is.na(m) & !(UCSC_seqlevels %in% unmapped))
125
if (length(unexpectedly_unmapped_idx) != 0L)
127
paste(UCSC_seqlevels[unexpectedly_unmapped_idx], collapse=", "),
128
" to an NCBI seqlevel")
129
NCBI_GenBankAccns[which(NCBI_GenBankAccns == "na")] <- NA_character_
130
data.frame(UCSC_seqlevels=UCSC_seqlevels,
131
UCSC_seqlengths=chrominfo$size,
132
NCBI_seqlevels=NCBI_seqlevels[m],
133
GenBank_accns=NCBI_GenBankAccns[m],
134
stringsAsFactors=FALSE)
137
.SUPPORTED_GENOMES <- list(
139
list(FUN=.standard_fetch_extended_ChromInfo_from_UCSC,
140
refseq_assembly_id="GCF_000001405.26",
141
special_renamings=c(chrM="MT")),
143
list(FUN=.standard_fetch_extended_ChromInfo_from_UCSC,
144
refseq_assembly_id="GCF_000001405.13",
145
## Special renaming of the 9 alternate scaffolds:
146
special_renamings=c(chr6_apd_hap1="HSCHR6_MHC_APD_CTG1",
147
chr6_cox_hap2="HSCHR6_MHC_COX_CTG1",
148
chr6_dbb_hap3="HSCHR6_MHC_DBB_CTG1",
149
chr6_mann_hap4="HSCHR6_MHC_MANN_CTG1",
150
chr6_mcf_hap5="HSCHR6_MHC_MCF_CTG1",
151
chr6_qbl_hap6="HSCHR6_MHC_QBL_CTG1",
152
chr6_ssto_hap7="HSCHR6_MHC_SSTO_CTG1",
153
chr4_ctg9_hap1="HSCHR4_1_CTG9",
154
chr17_ctg5_hap1="HSCHR17_1_CTG5"),
157
list(FUN=.standard_fetch_extended_ChromInfo_from_UCSC,
158
refseq_assembly_id="GCF_000001635.20",
159
AssemblyUnits=c("C57BL/6J", "non-nuclear"),
160
special_renamings=c(chrM="MT")),
162
## It's impossible to map the sequences in mm9 with the sequences in
163
## MGSCv37 because the mapping doesn't seem to be one-to-one. For example,
164
## it seems that UCSC has merged the 102 unlocalized sequences on
165
## chromosome Y into a single sequence, the chrY_random sequence.
166
## So we can't support mm9.
168
# list(FUN=.standard_fetch_extended_ChromInfo_from_UCSC,
169
# refseq_assembly_id="GCF_000001635.18",
170
# AssemblyUnits=c("C57BL/6J", "non-nuclear"),
171
# special_renamings=c(chrM="MT")),
173
list(FUN=.standard_fetch_extended_ChromInfo_from_UCSC,
174
refseq_assembly_id="GCF_000001215.2",
175
special_renamings=c(chrM="MT", chrU="Un"),
176
unmapped="chrUextra"),
178
list(FUN=.standard_fetch_extended_ChromInfo_from_UCSC,
179
refseq_assembly_id="GCF_000146045.2",
180
special_renamings=c(chrM="MT"))
184
.genome2idx <- function(genome)
186
refseq_assembly_id <- lookup_refseq_assembly_id(genome)
187
if (is.na(refseq_assembly_id))
189
refseq_assembly_ids <- sapply(.SUPPORTED_GENOMES,
190
`[[`, "refseq_assembly_id",
192
match(refseq_assembly_id, refseq_assembly_ids)
195
### Only supports UCSC genomes. However 'genome' can be:
196
### (a) a UCSC assembly name (e.g. "hg38");
197
### (b) a RefSeq Assembly ID (e.g. "GCF_000001405.26");
198
### (c) a GenBank Assembly ID (e.g. "GCA_000001405.15");
199
### (d) an NCBI assembly name (e.g. "GRCh38").
200
fetchExtendedChromInfoFromUCSC <- function(genome,
201
goldenPath_url="http://hgdownload.cse.ucsc.edu/goldenPath")
203
if (!.isSingleString(genome))
204
stop("'genome' must be a single string")
205
idx <- match(genome, names(.SUPPORTED_GENOMES))
207
idx <- .genome2idx(genome)
209
stop("genome \"", genome, "\" is not supported")
211
supported_genome <- .SUPPORTED_GENOMES[[idx]]
212
supported_genome$FUN(names(.SUPPORTED_GENOMES)[idx],
213
supported_genome$refseq_assembly_id,
214
supported_genome$AssemblyUnits,
215
supported_genome$special_renamings,
216
supported_genome$unmapped,