~ubuntu-branches/ubuntu/wily/r-bioc-genomeinfodb/wily-proposed

« back to all changes in this revision

Viewing changes to R/fetchExtendedChromInfoFromUCSC.R

  • Committer: Package Import Robot
  • Author(s): Andreas Tille
  • Date: 2014-06-08 18:33:04 UTC
  • Revision ID: package-import@ubuntu.com-20140608183304-mid40gs3cnaz8j3v
Tags: upstream-1.0.2
ImportĀ upstreamĀ versionĀ 1.0.2

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
### =========================================================================
 
2
### fetchExtendedChromInfoFromUCSC()
 
3
### -------------------------------------------------------------------------
 
4
 
 
5
 
 
6
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 
7
### Some low-level helpers.
 
8
###
 
9
 
 
10
### Use this in GenomicFeatures/R/makeTranscriptDbFromUCSC.R instead of
 
11
### internal utility .downloadChromInfoFromUCSC().
 
12
fetch_ChromInfo_from_UCSC <- function(genome,
 
13
        goldenPath_url="http://hgdownload.cse.ucsc.edu/goldenPath")
 
14
{
 
15
    url <- paste(goldenPath_url, genome, "database/chromInfo.txt.gz", sep="/")
 
16
    destfile <- tempfile()
 
17
    download.file(url, destfile, quiet=TRUE)
 
18
    colnames <- c("chrom", "size", "fileName")
 
19
    read.table(destfile, sep="\t", quote="",
 
20
                         col.names=colnames, comment.char="",
 
21
                         stringsAsFactors=FALSE)
 
22
}
 
23
 
 
24
### Used in BSgenome!
 
25
fetch_GenBankAccn2seqlevel_from_NCBI <- function(assembly, AssemblyUnits=NULL)
 
26
{
 
27
    assembly_report <- fetch_assembly_report(assembly,
 
28
                                             AssemblyUnits=AssemblyUnits)
 
29
    GenBank_accn <- assembly_report[["GenBankAccn"]]
 
30
    if ("na" %in% GenBank_accn)
 
31
        stop("GenBankAccn field in assembly report for ",
 
32
             "\"", assembly, "\" contains \"na\"")
 
33
    stopifnot(anyDuplicated(GenBank_accn) == 0L)
 
34
    ans <- assembly_report[["SequenceName"]]
 
35
    names(ans) <- GenBank_accn
 
36
    ans
 
37
}
 
38
 
 
39
 
 
40
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 
41
### fetchExtendedChromInfoFromUCSC()
 
42
###
 
43
 
 
44
### 'NCBI_seqlevels' and 'NCBI_accns' must be parallel vectors.
 
45
.map_UCSC_seqlevels_to_NCBI_seqlevels <- function(UCSC_seqlevels,
 
46
                                                  NCBI_seqlevels,
 
47
                                                  NCBI_accns,
 
48
                                                  special_renamings=NULL)
 
49
{
 
50
    suppressMessages(require(IRanges, quietly=TRUE))  # for elementLengths()
 
51
    ans <- rep.int(NA_integer_, length(UCSC_seqlevels))
 
52
 
 
53
    ## 1. Handle special renamings.
 
54
    if (!is.null(special_renamings)) {
 
55
        m1 <- match(names(special_renamings), UCSC_seqlevels)
 
56
        if (any(is.na(m1)))
 
57
            stop("'special_renamings' has names not in 'UCSC_seqlevels'")
 
58
        m2 <- match(special_renamings, NCBI_seqlevels)
 
59
        if (any(is.na(m2)))
 
60
            stop("'special_renamings' has values not in 'NCBI_seqlevels'")
 
61
        ans[m1] <- m2
 
62
    }
 
63
    unmapped_idx <- which(is.na(ans))
 
64
    if (length(unmapped_idx) == 0L)
 
65
        return(ans)
 
66
 
 
67
    ## 2. We assign based on exact matching of the sequence names (after
 
68
    ##    removal of the "chr" prefix).
 
69
    nochr_prefix_seqlevels <- sub("^chr", "", UCSC_seqlevels[unmapped_idx])
 
70
    m <- match(nochr_prefix_seqlevels, NCBI_seqlevels)
 
71
    ok_idx <- which(!is.na(m))
 
72
    ans[unmapped_idx[ok_idx]] <- m[ok_idx]
 
73
    unmapped_idx <- which(is.na(ans))
 
74
    if (length(unmapped_idx) == 0L)
 
75
        return(ans)
 
76
 
 
77
    ## 3. We assign based on the number embedded in the UCSC chromosome name.
 
78
    split_seqlevels <- strsplit(UCSC_seqlevels[unmapped_idx], "_")
 
79
    nparts <- elementLengths(split_seqlevels)
 
80
    idx2 <- which(nparts >= 2L)
 
81
    if (length(idx2) != 0L) {
 
82
        ans[unmapped_idx[idx2]] <- sapply(idx2,
 
83
            function(i2) {
 
84
                part2 <- split_seqlevels[[i2]][2L]
 
85
                part2 <- sub("v", ".", part2, fixed=TRUE)
 
86
                m <- grep(part2, NCBI_accns, ignore.case=TRUE)
 
87
                if (length(m) >= 2L)
 
88
                    stop("cannot map ", UCSC_seqlevels[unmapped_idx[i2]],
 
89
                         " to a unique NCBI seqlevel")
 
90
                if (length(m) == 0L)
 
91
                    return(NA_integer_)
 
92
                m
 
93
            })
 
94
    }
 
95
    ans
 
96
}
 
97
 
 
98
.standard_fetch_extended_ChromInfo_from_UCSC <- function(genome,
 
99
                                                         refseq_assembly_id,
 
100
                                                         AssemblyUnits,
 
101
                                                         special_renamings,
 
102
                                                         unmapped,
 
103
                                                         goldenPath_url)
 
104
{
 
105
    chrominfo <- fetch_ChromInfo_from_UCSC(genome,
 
106
                                goldenPath_url=goldenPath_url)
 
107
    assembly_report <- fetch_assembly_report(refseq_assembly_id,
 
108
                                             AssemblyUnits=AssemblyUnits)
 
109
    stopifnot(nrow(chrominfo) == nrow(assembly_report) + length(unmapped))
 
110
    UCSC_seqlevels <- chrominfo$chrom
 
111
    if (!is.null(unmapped)) {
 
112
        unmapped_idx <- match(unmapped, UCSC_seqlevels)
 
113
        stopifnot(!any(is.na(unmapped_idx)))
 
114
        warning(paste(unmapped, collapse=", "),
 
115
                ": not mapped to an NCBI seqlevel")
 
116
    }
 
117
    NCBI_GenBankAccns <- assembly_report$GenBankAccn
 
118
    NCBI_seqlevels <- assembly_report$SequenceName
 
119
    m <- .map_UCSC_seqlevels_to_NCBI_seqlevels(UCSC_seqlevels,
 
120
                                NCBI_seqlevels,
 
121
                                NCBI_GenBankAccns,
 
122
                                special_renamings=special_renamings)
 
123
    unexpectedly_unmapped_idx <-
 
124
        which(is.na(m) & !(UCSC_seqlevels %in% unmapped))
 
125
    if (length(unexpectedly_unmapped_idx) != 0L)
 
126
        stop("cannot map ",
 
127
             paste(UCSC_seqlevels[unexpectedly_unmapped_idx], collapse=", "),
 
128
             " to an NCBI seqlevel")
 
129
    NCBI_GenBankAccns[which(NCBI_GenBankAccns == "na")] <- NA_character_
 
130
    data.frame(UCSC_seqlevels=UCSC_seqlevels,
 
131
               UCSC_seqlengths=chrominfo$size,
 
132
               NCBI_seqlevels=NCBI_seqlevels[m],
 
133
               GenBank_accns=NCBI_GenBankAccns[m],
 
134
               stringsAsFactors=FALSE)
 
135
}
 
136
 
 
137
.SUPPORTED_GENOMES <- list(
 
138
    hg38=
 
139
        list(FUN=.standard_fetch_extended_ChromInfo_from_UCSC,
 
140
             refseq_assembly_id="GCF_000001405.26",
 
141
             special_renamings=c(chrM="MT")),
 
142
    hg19=
 
143
        list(FUN=.standard_fetch_extended_ChromInfo_from_UCSC,
 
144
             refseq_assembly_id="GCF_000001405.13",
 
145
             ## Special renaming of the 9 alternate scaffolds:
 
146
             special_renamings=c(chr6_apd_hap1="HSCHR6_MHC_APD_CTG1",
 
147
                                 chr6_cox_hap2="HSCHR6_MHC_COX_CTG1",
 
148
                                 chr6_dbb_hap3="HSCHR6_MHC_DBB_CTG1",
 
149
                                 chr6_mann_hap4="HSCHR6_MHC_MANN_CTG1",
 
150
                                 chr6_mcf_hap5="HSCHR6_MHC_MCF_CTG1",
 
151
                                 chr6_qbl_hap6="HSCHR6_MHC_QBL_CTG1",
 
152
                                 chr6_ssto_hap7="HSCHR6_MHC_SSTO_CTG1",
 
153
                                 chr4_ctg9_hap1="HSCHR4_1_CTG9",
 
154
                                 chr17_ctg5_hap1="HSCHR17_1_CTG5"),
 
155
             unmapped="chrM"),
 
156
    mm10=
 
157
        list(FUN=.standard_fetch_extended_ChromInfo_from_UCSC,
 
158
             refseq_assembly_id="GCF_000001635.20",
 
159
             AssemblyUnits=c("C57BL/6J", "non-nuclear"),
 
160
             special_renamings=c(chrM="MT")),
 
161
 
 
162
    ## It's impossible to map the sequences in mm9 with the sequences in
 
163
    ## MGSCv37 because the mapping doesn't seem to be one-to-one. For example,
 
164
    ## it seems that UCSC has merged the 102 unlocalized sequences on
 
165
    ## chromosome Y into a single sequence, the chrY_random sequence.
 
166
    ## So we can't support mm9.
 
167
    #mm9=
 
168
    #    list(FUN=.standard_fetch_extended_ChromInfo_from_UCSC,
 
169
    #         refseq_assembly_id="GCF_000001635.18",
 
170
    #         AssemblyUnits=c("C57BL/6J", "non-nuclear"),
 
171
    #         special_renamings=c(chrM="MT")),
 
172
    dm3=
 
173
        list(FUN=.standard_fetch_extended_ChromInfo_from_UCSC,
 
174
             refseq_assembly_id="GCF_000001215.2",
 
175
             special_renamings=c(chrM="MT", chrU="Un"),
 
176
             unmapped="chrUextra"),
 
177
    sacCer3=
 
178
        list(FUN=.standard_fetch_extended_ChromInfo_from_UCSC,
 
179
             refseq_assembly_id="GCF_000146045.2",
 
180
             special_renamings=c(chrM="MT"))
 
181
    ## more to come...
 
182
)
 
183
 
 
184
.genome2idx <- function(genome)
 
185
{
 
186
    refseq_assembly_id <- lookup_refseq_assembly_id(genome)
 
187
    if (is.na(refseq_assembly_id))
 
188
        return(NA_integer_)
 
189
    refseq_assembly_ids <- sapply(.SUPPORTED_GENOMES,
 
190
                                  `[[`, "refseq_assembly_id",
 
191
                                  USE.NAMES=FALSE)
 
192
    match(refseq_assembly_id, refseq_assembly_ids)
 
193
}
 
194
 
 
195
### Only supports UCSC genomes. However 'genome' can be:
 
196
###   (a) a UCSC assembly name (e.g. "hg38");
 
197
###   (b) a RefSeq Assembly ID (e.g. "GCF_000001405.26");
 
198
###   (c) a GenBank Assembly ID (e.g. "GCA_000001405.15");
 
199
###   (d) an NCBI assembly name (e.g. "GRCh38").
 
200
fetchExtendedChromInfoFromUCSC <- function(genome,
 
201
        goldenPath_url="http://hgdownload.cse.ucsc.edu/goldenPath")
 
202
{
 
203
    if (!.isSingleString(genome))
 
204
        stop("'genome' must be a single string")
 
205
    idx <- match(genome, names(.SUPPORTED_GENOMES))
 
206
    if (is.na(idx)) {
 
207
        idx <- .genome2idx(genome)
 
208
        if (is.na(idx))
 
209
            stop("genome \"", genome, "\" is not supported")
 
210
    }
 
211
    supported_genome <- .SUPPORTED_GENOMES[[idx]]
 
212
    supported_genome$FUN(names(.SUPPORTED_GENOMES)[idx],
 
213
                         supported_genome$refseq_assembly_id,
 
214
                         supported_genome$AssemblyUnits,
 
215
                         supported_genome$special_renamings,
 
216
                         supported_genome$unmapped,
 
217
                         goldenPath_url)
 
218
}
 
219