~ubuntu-branches/ubuntu/wily/r-bioc-genomeinfodb/wily-proposed

« back to all changes in this revision

Viewing changes to R/fetchExtendedChromInfoFromUCSC.R

Committer: Package Import Robot
Author(s): Andreas Tille
Date: 2014-06-08 18:33:04 UTC
Revision ID: package-import@ubuntu.com-20140608183304-mid40gs3cnaz8j3v

Tags: upstream-1.0.2

Import upstream version 1.0.2

files added:

DESCRIPTION

NAMESPACE

NEWS

R/assembly-utils.R

R/fetchExtendedChromInfoFromUCSC.R

R/rankSeqlevels.R

R/seqlevels-helpers.R

R/seqlevels.R

build

build/vignette.rds

inst

inst/doc

inst/doc/Accept-organism-for-GenomeInfoDb.R

inst/doc/Accept-organism-for-GenomeInfoDb.Rnw

inst/doc/Accept-organism-for-GenomeInfoDb.pdf

inst/doc/GenomeInfoDb.R

inst/doc/GenomeInfoDb.Rnw

inst/doc/GenomeInfoDb.pdf

inst/extdata

inst/extdata/dataFiles

inst/extdata/dataFiles/Arabidopsis_thaliana.txt

inst/extdata/dataFiles/Caenorhabditis_elegans.txt

inst/extdata/dataFiles/Cyanidioschyzon_merolae.txt

inst/extdata/dataFiles/Drosophila_melanogaster.txt

inst/extdata/dataFiles/Homo_sapiens.txt

inst/extdata/dataFiles/Oryza_sativa.txt

inst/extdata/dataFiles/Populus_trichocarpa.txt

inst/extdata/dataFiles/Saccharomyces_cerevisiae.txt

inst/extdata/dataFiles/Zea_mays.txt

inst/unitTests

inst/unitTests/test_seqnames.R

man/GenomeInfoDb-helpers.Rd

man/fetchExtendedChromInfoFromUCSC.Rd

man/rankSeqlevels.Rd

tests

tests/runTests.R

vignettes

vignettes/Accept-organism-for-GenomeInfoDb.Rnw

vignettes/GenomeInfoDb.Rnw

Show diffs side-by-side

added added

removed removed

R/fetchExtendedChromInfoFromUCSC.R

### =========================================================================

### fetchExtendedChromInfoFromUCSC()

### -------------------------------------------------------------------------

### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

### Some low-level helpers.

###

### Use this in GenomicFeatures/R/makeTranscriptDbFromUCSC.R instead of

### internal utility .downloadChromInfoFromUCSC().

fetch_ChromInfo_from_UCSC <- function(genome,

goldenPath_url="http://hgdownload.cse.ucsc.edu/goldenPath")

{

url <- paste(goldenPath_url, genome, "database/chromInfo.txt.gz", sep="/")

destfile <- tempfile()

download.file(url, destfile, quiet=TRUE)

colnames <- c("chrom", "size", "fileName")

read.table(destfile, sep="\t", quote="",

col.names=colnames, comment.char="",

stringsAsFactors=FALSE)

}

### Used in BSgenome!

fetch_GenBankAccn2seqlevel_from_NCBI <- function(assembly, AssemblyUnits=NULL)

{

assembly_report <- fetch_assembly_report(assembly,

AssemblyUnits=AssemblyUnits)

GenBank_accn <- assembly_report[["GenBankAccn"]]

if ("na" %in% GenBank_accn)

stop("GenBankAccn field in assembly report for ",

"\"", assembly, "\" contains \"na\"")

stopifnot(anyDuplicated(GenBank_accn) == 0L)

ans <- assembly_report[["SequenceName"]]

names(ans) <- GenBank_accn

ans

}

### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

### fetchExtendedChromInfoFromUCSC()

###

### 'NCBI_seqlevels' and 'NCBI_accns' must be parallel vectors.

.map_UCSC_seqlevels_to_NCBI_seqlevels <- function(UCSC_seqlevels,

NCBI_seqlevels,

NCBI_accns,

special_renamings=NULL)

{

suppressMessages(require(IRanges, quietly=TRUE)) # for elementLengths()

ans <- rep.int(NA_integer_, length(UCSC_seqlevels))

## 1. Handle special renamings.

if (!is.null(special_renamings)) {

m1 <- match(names(special_renamings), UCSC_seqlevels)

if (any(is.na(m1)))

stop("'special_renamings' has names not in 'UCSC_seqlevels'")

m2 <- match(special_renamings, NCBI_seqlevels)

if (any(is.na(m2)))

stop("'special_renamings' has values not in 'NCBI_seqlevels'")

ans[m1] <- m2

}

unmapped_idx <- which(is.na(ans))

if (length(unmapped_idx) == 0L)

return(ans)

## 2. We assign based on exact matching of the sequence names (after

## removal of the "chr" prefix).

nochr_prefix_seqlevels <- sub("^chr", "", UCSC_seqlevels[unmapped_idx])

m <- match(nochr_prefix_seqlevels, NCBI_seqlevels)

ok_idx <- which(!is.na(m))

ans[unmapped_idx[ok_idx]] <- m[ok_idx]

unmapped_idx <- which(is.na(ans))

if (length(unmapped_idx) == 0L)

return(ans)

## 3. We assign based on the number embedded in the UCSC chromosome name.

split_seqlevels <- strsplit(UCSC_seqlevels[unmapped_idx], "_")

nparts <- elementLengths(split_seqlevels)

idx2 <- which(nparts >= 2L)

if (length(idx2) != 0L) {

ans[unmapped_idx[idx2]] <- sapply(idx2,

function(i2) {

part2 <- split_seqlevels[[i2]][2L]

part2 <- sub("v", ".", part2, fixed=TRUE)

m <- grep(part2, NCBI_accns, ignore.case=TRUE)

if (length(m) >= 2L)

stop("cannot map ", UCSC_seqlevels[unmapped_idx[i2]],

" to a unique NCBI seqlevel")

if (length(m) == 0L)

return(NA_integer_)

})

}

ans

}

.standard_fetch_extended_ChromInfo_from_UCSC <- function(genome,

refseq_assembly_id,

100

AssemblyUnits,

101

special_renamings,

102

unmapped,

103

goldenPath_url)

104

{

105

chrominfo <- fetch_ChromInfo_from_UCSC(genome,

106

goldenPath_url=goldenPath_url)

107

assembly_report <- fetch_assembly_report(refseq_assembly_id,

108

AssemblyUnits=AssemblyUnits)

109

stopifnot(nrow(chrominfo) == nrow(assembly_report) + length(unmapped))

110

UCSC_seqlevels <- chrominfo$chrom

111

if (!is.null(unmapped)) {

112

unmapped_idx <- match(unmapped, UCSC_seqlevels)

113

stopifnot(!any(is.na(unmapped_idx)))

114

warning(paste(unmapped, collapse=", "),

115

": not mapped to an NCBI seqlevel")

116

}

117

NCBI_GenBankAccns <- assembly_report$GenBankAccn

118

NCBI_seqlevels <- assembly_report$SequenceName

119

m <- .map_UCSC_seqlevels_to_NCBI_seqlevels(UCSC_seqlevels,

120

NCBI_seqlevels,

121

NCBI_GenBankAccns,

122

special_renamings=special_renamings)

123

unexpectedly_unmapped_idx <-

124

which(is.na(m) & !(UCSC_seqlevels %in% unmapped))

125

if (length(unexpectedly_unmapped_idx) != 0L)

126

stop("cannot map ",

127

paste(UCSC_seqlevels[unexpectedly_unmapped_idx], collapse=", "),

128

" to an NCBI seqlevel")

129

NCBI_GenBankAccns[which(NCBI_GenBankAccns == "na")] <- NA_character_

130

data.frame(UCSC_seqlevels=UCSC_seqlevels,

131

UCSC_seqlengths=chrominfo$size,

132

NCBI_seqlevels=NCBI_seqlevels[m],

133

GenBank_accns=NCBI_GenBankAccns[m],

134

stringsAsFactors=FALSE)

135

}

136

137

.SUPPORTED_GENOMES <- list(

138

hg38=

139

list(FUN=.standard_fetch_extended_ChromInfo_from_UCSC,

140

refseq_assembly_id="GCF_000001405.26",

141

special_renamings=c(chrM="MT")),

142

hg19=

143

list(FUN=.standard_fetch_extended_ChromInfo_from_UCSC,

144

refseq_assembly_id="GCF_000001405.13",

145

## Special renaming of the 9 alternate scaffolds:

146

special_renamings=c(chr6_apd_hap1="HSCHR6_MHC_APD_CTG1",

147

chr6_cox_hap2="HSCHR6_MHC_COX_CTG1",

148

chr6_dbb_hap3="HSCHR6_MHC_DBB_CTG1",

149

chr6_mann_hap4="HSCHR6_MHC_MANN_CTG1",

150

chr6_mcf_hap5="HSCHR6_MHC_MCF_CTG1",

151

chr6_qbl_hap6="HSCHR6_MHC_QBL_CTG1",

152

chr6_ssto_hap7="HSCHR6_MHC_SSTO_CTG1",

153

chr4_ctg9_hap1="HSCHR4_1_CTG9",

154

chr17_ctg5_hap1="HSCHR17_1_CTG5"),

155

unmapped="chrM"),

156

mm10=

157

list(FUN=.standard_fetch_extended_ChromInfo_from_UCSC,

158

refseq_assembly_id="GCF_000001635.20",

159

AssemblyUnits=c("C57BL/6J", "non-nuclear"),

160

special_renamings=c(chrM="MT")),

161

162

## It's impossible to map the sequences in mm9 with the sequences in

163

## MGSCv37 because the mapping doesn't seem to be one-to-one. For example,

164

## it seems that UCSC has merged the 102 unlocalized sequences on

165

## chromosome Y into a single sequence, the chrY_random sequence.

166

## So we can't support mm9.

167

#mm9=

168

# list(FUN=.standard_fetch_extended_ChromInfo_from_UCSC,

169

# refseq_assembly_id="GCF_000001635.18",

170

# AssemblyUnits=c("C57BL/6J", "non-nuclear"),

171

# special_renamings=c(chrM="MT")),

172

dm3=

173

list(FUN=.standard_fetch_extended_ChromInfo_from_UCSC,

174

refseq_assembly_id="GCF_000001215.2",

175

special_renamings=c(chrM="MT", chrU="Un"),

176

unmapped="chrUextra"),

177

sacCer3=

178

list(FUN=.standard_fetch_extended_ChromInfo_from_UCSC,

179

refseq_assembly_id="GCF_000146045.2",

180

special_renamings=c(chrM="MT"))

181

## more to come...

182

)

183

184

.genome2idx <- function(genome)

185

{

186

refseq_assembly_id <- lookup_refseq_assembly_id(genome)

187

if (is.na(refseq_assembly_id))

188

return(NA_integer_)

189

refseq_assembly_ids <- sapply(.SUPPORTED_GENOMES,

190

`[[`, "refseq_assembly_id",

191

USE.NAMES=FALSE)

192

match(refseq_assembly_id, refseq_assembly_ids)

193

}

194

195

### Only supports UCSC genomes. However 'genome' can be:

196

### (a) a UCSC assembly name (e.g. "hg38");

197

### (b) a RefSeq Assembly ID (e.g. "GCF_000001405.26");

198

### (c) a GenBank Assembly ID (e.g. "GCA_000001405.15");

199

### (d) an NCBI assembly name (e.g. "GRCh38").

200

fetchExtendedChromInfoFromUCSC <- function(genome,

201

goldenPath_url="http://hgdownload.cse.ucsc.edu/goldenPath")

202

{

203

if (!.isSingleString(genome))

204

stop("'genome' must be a single string")

205

idx <- match(genome, names(.SUPPORTED_GENOMES))

206

if (is.na(idx)) {

207

idx <- .genome2idx(genome)

208

if (is.na(idx))

209

stop("genome \"", genome, "\" is not supported")

210

}

211

supported_genome <- .SUPPORTED_GENOMES[[idx]]

212

supported_genome$FUN(names(.SUPPORTED_GENOMES)[idx],

213

supported_genome$refseq_assembly_id,

214

supported_genome$AssemblyUnits,

215

supported_genome$special_renamings,

216

supported_genome$unmapped,

217

goldenPath_url)

218

}

219

Older »