2
# $Id: extract_genes.pl,v 1.4 2006/02/28 02:52:11 bosborne Exp $
7
extract_genes.pl - extract genomic sequences from NCBI files
12
This script is a simple solution to the problem of
13
extracting genomic regions corresponding to genes. There are other
14
solutions, this particular approach uses genomic sequence
15
files from NCBI and gene coordinates from Entrez Gene.
17
The first time this script is run it will be slow as it will
18
extract species-specific data from the gene2accession file and create
19
a storable hash (retrieving the positional data from this hash is
20
significantly faster than reading gene2accession each time the script
21
runs). The subsequent runs should be fast.
27
Install BioPerl, full instructions at http://bioperl.org.
29
=head2 Download gene2accession.gz
31
Download this file from ftp://ftp.ncbi.nlm.nih.gov/gene/DATA into
32
your working directory and gunzip it.
34
=head2 Download sequence files
36
Create one or more species directories in the working directory, the
37
directory names do not have to match those at NCBI (e.g. "Sc", "Hs").
39
Download the nucleotide fasta files for a given species from its CHR*
40
directories at ftp://ftp.ncbi.nlm.nih.gov/genomes and put these files into a
41
species directory. The sequence files will have the suffix ".fna" or
42
"fa.gz", gunzip if necessary.
44
=head2 Determine Taxon id
46
Determine the taxon id for the given species. This id is the first column
47
in the gene2accession file. Modify the %species hash in this script
48
such that name of your species directory is a key and the taxon id is the
51
=head2 Command-line options
54
-s Name of species directory
59
extract_genes.pl -i 850302 -s Sc
68
my %species = ( "Sc" => 4932, # Saccharomyces cerevisiae
69
"Ec" => 83333, # Escherichia coli K12
70
"Hs" => 9606 # H. sapiens
75
GetOptions( "s=s" => \$name,
79
usage() if ($help || !$id || !$name);
81
my $storedHash = $name . ".dump";
83
# create index for a directory of fasta files
84
my $db = Bio::DB::Fasta->new($name, -makeid => \&make_my_id);
86
# extract species-specific data from gene2accession
87
unless (-e $storedHash) {
89
# extract species-specific information from gene2accession
90
open MYIN,"gene2accession" or die "No gene2accession file\n";
92
my @arr = split "\t",$_;
93
if ($arr[0] == $species{$name} && $arr[9] =~ /\d+/ && $arr[10] =~ /\d+/) {
94
($ref->{$arr[1]}->{"start"}, $ref->{$arr[1]}->{"end"},
95
$ref->{$arr[1]}->{"strand"}, $ref->{$arr[1]}->{"id"}) =
96
($arr[9], $arr[10], $arr[11], $arr[7]);
99
# save species-specific information using Storable
100
store $ref, $storedHash;
103
# retrieve the species-specific data from a stored hash
104
my $ref = retrieve($storedHash);
106
# retrieve sequence and sub-sequence
107
if (defined $ref->{$id}) {
108
my $chr = $db->get_Seq_by_id($ref->{$id}->{"id"});
109
my $seq = $chr->trunc($ref->{$id}->{"start"},$ref->{$id}->{"end"});
110
$seq = $seq->revcom if ($ref->{$id}->{"strand"} eq "-");
112
# Insert SeqIO options here...
113
print $seq->seq,"\n";
115
print "Cannot find id: $id\n";
120
$line =~ /ref\|([^|]+)/;