11
bp_genbank2gff.pl - Load a Bio::DB::GFF database from GENBANK files.
15
% bp_genbank2gff.pl -d genbank -f localfile.gb
16
% bp_genbank2gff.pl -d genbank --accession AP003256
17
% bp_genbank2gff.pl --accession AP003256 --stdout
21
This script loads a Bio::DB::GFF database with the features contained
22
in either a local genbank file or an accession that is fetched from
23
genbank. Various command-line options allow you to control which
24
database to load and whether to allow an existing database to be
27
The database must already have been created and the current user must
28
have appropriate INSERT and UPDATE privileges. The --create option
29
will initialize a new database with the appropriate schema, deleting
30
any tables that were already there.
32
=head1 COMMAND-LINE OPTIONS
34
Command-line options can be abbreviated to single-letter options.
35
e.g. -d instead of --dsn.
37
--create Force creation and initialization of database
38
--dsn <dsn> Data source (default dbi:mysql:test)
39
--user <user> Username for mysql authentication
40
--pass <password> Password for mysql authentication
41
--proxy <proxy> Proxy server to use for remote access
42
--stdout direct output to STDOUT
43
--adaptor <adaptor> adaptor to use (eg dbi::mysql, dbi::pg, dbi::oracle)
--viral the genome you are loading is viral (changes tag
45
--source <source> source field for features ['genbank']
46
EITHER --file Arguments that follow are Genbank/EMBL file names
47
OR --gb_folder What follows is a folder full of gb files to process
OR --accession Arguments that follow are genbank accession numbers
49
OR --acc_file Accession numbers (not gi!) in a file (one per line, no punc.)
50
OR --acc_pipe Accession numbers (not gi!) from a STDIN pipe (one
56
L<Bio::DB::GFF>, L<bulk_load_gff.pl>, L<load_gff.pl>
60
Scott Cain, cain@cshl.org
62
Copyright (c) 2003 Cold Spring Harbor Laboratory
64
This library is free software; you can redistribute it and/or modify
65
it under the same terms as Perl itself. See DISCLAIMER.txt for
66
disclaimers of warranty.
70
package Bio::DB::GFF::Adaptor::biofetch_to_stdout;
72
use Bio::DB::GFF::Util::Rearrange;
73
use Bio::DB::GFF::Adaptor::biofetch;
75
@ISA = 'Bio::DB::GFF::Adaptor::biofetch';
78
my ($self,$options) = @_;
79
# synthesize GFF3-compatible line
81
if (my $id = $options->{gname}) {
83
$parent =~ s/\..\d+$// if $options->{method} =~ /^(mRNA|transcript|exon|gene)$/;
84
push @attributes,"Parent=".escape($parent) if $options->{method} =~ /^(variation|exon|CDS|transcript|mRNA|coding)$/;
85
push @attributes,"ID=".escape($id) unless $options->{method} =~ /^(exon|CDS)$/;
87
if (my $tstart = $options->{tstart}) {
88
my $tstop = $options->{tstop};
89
my $target = escape($options->{gname});
90
push @attributes,"Target=$target+$tstart+$tstop";
93
if (my $attributes = $options->{attributes}) {
94
for my $a (@$attributes) {
95
my ($tag,$value) = @$a;
96
push @{$a{escape($tag)}},escape($value);
99
push @attributes,"$a=".join(',',@{$a{$a}});
102
# Empty GFF columns are written as ".".  Score and phase are tested for
# definedness/length rather than truth, so that a legitimate score of 0 or a
# CDS phase of 0 is emitted as-is instead of being replaced by ".".
${$options}{'score'} = "." unless (defined ${$options}{'score'} && length ${$options}{'score'});
# Strand keeps the truthiness test: an unset or 0 strand is written as ".".
${$options}{'strand'} = "." unless ${$options}{'strand'};
${$options}{'phase'} = "." unless (defined ${$options}{'phase'} && length ${$options}{'phase'});
105
my $last_column = join ';',@attributes;
106
if ($options->{method} eq 'origin') {
107
print "##sequence-region $options->{gname} $options->{start} $options->{stop}\n";
109
print join("\t",@{$options}{qw(ref source method start stop score strand phase)},$last_column),"\n";
112
sub load_sequence_string {
116
$seq =~ s/(.{1,60})/$1\n/g;
117
print ">$acc\n\L$seq\U\n";
122
print "##gff-version 3\n";
133
Usage: $0 [options] [<gff file 1> <gff file 2>] ...
134
Load a Bio::DB::GFF database from GFF files.
137
--create Force creation and initialization of database
138
--dsn <dsn> Data source (default dbi:mysql:test)
139
--user <user> Username for mysql authentication
140
--pass <password> Password for mysql authentication
141
--proxy <proxy> Proxy server to use for remote access
142
--stdout direct output to STDOUT
143
--adaptor <adaptor> adaptor to use (eg dbi::mysql, dbi::pg, dbi::oracle)
144
--viral the genome you are loading is viral (changes tag
146
--source <source> source field for features ['genbank']
147
EITHER --file Arguments that follow are Genbank/EMBL file names
148
OR --gb_folder What follows is a folder full of gb files to process
149
OR --accession Arguments that follow are genbank accession numbers
151
OR --acc_file Accession numbers (not gi!) in a file (one per line,
153
OR --acc_pipe Accession numbers (not gi!) from a STDIN pipe (one
157
This script loads a Bio::DB::GFF database with the features contained
158
in a either a local genbank file or an accession that is fetched from
159
genbank. Various command-line options allow you to control which
160
database to load and whether to allow an existing database to be
166
my ($DSN,$ADAPTOR,$CREATE,$USER,$VIRAL,$PASSWORD,$gbFOLDER,
167
$FASTA,$ACC,$accFILE, $accPIPE, $FILE,$PROXY,$STDOUT,$SOURCE);
173
'password:s' => \$PASSWORD,
174
'adaptor:s' => \$ADAPTOR,
175
'accession' => \$ACC,
178
'acc_file' => \$accFILE,
179
'acc_pipe' => \$accPIPE,
180
'source:s' => \$SOURCE,
181
'gb_folder=s' => \$gbFOLDER,
182
'proxy:s' => \$PROXY,
183
'stdout' => \$STDOUT,
184
'create' => \$CREATE) or die $USAGE;
187
die $USAGE unless ($DSN || $STDOUT); # at a minimum we need to have a place to write to!
189
# some local defaults
190
$DSN ||= 'dbi:mysql:test';
191
$ADAPTOR ||= $STDOUT ? 'memory' : 'dbi::mysql';
193
# Ensure that biofetch inherits from the "right" adaptor.
194
# This is a horrible hack and should be fixed.
195
# Load the requested adaptor class at run time.  The class name is only known
# after option parsing, hence the string eval.  Report a load failure on
# STDERR instead of silently ignoring it; execution continues as before.
eval "use Bio::DB::GFF::Adaptor::${ADAPTOR}";
warn "Could not load Bio::DB::GFF::Adaptor::${ADAPTOR}: $@" if $@;
196
local @Bio::DB::GFF::Adaptor::biofetch::ISA = "Bio::DB::GFF::Adaptor::${ADAPTOR}";
198
my $biofetch = $STDOUT ? 'biofetch_to_stdout' : 'biofetch';
199
my @dsn = $STDOUT ? () : (-dsn => $DSN);
202
push @auth,(-user=>$USER) if defined $USER;
203
push @auth,(-pass=>$PASSWORD) if defined $PASSWORD;
204
push @auth,(-proxy=>$PROXY) if defined $PROXY;
206
my %preferred_tags = (
215
$preferred_tags{'product'} = 90 if $VIRAL; # added this to the default list for viral genomes
216
# since most functions come from post-translational processing, so the default labels are c**p!
218
my $db = Bio::DB::GFF->new(-adaptor=>$biofetch,
221
-preferred_tags => \%preferred_tags,
222
-source=> $SOURCE || 'Genbank')
223
or die "Can't open database: ",Bio::DB::GFF->error,"\n";
229
die "you must specify either an accession to retrieve from\nembl or a local file containing data in embl format\n" if (($FILE || $ACC) && !scalar(@ARGV));
233
status(loading => $_);
234
my $result = $db->load_from_embl(/^NC_/?'refseq':'embl' => $_);
235
status(done => $result);
242
status('loading' => $_);
243
my $result = $db->load_from_file($_);
244
status (done => $result);
250
# The file holding the accession numbers is taken from the argument list.
my $filename = shift;
# The option is spelled --acc_file, so name it that way in the message.
die "you must supply a filename after the --acc_file command line flag\n" unless $filename;
# Must exist and must not be a directory.
die "file $filename does not exist\n" unless (-e $filename && !(-d $filename));
# Use low-precedence 'or' so the die applies to open() itself: with '||' it
# bound to the (always true) filename string and a failed open went unnoticed.
# The three-argument form keeps characters in the filename from being
# interpreted as an open mode.
open IN, '<', $filename or die "Can't open file $filename for reading accession numbers: $!\n";
256
status(loading => $_);
257
my $result = $db->load_from_embl(/^NC_/?'refseq':'embl' => $_);
258
status(done => $result);
265
# The folder must exist and be a directory.
die "folder $dir does not exist\n" unless (-e $dir && -d $dir);
# Use low-precedence 'or' so the die applies to opendir() itself: with '||'
# it bound to the (always true) directory name and a failed opendir went
# unnoticed.
opendir DIR, $dir or die "can't open directory $dir for reading: $!\n";
267
my @files = readdir DIR;
268
foreach my $file(@files){
269
if (!(-e "$gbFOLDER/$file") || (-d "$gbFOLDER/$file")){
270
print STDERR " $gbFOLDER/$file is not a filename! Skipping...\n";
273
my $result = $db->load_from_file("$gbFOLDER/$file");
274
print STDERR $result ? "ok\n" : "failed\n";
277
my @accessions = <STDIN>;
279
foreach (@accessions){
280
status(loading => $_);
281
my $result = $db->load_from_embl(/^NC_/?'refseq':'embl' => $_);
282
status(done => $result);
291
status(loading => $_);
292
my $result = $db->load_from_file($_);
293
status(done => $result);
296
$done || die "\n\nno source of data provided\n\n";
301
my ($state,$msg) = @_;
303
if ($state eq 'loading') {
304
print STDERR "Loading $msg...";
305
} elsif ($state eq 'done') {
306
print STDERR $msg ? "ok\n" : "failed\n";