5
# $Id: bp_genbank2gff.PLS,v 1.14 2004/05/05 13:37:12 scain Exp $
12
bp_genbank2gff.pl - Load a Bio::DB::GFF database from GENBANK files.
16
% bp_genbank2gff.pl -d genbank -f localfile.gb
17
% bp_genbank2gff.pl -d genbank --accession AP003256
18
% bp_genbank2gff.pl --accession AP003256 --stdout
22
This script loads a Bio::DB::GFF database with the features contained
23
in either a local genbank file or an accession that is fetched from
24
genbank. Various command-line options allow you to control which
25
database to load and whether to allow an existing database to be
28
The database must already have been created and the current user must
29
have appropriate INSERT and UPDATE privileges. The --create option
30
will initialize a new database with the appropriate schema, deleting
31
any tables that were already there.
33
=head1 COMMAND-LINE OPTIONS
35
Command-line options can be abbreviated to single-letter options.
36
e.g. -d instead of --database.
38
--create Force creation and initialization of database
39
--dsn <dsn> Data source (default dbi:mysql:test)
40
--user <user> Username for mysql authentication
41
--pass <password> Password for mysql authentication
42
--proxy <proxy> Proxy server to use for remote access
43
--stdout direct output to STDOUT
44
--adaptor <adaptor> adaptor to use (eg dbi::mysql, dbi::pg, dbi::oracle)
--viral the genome you are loading is viral (changes tag
46
--source <source> source field for features ['genbank']
47
EITHER --file Arguments that follow are Genbank/EMBL file names
48
OR --gb_folder What follows is a folder full of gb files to process
OR --accession Arguments that follow are genbank accession numbers
50
OR --acc_file Accession numbers (not gi!) in a file (one per line, no punc.)
51
OR --acc_pipe Accession numbers (not gi!) from a STDIN pipe (one
57
L<Bio::DB::GFF>, L<bulk_load_gff.pl>, L<load_gff.pl>
61
Scott Cain, cain@cshl.org
63
Copyright (c) 2003 Cold Spring Harbor Laboratory
65
This library is free software; you can redistribute it and/or modify
66
it under the same terms as Perl itself. See DISCLAIMER.txt for
67
disclaimers of warranty.
71
package Bio::DB::GFF::Adaptor::biofetch_to_stdout;
73
use Bio::DB::GFF::Util::Rearrange;
74
use Bio::DB::GFF::Adaptor::biofetch;
76
@ISA = 'Bio::DB::GFF::Adaptor::biofetch';
79
my ($self,$options) = @_;
80
# synthesize GFF3-compatible line
82
if (my $id = $options->{gname}) {
84
$parent =~ s/\..\d+$// if $options->{method} =~ /^(mRNA|transcript|exon|gene)$/;
85
push @attributes,"Parent=".escape($parent) if $options->{method} =~ /^(variation|exon|CDS|transcript|mRNA|coding)$/;
86
push @attributes,"ID=".escape($id) unless $options->{method} =~ /^(exon|CDS)$/;
88
if (my $tstart = $options->{tstart}) {
89
my $tstop = $options->{tstop};
90
my $target = escape($options->{gname});
91
push @attributes,"Target=$target+$tstart+$tstop";
94
if (my $attributes = $options->{attributes}) {
95
for my $a (@$attributes) {
96
my ($tag,$value) = @$a;
97
push @{$a{escape($tag)}},escape($value);
100
push @attributes,"$a=".join(',',@{$a{$a}});
103
# Default the score column to "." only when no score was supplied; a plain
# truthiness test would also discard a legitimate score of 0.
$options->{score} = "." unless defined $options->{score} && length $options->{score};
104
${$options}{'strand'} = "." unless ${$options}{'strand'};
105
# Default the phase column to "." only when no phase was supplied; a plain
# truthiness test would also discard a legitimate phase of 0.
$options->{phase} = "." unless defined $options->{phase} && length $options->{phase};
106
my $last_column = join ';',@attributes;
107
if ($options->{method} eq 'origin') {
108
print "##sequence-region $options->{gname} $options->{start} $options->{stop}\n";
110
print join("\t",@{$options}{qw(ref source method start stop score strand phase)},$last_column),"\n";
113
sub load_sequence_string {
117
$seq =~ s/(.{1,60})/$1\n/g;
118
print ">$acc\n\L$seq\U\n";
123
print "##gff-version 3\n";
134
Usage: $0 [options] [<gff file 1> <gff file 2>] ...
135
Load a Bio::DB::GFF database from GFF files.
138
--create Force creation and initialization of database
139
--dsn <dsn> Data source (default dbi:mysql:test)
140
--user <user> Username for mysql authentication
141
--pass <password> Password for mysql authentication
142
--proxy <proxy> Proxy server to use for remote access
143
--stdout direct output to STDOUT
144
--adaptor <adaptor> adaptor to use (eg dbi::mysql, dbi::pg, dbi::oracle)
145
--viral the genome you are loading is viral (changes tag
147
--source <source> source field for features ['genbank']
148
EITHER --file Arguments that follow are Genbank/EMBL file names
149
OR --gb_folder What follows is a folder full of gb files to process
150
OR --accession Arguments that follow are genbank accession numbers
152
OR --acc_file Accession numbers (not gi!) in a file (one per line,
154
OR --acc_pipe Accession numbers (not gi!) from a STDIN pipe (one
158
This script loads a Bio::DB::GFF database with the features contained
159
in a either a local genbank file or an accession that is fetched from
160
genbank. Various command-line options allow you to control which
161
database to load and whether to allow an existing database to be
167
my ($DSN,$ADAPTOR,$CREATE,$USER,$VIRAL,$PASSWORD,$gbFOLDER,
168
$FASTA,$ACC,$accFILE, $accPIPE, $FILE,$PROXY,$STDOUT,$SOURCE);
174
'password:s' => \$PASSWORD,
175
'adaptor:s' => \$ADAPTOR,
176
'accession' => \$ACC,
179
'acc_file' => \$accFILE,
180
'acc_pipe' => \$accPIPE,
181
'source:s' => \$SOURCE,
182
'gb_folder=s' => \$gbFOLDER,
183
'proxy:s' => \$PROXY,
184
'stdout' => \$STDOUT,
185
'create' => \$CREATE) or die $USAGE;
188
die $USAGE unless ($DSN || $STDOUT); # at a minimum we need to have a place to write to!
190
# some local defaults
191
$DSN ||= 'dbi:mysql:test';
192
$ADAPTOR ||= $STDOUT ? 'memory' : 'dbi::mysql';
194
# Ensure that biofetch inherits from the "right" adaptor.
195
# This is a horrible hack and should be fixed.
196
eval "use Bio::DB::GFF::Adaptor::${ADAPTOR}";
197
local @Bio::DB::GFF::Adaptor::biofetch::ISA = "Bio::DB::GFF::Adaptor::${ADAPTOR}";
199
my $biofetch = $STDOUT ? 'biofetch_to_stdout' : 'biofetch';
200
my @dsn = $STDOUT ? () : (-dsn => $DSN);
203
push @auth,(-user=>$USER) if defined $USER;
204
push @auth,(-pass=>$PASSWORD) if defined $PASSWORD;
205
push @auth,(-proxy=>$PROXY) if defined $PROXY;
207
my %preferred_tags = (
216
$preferred_tags{'product'} = 90 if $VIRAL; # added this to the default list for viral genomes
217
# since most functions come from post-translational processing, the default labels are not useful
219
my $db = Bio::DB::GFF->new(-adaptor=>$biofetch,
222
-preferred_tags => \%preferred_tags,
223
-source=> $SOURCE || 'Genbank')
224
or die "Can't open database: ",Bio::DB::GFF->error,"\n";
230
die "you must specify either an accession to retrieve from\nembl or a local file containing data in embl format\n" if (($FILE || $ACC) && !scalar(@ARGV));
234
status(loading => $_);
235
my $result = $db->load_from_embl(/^NC_/?'refseq':'embl' => $_);
236
status(done => $result);
243
status('loading' => $_);
244
my $result = $db->load_from_file($_);
245
status (done => $result);
251
my $filename = shift;
252
# The option is declared as 'acc_file' in the GetOptions table, so name that flag in the message.
die "you must supply a filename after the --acc_file command line flag\n" unless $filename;
253
die "file $filename does not exist\n" unless (-e $filename && !(-d $filename));
254
# Use low-precedence "or": with "||" the die was bound to the filename string
# (always true for a non-empty name), so a failed open was silently ignored.
# Three-argument open keeps characters in the name from being read as an open mode.
open(IN, '<', $filename) or die "Can't open file $filename for reading accession numbers: $!\n";
257
status(loading => $_);
258
my $result = $db->load_from_embl(/^NC_/?'refseq':'embl' => $_);
259
status(done => $result);
266
die "folder $dir does not exist\n" unless (-e $dir && -d $dir);
267
# Use low-precedence "or" so a failed opendir actually dies; with "||" the die was
# attached to the "$dir" string and could never fire for a non-empty directory name.
opendir(DIR, $dir) or die "can't open directory $dir for reading: $!\n";
268
my @files = readdir DIR;
269
foreach my $file(@files){
270
if (!(-e "$gbFOLDER/$file") || (-d "$gbFOLDER/$file")){
271
print STDERR " $gbFOLDER/$file is not a filename! Skipping...\n";
274
my $result = $db->load_from_file("$gbFOLDER/$file");
275
print STDERR $result ? "ok\n" : "failed\n";
278
my @accessions = <STDIN>;
280
foreach (@accessions){
281
status(loading => $_);
282
my $result = $db->load_from_embl(/^NC_/?'refseq':'embl' => $_);
283
status(done => $result);
292
status(loading => $_);
293
my $result = $db->load_from_file($_);
294
status(done => $result);
297
$done || die "\n\nno source of data provided\n\n";
302
my ($state,$msg) = @_;
304
if ($state eq 'loading') {
305
print STDERR "Loading $msg...";
306
} elsif ($state eq 'done') {
307
print STDERR $msg ? "ok\n" : "failed\n";