5
# use lib './blib/lib';
11
use Bio::DB::GFF::Util::Binning 'bin';
13
use constant MYSQL => 'mysql';
14
use constant FDATA => 'fdata';
15
use constant FTYPE => 'ftype';
16
use constant FGROUP => 'fgroup';
17
use constant FDNA => 'fdna';
18
use constant FATTRIBUTE => 'fattribute';
19
use constant FATTRIBUTE_TO_FEATURE => 'fattribute_to_feature';
23
bp_bulk_load_gff.pl - Bulk-load a Bio::DB::GFF database from GFF files.
27
% bp_bulk_load_gff.pl -d testdb dna1.fa dna2.fa features1.gff features2.gff ...
31
This script loads a Bio::DB::GFF database with the features contained
32
in a list of GFF files and/or FASTA sequence files. You must use the
33
exact variant of GFF described in L<Bio::DB::GFF>. Various
34
command-line options allow you to control which database to load and
35
whether to allow an existing database to be overwritten.
37
This script differs from bp_load_gff.pl in that it is hard-coded to use
38
MySQL and cannot perform incremental loads. See L<bp_load_gff.pl> for an
39
incremental loader that works with all databases supported by
40
Bio::DB::GFF, and L<bp_fast_load_gff.pl> for a MySQL loader that supports
41
fast incremental loads.
45
If the filename is given as "-" then the input is taken from standard
46
input. Compressed files (.gz, .Z, .bz2) are automatically
49
FASTA format files are distinguished from GFF files by their filename
50
extensions. Files ending in .fa, .fasta, .fast, .seq, .dna and their
51
uppercase variants are treated as FASTA files. Everything else is
52
treated as a GFF file. If you wish to load -fasta files from STDIN,
53
then use the -f command-line switch with an argument of '-', as in
55
gunzip my_data.fa.gz | bp_bulk_load_gff.pl -d test -f -
57
The nature of the bulk load requires that the database be on the local
58
machine and that the indicated user have the "file" privilege to load
59
the tables and have enough room in /usr/tmp (or whatever is specified
60
by the \$TMPDIR environment variable), to hold the tables transiently.
62
Local data may now be uploaded to a remote server via the --local option
63
with the database host specified in the dsn, e.g. dbi:mysql:test:db_host
65
The adaptor used is dbi::mysqlopt. There is currently no way to
68
About maxfeature: the default value is 100,000,000 bases. If you have
69
features that are close to or greater than 100Mb in length, then the
70
value of maxfeature should be increased to 1,000,000,000. This value
71
must be a power of 10.
73
Note that Windows users must use the --create option.
75
If the list of GFF or fasta files exceeds the kernel limit for the
76
maximum number of command-line arguments, use the
77
--long_list /path/to/files option.
80
=head1 COMMAND-LINE OPTIONS
82
Command-line options can be abbreviated to single-letter options.
83
e.g. -d instead of --database.
85
--database <dsn> Database name (default dbi:mysql:test)
86
--adaptor Adaptor name (default mysql)
87
--create Reinitialize/create data tables without asking
88
--user Username to log in as
89
--fasta File or directory containing fasta files to load
90
--long_list Directory containing a very large number of
91
GFF and/or FASTA files
92
--password Password to use for authentication
93
(Does not work with Postgres, password must be
94
supplied interactively or be left empty for
96
--maxbin Set the value of the maximum bin size
97
--local Flag to indicate that the data source is local
98
--maxfeature Set the value of the maximum feature size (power of 10)
99
--group A list of one or more tag names (comma or space separated)
100
to be used for grouping in the 9th column.
101
--gff3_munge Activate GFF3 name munging (see Bio::DB::GFF)
102
--summary Generate summary statistics for drawing coverage histograms.
103
This can be run on a previously loaded database or during
105
--Temporary Location of a writable scratch directory
109
L<Bio::DB::GFF>, L<bp_fast_load_gff.pl>, L<bp_load_gff.pl>
113
Lincoln Stein, lstein@cshl.org
115
Copyright (c) 2002 Cold Spring Harbor Laboratory
117
This library is free software; you can redistribute it and/or modify
118
it under the same terms as Perl itself. See DISCLAIMER.txt for
119
disclaimers of warranty.
123
package Bio::DB::GFF::Adaptor::fauxmysql;
125
use Bio::DB::GFF::Adaptor::dbi::mysqlopt;
127
@ISA = 'Bio::DB::GFF::Adaptor::dbi::mysqlopt';
129
sub insert_sequence {
131
my ($id,$offset,$seq) = @_;
132
print join("\t",$id,$offset,$seq),"\n";
135
package Bio::DB::GFF::Adaptor::fauxmysqlcmap;
137
use Bio::DB::GFF::Adaptor::dbi::mysqlcmap;
139
@ISA = 'Bio::DB::GFF::Adaptor::dbi::mysqlcmap';
141
sub insert_sequence {
143
my ($id,$offset,$seq) = @_;
144
print join("\t",$id,$offset,$seq),"\n";
147
package Bio::DB::GFF::Adaptor::fauxpg;
149
use Bio::DB::GFF::Adaptor::dbi::pg;
151
@ISA = 'Bio::DB::GFF::Adaptor::dbi::pg';
153
#these two subs are to separate the table creation from the
158
$self->drop_all if $erase;
160
my $dbh = $self->features_db;
161
my $schema = $self->schema;
162
foreach my $table_name ($self->tables) {
163
my $create_table_stmt = $schema->{$table_name}{table} ;
164
$dbh->do($create_table_stmt) || warn $dbh->errstr;
165
# $self->create_other_schema_objects(\%{$schema->{$table_name}});
170
sub _create_indexes_etc {
173
my $dbh = $self->features_db;
174
my $schema = $self->schema;
175
foreach my $table_name ($self->tables) {
176
$self->create_other_schema_objects(\%{$schema->{$table_name}});
180
sub insert_sequence {
182
my ($id,$offset,$seq) = @_;
183
print "$id\t$offset\t$seq\n";
188
eval "use Time::HiRes"; undef $@;
189
my $timer = defined &Time::HiRes::time;
191
my $bWINDOWS = 0; # Boolean: is this a MSWindows operating system?
192
if ($^O =~ /MSWin32/i) {
196
my ($DSN,$ADAPTOR,$FORCE,$USER,$PASSWORD,$FASTA,$LOCAL,$MAX_BIN,$GROUP_TAG,$LONG_LIST,$MUNGE,$TMPDIR);
198
GetOptions ('database:s' => \$DSN,
199
'adaptor:s' => \$ADAPTOR,
202
'password:s' => \$PASSWORD,
203
'fasta:s' => \$FASTA,
205
'maxbin|maxfeature:s' => \$MAX_BIN,
206
'group:s' => \$GROUP_TAG,
207
'long_list:s' => \$LONG_LIST,
208
'gff3_munge' => \$MUNGE,
209
'Temporary:s' => \$TMPDIR,
210
) or (system('pod2text', $0), exit -1);
212
# If called as pg_bulk_load_gff.pl behave as that did.
213
if ($0 =~/pg_bulk_load_gff.pl/){
217
$DSN ||= 'dbi:mysql:test';
218
$MAX_BIN ||= 1_000_000_000; # to accomodate human-sized chromosomes
221
if ($bWINDOWS && not $FORCE) {
222
die "Note that Windows users must use the --create option.\n";
226
die "This will delete all existing data in database $DSN. If you want to do this, rerun with the --create option.\n"
228
open (TTY,"/dev/tty") or die "/dev/tty: $!\n"; #TTY use removed for win compatability
229
print STDERR "This operation will delete all existing data in database $DSN. Continue? ";
231
die "Aborted\n" unless $f =~ /^[yY]/;
235
# postgres DBD::Pg allows 'database', but also 'dbname', and 'db':
236
# and it must be Pg (not pg)
237
$DSN=~s/pg:database=/Pg:/i;
238
$DSN=~s/pg:dbname=/Pg:/i;
239
$DSN=~s/pg:db=/Pg:/i;
241
# leave these lines for mysql
242
$DSN=~s/database=//i;
243
$DSN=~s/;host=/:/i; #cater for dsn in the form of "dbi:mysql:database=$dbname;host=$host"
246
my($DBI,$DBD,$DBNAME,$HOST)=split /:/,$DSN;
247
$DBNAME=$DSN unless $DSN=~/:/;
249
$ADAPTOR ||= 'mysql';
252
# rebuild DSN, DBD::Pg requires full dbname=<name> format
253
$DSN = "dbi:Pg:dbname=$DBNAME";
254
if ($HOST) { $DSN .= ";host=$HOST"; }
257
my ($use_mysql,$use_mysqlcmap,$use_pg) = (0,0,0);
258
if ( $ADAPTOR eq 'mysqlcmap' ) {
261
elsif ( $ADAPTOR =~ /^mysql/ ) {
264
elsif ( $ADAPTOR eq "Pg" ) {
268
die "$ADAPTOR is not an acceptable database adaptor.";
274
push @auth,(-user=>$USER);
275
if ( $use_mysql or $use_mysqlcmap ) {
279
$AUTH .= " -U $USER ";
282
if (defined $PASSWORD) {
283
push @auth,(-pass=>$PASSWORD);
284
if ( $use_mysql or $use_mysqlcmap ) {
285
$AUTH .= " -p$PASSWORD";
287
# elsif ( $use_pg ) {
288
# $AUTH .= " -W $PASSWORD ";
295
if (defined $DBNAME) {
296
if ( $use_mysql or $use_mysqlcmap ) {
297
$AUTH .= " -D$DBNAME ";
300
if (defined $LOCAL) {
302
$AUTH.=' --local-infile=1';
308
if ( $use_mysqlcmap ) {
309
$faux_adaptor = "fauxmysqlcmap";
311
elsif ( $use_mysql ) {
312
$faux_adaptor = "fauxmysql";
315
$faux_adaptor = "fauxpg";
318
my $db = Bio::DB::GFF->new(-adaptor=>$faux_adaptor,-dsn => $DSN,@auth)
319
or die "Can't open database: ",Bio::DB::GFF->error,"\n";
321
$db->gff3_name_munging(1) if $MUNGE;
323
$MAX_BIN ? $db->initialize(-erase=>1,-MAX_BIN=>$MAX_BIN) : $db->initialize(1);
324
$MAX_BIN ||= $db->meta('max_bin') || 100_000_000;
326
# deal with really long lists of files
328
-d $LONG_LIST or die "The --long_list argument must be a directory\n";
329
opendir GFFDIR,$LONG_LIST or die "Could not open $LONG_LIST for reading: $!";
330
@ARGV = map { "$LONG_LIST\/$_" } readdir GFFDIR;
333
if (defined $FASTA && -d $FASTA) {
334
opendir FASTA,$FASTA or die "Could not open $FASTA for reading: $!";
335
push @ARGV, map { "$FASTA\/$_" } readdir FASTA;
338
elsif (defined $FASTA && -f $FASTA) {
344
$_ = "gunzip -c $_ |" if /\.gz$/;
345
$_ = "uncompress -c $_ |" if /\.Z$/;
346
$_ = "bunzip2 -c $_ |" if /\.bz2$/;
351
if (/\.(fa|fasta|dna|seq|fast)(?:$|\.)/i) {
358
push @fasta,$FASTA if defined $FASTA;
360
# drop everything that was there before
362
my $tmpdir = File::Spec->tmpdir() || '/tmp';
363
$tmpdir =~ s!\\!\\\\!g if $bWINDOWS; #eliminates backslash mis-interpretation
364
-d $tmpdir or die <<END;
365
I could not find a suitable temporary directory to write scratch files into ($tmpdir by default).
366
Please select a directory and indicate its location by setting the TMP environment variable, or
367
by using the --Temporary switch.
369
my @fasta_files_to_be_unlinked;
370
my @files = (FDATA,FTYPE,FGROUP,FDNA,FATTRIBUTE,FATTRIBUTE_TO_FEATURE);
372
$FH{$_} = IO::File->new(">$tmpdir/$_.$$") or die $_,": $!";
377
$FH{FDATA() }->print("COPY fdata (fid, fref, fstart, fstop, fbin, ftypeid, fscore, fstrand, fphase, gid, ftarget_start, ftarget_stop) FROM stdin;\n");
378
$FH{FTYPE() }->print("COPY ftype (ftypeid, fmethod, fsource) FROM stdin;\n");
379
$FH{FGROUP() }->print("COPY fgroup (gid, gclass, gname) FROM stdin;\n");
380
$FH{FATTRIBUTE() }->print("COPY fattribute (fattribute_id, fattribute_name) FROM stdin;\n");
381
$FH{FATTRIBUTE_TO_FEATURE()}->print("COPY fattribute_to_feature (fid, fattribute_id, fattribute_value) FROM stdin;\n");
389
my %ATTRIBUTEID = ();
393
my %tmpfiles; # keep track of temporary fasta files
395
my $fasta_sequence_id;
397
my $current_file; #used to reset GFF3 flag in mix of GFF and GFF3 files
399
$db->preferred_groups(split (/[,\s]+/,$GROUP_TAG)) if defined $GROUP_TAG;
401
my $last = Time::HiRes::time() if $timer;
404
# avoid hanging on standalone --fasta load
406
$FH{NULL} = IO::File->new(">$tmpdir/null");
407
push @ARGV, "$tmpdir/null";
414
FetchHashKeyName => 'NAME_lc',
420
$cmap_db = DBI->connect( $DSN, $USER, $PASSWORD, $options );
422
# Only load CMap::Utils if using cmap
423
unless (!$use_mysqlcmap or
425
require Bio::GMOD::CMap::Utils;
426
Bio::GMOD::CMap::Utils->import('next_number');
430
print STDERR "Error loading Bio::GMOD::CMap::Utils\n";
436
$current_file ||= $ARGV;
438
# reset GFF3 flag if new filehandle
439
unless($current_file eq $ARGV){
441
$current_file = $ARGV;
445
my ($ref,$source,$method,$start,$stop,$score,$strand,$phase,$group);
447
# close sequence filehandle if required
448
if ( /^\#|\s+|^$|^>|\t/ && defined $FH{FASTA}) {
453
# print to fasta file if the handle is open
454
if ( defined $FH{FASTA} ) {
455
$FH{FASTA}->print("$_\n");
459
elsif (/^>(\S+)/) { # uh oh, sequence coming
460
$FH{FASTA} = IO::File->new(">$tmpdir/$1\.fa") or die "FASTA: $!\n";
461
$FH{FASTA}->print("$_\n");
462
print STDERR "Preparing embedded sequence $1\n";
463
push @fasta, "$tmpdir/$1\.fa";
464
push @fasta_files_to_be_unlinked,"$tmpdir/$1\.fa";
465
$tmpfiles{"$tmpdir/$1\.fa"}++;
469
elsif (/^\#\#\s*gff-version\s+(\d+)/) {
471
$db->print_gff3_warning() if $gff3;
475
elsif (/^\#\#\s*group-tags\s+(.+)/) {
476
$db->preferred_groups(split(/\s+/,$1));
480
elsif (/^\#\#\s*sequence-region\s+(\S+)\s+(\d+)\s+(\d+)/i) { # header line
481
($ref,$source,$method,$start,$stop,$score,$strand,$phase,$group) =
482
($1,'reference','Component',$2,$3,'.','.','.',$gff3 ? "ID=Sequence:$1": qq(Sequence "$1"));
490
($ref,$source,$method,$start,$stop,$score,$strand,$phase,$group) = split "\t";
492
if ( not defined( $ref ) or length ($ref) == 0) {
493
warn "\$ref is null. source = $source, method = $method, group = $group\n";
497
my $size = $stop-$start+1;
498
warn "Feature $group ($size) is larger than $MAX_BIN. You will have trouble retrieving this feature.\nRerun script with --maxfeature set to a higher power of 10.\n" if $size > $MAX_BIN;
500
$source = '\N' unless defined $source;
501
$score = '\N' if $score eq '.';
502
$strand = '\N' if $strand eq '.';
503
$phase = '\N' if $phase eq '.';
505
my ($group_class,$group_name,$target_start,$target_stop,$attributes) = $db->split_group($group,$gff3);
508
$group_class = [$group_class] unless ref $group_class;
509
$group_name = [$group_name] unless ref $group_name;
511
for (my $i=0; $i < @$group_name; $i++) {
512
$group_class->[$i] ||= '\N';
513
$group_name->[$i] ||= '\N';
514
$target_start ||= '\N';
515
$target_stop ||= '\N';
520
my $gid = $GROUPID{lc join('',$group_class->[$i],$group_name->[$i])} ||= $GID++;
521
my $ftypeid = $FTYPEID{lc join('',$source,$method)} ||= $FTYPEID++;
523
my $bin = bin($start,$stop,$db->min_bin);
524
$FH{ FDATA() }->print( join("\t",$fid,$ref,$start,$stop,$bin,$ftypeid,$score,$strand,$phase,$gid,$target_start,$target_stop),"\n" );
526
my $feature_id = next_number(
528
table_name => 'cmap_feature',
529
id_field => 'feature_id',
531
or die 'No feature id';
532
my $direction = $strand eq '-' ? -1:1;
533
$FH{ FGROUP() }->print(
534
join("\t",$feature_id,$feature_id,'NULL',0, $group_name->[$i],0,0,'NULL',1,$direction, $group_class->[$i],)
536
) unless $DONE{"G$gid"}++;
539
$FH{ FGROUP() }->print( join("\t",$gid,$group_class->[$i],$group_name->[$i]),"\n") unless $DONE{"G$gid"}++;
541
$FH{ FTYPE() }->print( join("\t",$ftypeid,$method,$source),"\n" ) unless $DONE{"T$ftypeid"}++;
543
foreach (@$attributes) {
544
my ($key,$value) = @$_;
545
my $attributeid = $ATTRIBUTEID{$key} ||= $ATTRIBUTEID++;
546
$FH{ FATTRIBUTE() }->print( join("\t",$attributeid,$key),"\n" ) unless $DONE{"A$attributeid"}++;
547
$FH{ FATTRIBUTE_TO_FEATURE() }->print( join("\t",$fid,$attributeid,$value),"\n");
550
if ( $fid % 1000 == 0) {
551
my $now = Time::HiRes::time() if $timer;
552
my $elapsed = $timer ? sprintf(" in %5.2fs",$now - $last) : '';
554
print STDERR "$fid features parsed$elapsed...";
555
print STDERR -t STDOUT && !$ENV{EMACS} ? "\r" : "\n";
560
$FH{FASTA}->close if exists $FH{FASTA};
562
for my $file (@fasta) {
563
warn "Preparing DNA file $file....\n";
565
$FH{FDNA() }->print("COPY fdna (fref, foffset, fdna) FROM stdin;\n");
567
my $old = select($FH{FDNA()});
568
$db->load_fasta($file) or warn "Couldn't load fasta file $file: $!";
570
$FH{FDNA() }->print("\\.\n\n");
574
unlink $file if $tmpfiles{$file};
578
$FH{FDATA() }->print("\\.\n\n");
579
$FH{FTYPE() }->print("\\.\n\n");
580
$FH{FGROUP() }->print("\\.\n\n");
581
$FH{FATTRIBUTE() }->print("\\.\n\n");
582
$FH{FATTRIBUTE_TO_FEATURE()}->print("\\.\n\n");
586
$_->close foreach values %FH;
587
printf STDERR "Total parse time %5.2fs\n",(Time::HiRes::time() - $start) if $timer;
588
warn "Loading feature data and analyzing tables. You may see RDBMS messages here...\n";
591
warn "Loading feature data. You may see Postgres comments...\n";
594
my $file = "$tmpdir/$_.$$";
596
$AUTH ? system("psql $AUTH -f $file $DBNAME")
597
: system('psql','-f', $file, $DBNAME);
602
warn "Updating sequences ...\n";
603
$db->update_sequences();
605
warn "Creating indexes ...\n";
606
$db->_create_indexes_etc();
612
elsif( $use_mysql or $use_mysqlcmap ) {
616
my $TERMINATEDBY = $bWINDOWS ? q( LINES TERMINATED BY '\r\n') : '';
618
my $table = function_to_table($f,$ADAPTOR);
619
my $sql = join ('; ',
620
"lock tables $table write",
621
"delete from $table",
622
"load data $LOCAL infile '$tmpdir/$f.$$' replace into table $table $TERMINATEDBY",
624
my $command = MYSQL . qq[$AUTH -s -e "$sql"];
625
$command =~ s/\n/ /g;
626
$success &&= system($command) == 0;
627
unlink "$tmpdir/$f.$$";
629
printf STDERR "Total load time %5.2fs\n",(time() - $start) if $timer;
630
print STDERR "done...\n";
632
print STDERR "Analyzing/optimizing tables. You will see database messages...\n";
636
my $table = function_to_table($f,$ADAPTOR);
637
$sql .= "analyze table $table;";
639
my $command = MYSQL . qq[$AUTH -N -s -e "$sql"];
640
$success &&= system($command) == 0;
641
printf STDERR "Optimization time time %5.2fs\n",(time() - $start);
644
print "$FEATURES features successfully loaded\n";
646
print "FAILURE: Please see standard error for details\n";
651
foreach (@fasta_files_to_be_unlinked) {
652
unlink "$tmpdir/$_.$$";
655
warn "Building summary statistics for coverage histograms...\n";
658
push @args,(-user=>$USER);
661
if (defined $PASSWORD) {
662
push @args,(-pass=>$PASSWORD);
663
$AUTH .= " -p$PASSWORD";
665
push @args,(-preferred_groups=>[split(/[,\s+]+/,$GROUP_TAG)]) if defined $GROUP_TAG;
666
my $db = Bio::DB::GFF->new(-adaptor=>"dbi::$ADAPTOR",-dsn => $DSN,@args)
667
or die "Can't open database: ",Bio::DB::GFF->error,"\n";
668
$db->build_summary_statistics;
672
sub function_to_table {
673
my $function = shift;
676
if ($function eq 'fdata'){
679
elsif ($function eq 'ftype'){
682
elsif ($function eq 'fgroup'){
683
return 'cmap_feature' if ($adaptor eq 'mysqlcmap');
686
elsif ($function eq 'fdna'){
689
elsif ($function eq 'fattribute'){
692
elsif ($function eq 'fattribute_to_feature'){
693
return 'fattribute_to_feature';