5
# use lib './blib/lib';
9
use Bio::DB::GFF::Util::Binning 'bin';
10
use Bio::DB::GFF::Adaptor::dbi::mysqlopt;
12
use constant MYSQL => 'mysql';
14
use constant FDATA => 'fdata';
15
use constant FTYPE => 'ftype';
16
use constant FGROUP => 'fgroup';
17
use constant FDNA => 'fdna';
18
use constant FATTRIBUTE => 'fattribute';
19
use constant FATTRIBUTE_TO_FEATURE => 'fattribute_to_feature';
21
my $DO_FAST = eval "use POSIX 'WNOHANG'; 1;";
25
bp_fast_load_gff.pl - Fast-load a Bio::DB::GFF database from GFF files.
29
% bp_fast_load_gff.pl -d testdb dna1.fa dna2.fa features1.gff features2.gff ...
33
This script loads a Bio::DB::GFF database with the features contained
34
in a list of GFF files and/or FASTA sequence files. You must use the
35
exact variant of GFF described in L<Bio::DB::GFF>. Various
36
command-line options allow you to control which database to load and
37
whether to allow an existing database to be overwritten.
39
This script is similar to load_gff.pl, but is much faster. However,
40
it is hard-coded to use MySQL and probably only works on Unix
41
platforms due to its reliance on pipes. See L<bp_load_gff.pl> for an
42
incremental loader that works with all databases supported by
43
Bio::DB::GFF, and L<bp_bulk_load_gff.pl> for a fast MySQL loader that
44
supports all platforms.
48
If the filename is given as "-" then the input is taken from
49
standard input. Compressed files (.gz, .Z, .bz2) are automatically
52
FASTA format files are distinguished from GFF files by their filename
53
extensions. Files ending in .fa, .fasta, .fast, .seq, .dna and their
54
uppercase variants are treated as FASTA files. Everything else is
55
treated as a GFF file. If you wish to load FASTA files from STDIN,
56
then use the -f command-line switch with an argument of '-', as in
58
gunzip my_data.fa.gz | bp_fast_load_gff.pl -d test -f -
60
The nature of the load requires that the database be on the local
61
machine and that the indicated user have the "file" privilege to load
62
the tables and have enough room in /usr/tmp (or whatever is specified
63
by the $TMPDIR environment variable), to hold the tables transiently.
64
If your MySQL is version 3.22.6 and was compiled using the "load local
65
file" option, then you may be able to load remote databases with local
66
data using the --local option.
68
About maxfeature: the default value is 100,000,000 bases. If you have
69
features that are close to or greater than 100Mb in length, then the
70
value of maxfeature should be increased to 1,000,000,000. This value
71
must be a power of 10.
73
If the list of GFF or fasta files exceeds the kernel limit for the
74
maximum number of command-line arguments, use the
75
--long_list /path/to/files option.
77
The adaptor used is dbi::mysqlopt. There is currently no way to
80
=head1 COMMAND-LINE OPTIONS
82
Command-line options can be abbreviated to single-letter options.
83
e.g. -d instead of --database.
85
--database <dsn> Mysql database name
86
--create Reinitialize/create data tables without asking
87
--local Try to load a remote database using local data.
88
--user Username to log in as
89
--fasta File or directory containing fasta files to load
90
--password Password to use for authentication
91
--long_list Directory containing a very large number of
92
GFF and/or FASTA files
93
--maxfeature Set the value of the maximum feature size (default 100Mb; must be a power of 10)
94
--group A list of one or more tag names (comma or space separated)
95
to be used for grouping in the 9th column.
96
--gff3_munge Activate GFF3 name munging (see Bio::DB::GFF)
97
--summary Generate summary statistics for drawing coverage histograms.
98
This can be run on a previously loaded database or during
100
--Temporary Location of a writable scratch directory
104
L<Bio::DB::GFF>, L<bp_bulk_load_gff.pl>, L<bp_load_gff.pl>
108
Lincoln Stein, lstein@cshl.org
110
Copyright (c) 2002 Cold Spring Harbor Laboratory
112
This library is free software; you can redistribute it and/or modify
113
it under the same terms as Perl itself. See DISCLAIMER.txt for
114
disclaimers of warranty.
118
package Bio::DB::GFF::Adaptor::faux;
120
use Bio::DB::GFF::Adaptor::dbi::mysqlopt;
122
@ISA = 'Bio::DB::GFF::Adaptor::dbi::mysqlopt';
124
sub insert_sequence {
126
my ($id,$offset,$seq) = @_;
127
print join "\t",$id,$offset,$seq,"\n";
132
eval "use Time::HiRes"; undef $@;
133
my $timer = defined &Time::HiRes::time;
135
my ($DSN,$CREATE,$USER,$PASSWORD,$FASTA,$FAILED,$LOCAL,%PID,$MAX_BIN,$GROUP_TAG,$LONG_LIST,$MUNGE,$TMPDIR,$SUMMARY_STATS);
139
while ((my $child = waitpid(-1,&WNOHANG)) > 0) {
140
delete $PID{$child} or next;
141
$FAILED++ if $? != 0;
146
$SIG{INT} = $SIG{TERM} = sub {cleanup(); exit -1};
148
GetOptions ('database:s' => \$DSN,
149
'create' => \$CREATE,
152
'password:s' => \$PASSWORD,
153
'fasta:s' => \$FASTA,
154
'group:s' => \$GROUP_TAG,
155
'long_list:s' => \$LONG_LIST,
156
'maxbin|maxfeature:s' => \$MAX_BIN,
157
'gff3_munge' => \$MUNGE,
158
'summary' => \$SUMMARY_STATS,
159
'Temporary:s' => \$TMPDIR,
160
) or (system('pod2text',$0), exit -1);
163
$MAX_BIN ||= 1_000_000_000; # to accomodate human-sized chromosomes
167
push @args,(-user=>$USER);
170
if (defined $PASSWORD) {
171
push @args,(-pass=>$PASSWORD);
172
$AUTH .= " -p$PASSWORD";
174
push @args,(-preferred_groups=>[split(/[,\s+]+/,$GROUP_TAG)]) if defined $GROUP_TAG;
176
my $db = Bio::DB::GFF->new(-adaptor=>'faux',-dsn => $DSN,@args)
177
or die "Can't open database: ",Bio::DB::GFF->error,"\n";
179
$db->gff3_name_munging(1) if $MUNGE;
183
$MAX_BIN ? $db->initialize(-erase=>1,-MAX_BIN=>$MAX_BIN) : $db->initialize(1);
186
$MAX_BIN ||= $db->meta('max_bin') || 100_000_000;
188
# deal with really long lists of files
190
-d $LONG_LIST or die "The --long_list argument must be a directory\n";
191
opendir GFFDIR,$LONG_LIST or die "Could not open $LONG_LIST for reading: $!";
192
@ARGV = map { "$LONG_LIST\/$_" } readdir GFFDIR;
195
if (defined $FASTA && -d $FASTA) {
196
opendir FASTA,$FASTA or die "Could not open $FASTA for reading: $!";
197
push @ARGV, map { "$FASTA\/$_" } readdir FASTA;
203
$_ = "gunzip -c $_ |" if /\.gz$/;
204
$_ = "uncompress -c $_ |" if /\.Z$/;
205
$_ = "bunzip2 -c $_ |" if /\.bz2$/;
209
if (/\.(fa|fasta|dna|seq|fast)(?:\.|$)/i) {
216
push @fasta,$FASTA if defined $FASTA;
218
# initialize state variables
225
my %ATTRIBUTEID = ();
229
load_tables($db->dbh) unless $CREATE;
230
my ($major,$minor,$sub) = split /\./,$db->dbh->get_info(18); # SQL_DBMS_VER
231
my $can_disable_indexes = ($major >= 4 and $minor >= 0);
233
# open up pipes to the database
236
my $tmpdir = $TMPDIR || $ENV{TMPDIR} || $ENV{TMP} || File::Spec->tmpdir();
237
-d $tmpdir or die <<END;
238
I could not find a suitable temporary directory to write scratch files into ($tmpdir by default).
239
Please select a directory and indicate its location by setting the TMP environment variable, or
240
by using the --Temporary switch.
243
my @fasta_files_to_be_unlinked;
244
my @files = (FDATA,FTYPE,FGROUP,FDNA,FATTRIBUTE,FATTRIBUTE_TO_FEATURE);
246
my $file = "$tmpdir/$_.$$";
247
print STDERR "creating load file $file...";
248
$DO_FAST &&= (system("mkfifo $file") == 0); # for system(), 0 = success
250
my $delete = $CREATE ? "delete from $_" : '';
251
my $local = $LOCAL ? 'local' : '';
252
my $analyze = "analyze table $_";
257
-e "lock tables $_ write; $delete; load data $local infile '$file' replace into table $_; unlock tables; $analyze"
261
$command =~ s/\n/ /g;
262
$COMMAND{$_} = $command;
265
if (my $pid = fork) {
267
print STDERR "pausing for 0.5 sec..." if $DO_FAST;
268
select(undef,undef,undef,0.50); # work around a race condition
270
} else { # THIS IS IN CHILD PROCESS
271
die "Couldn't fork: $!" unless defined $pid;
272
exec $command || die "Couldn't exec: $!";
276
print STDERR "opening load file for writing...";
277
$FH{$_} = IO::File->new($file,'>') or die $_,": $!";
282
print STDERR "Fast loading enabled\n" if $DO_FAST;
284
my ($count,$gff3,$last,$start,$beginning,$current_file);
286
$last = Time::HiRes::time() if $timer;
287
$beginning = $start = $last;
289
# avoid hanging on standalone --fasta load
291
$FH{NULL} = IO::File->new(">$tmpdir/null");
292
push @ARGV, "$tmpdir/null";
297
# reset GFF3 flag if new filehandle
298
$current_file ||= $ARGV;
299
unless ($current_file eq $ARGV) {
301
$current_file = $ARGV;
305
my ($ref,$source,$method,$start,$stop,$score,$strand,$phase,$group);
307
# close sequence filehandle if required
308
if ( /^\#|\s+|^$|^>|\t/ && defined $FH{FASTA}) {
313
# print to fasta file if the handle is open
314
if ( defined $FH{FASTA} ) {
315
$FH{FASTA}->print("$_\n");
319
elsif (/^>(\S+)/) { # uh oh, sequence coming
320
$FH{FASTA} = IO::File->new(">$tmpdir/$1\.fa") or die "FASTA: $!\n";
321
$FH{FASTA}->print("$_\n");
322
push @fasta, "$tmpdir/$1\.fa";
323
push @fasta_files_to_be_unlinked,"$tmpdir/$1\.fa";
324
print STDERR "Processing embedded sequence $1\n";
328
elsif (/^\#\#\s*group-tags\s+(.+)/) {
329
$db->preferred_groups(split(/\s+/,$1));
333
elsif (/^\#\#\s*gff-version\s+(\d+)/) {
335
$db->print_gff3_warning() if $gff3;
339
elsif (/^\#\#\s*sequence-region\s+(\S+)\s+(\d+)\s+(\d+)/i) { # header line
340
($ref,$source,$method,$start,$stop,$score,$strand,$phase,$group) =
341
($1,'reference','Component',$2,$3,'.','.','.',$gff3 ? "ID=Sequence:$1": qq(Sequence "$1"));
349
($ref,$source,$method,$start,$stop,$score,$strand,$phase,$group) = split "\t";
351
next unless defined $ref;
354
warn "Feature $group is larger than $MAX_BIN. You will have trouble retrieving this feature.\nRerun script with --maxfeature set to a higher power of 10.\n" if $stop-$start+1 > $MAX_BIN;
356
$source = '\N' unless defined $source;
357
$score = '\N' if $score eq '.';
358
$strand = '\N' if $strand eq '.';
359
$phase = '\N' if $phase eq '.';
361
my ($gclass,$gname,$target_start,$target_stop,$attributes) = $db->split_group($group,$gff3);
363
$gclass = [$gclass] unless ref $gclass;
364
$gname = [$gname] unless ref $gname;
366
for (my $i=0; $i < @$gname; $i++) {
367
my $group_class = $gclass->[$i];
368
my $group_name = $gname->[$i];
369
$group_class ||= '\N';
370
$group_name ||= '\N';
371
$target_start ||= '\N';
372
$target_stop ||= '\N';
377
my $gid = $GROUPID{lc join($;,$group_class,$group_name)} ||= $GID++;
378
my $ftypeid = $FTYPEID{lc join($;,$source,$method)} ||= $FTYPEID++;
380
my $bin = bin($start,$stop,$db->min_bin);
381
$FH{ FDATA() }->print( join("\t",$fid,$ref,$start,$stop,$bin,$ftypeid,$score,$strand,$phase,$gid,$target_start,$target_stop),"\n" );
382
$FH{ FGROUP() }->print( join("\t",$gid,$group_class,$group_name),"\n" ) unless $DONE{"fgroup$;$gid"}++;
383
$FH{ FTYPE() }->print( join("\t",$ftypeid,$method,$source),"\n" ) unless $DONE{"ftype$;$ftypeid"}++;
385
foreach (@$attributes) {
386
my ($key,$value) = @$_;
387
my $attributeid = $ATTRIBUTEID{lc $key} ||= $ATTRIBUTEID++;
388
$FH{ FATTRIBUTE() }->print( join("\t",$attributeid,$key),"\n" ) unless $DONE{"fattribute$;$attributeid"}++;
389
$FH{ FATTRIBUTE_TO_FEATURE() }->print( join("\t",$fid,$attributeid,$value),"\n");
392
if ( $FEATURES % 1000 == 0) {
393
my $now = Time::HiRes::time() if $timer;
394
my $elapsed = $timer ? sprintf(" in %5.2fs",$now - $last) : '';
396
print STDERR "$fid features parsed$elapsed...";
397
print STDERR -t STDOUT && !$ENV{EMACS} ? "\r" : "\n";
402
$FH{FASTA}->close if exists $FH{FASTA};
404
printf STDERR "Feature load time %5.2fs\n",(Time::HiRes::time() - $start) if $timer;
407
for my $fasta (@fasta) {
408
warn "Loading fasta ",(-d $fasta?"directory":"file"), " $fasta\n";
409
my $old = select($FH{FDNA()});
410
my $loaded = $db->load_fasta($fasta);
411
warn "$fasta: $loaded records loaded\n";
415
printf STDERR "Fasta load time %5.2fs\n",(Time::HiRes::time() - $start) if $timer;
420
warn "Indexing and analyzing tables. This may take some time (you may see database messages during the process)...\n";
423
$_->close foreach values %FH;
426
warn "Loading feature data and analyzing tables. You may see database messages here...\n";
427
$success &&= system($COMMAND{$_}) == 0 foreach @files;
434
$success &&= !$FAILED;
438
printf STDERR "Total parse & load time %5.2fs\n",(Time::HiRes::time() - $beginning) if $timer;
441
print "SUCCESS: $FEATURES features successfully loaded\n";
444
print "FAILURE: Please see standard error for details\n";
448
if ($SUMMARY_STATS) {
449
warn "Building summary statistics for coverage histograms...\n";
450
$db->build_summary_statistics;
456
foreach (@files,@fasta_files_to_be_unlinked) {
457
unlink "$tmpdir/$_.$$";
461
# load copies of some of the tables into memory
464
print STDERR "loading normalized group, type and attribute information...";
465
$FID = 1 + get_max_id($dbh,'fdata','fid');
466
$GID = 1 + get_max_id($dbh,'fgroup','gid');
467
$FTYPEID = 1 + get_max_id($dbh,'ftype','ftypeid');
468
$ATTRIBUTEID = 1 + get_max_id($dbh,'fattribute','fattribute_id');
469
get_ids($dbh,\%DONE,\%GROUPID,'fgroup','gid','gclass','gname');
470
get_ids($dbh,\%DONE,\%FTYPEID,'ftype','ftypeid','fsource','fmethod');
471
get_ids($dbh,\%DONE,\%ATTRIBUTEID,'fattribute','fattribute_id','fattribute_name');
477
my ($table,$id) = @_;
478
my $sql = "select max($id) from $table";
479
my $result = $dbh->selectcol_arrayref($sql) or die $dbh->errstr;
485
my ($done,$idhash,$table,$id,@columns) = @_;
486
my $columns = join ',',$id,@columns;
487
my $sql = "select $columns from $table";
488
my $sth = $dbh->prepare($sql) or die $dbh->errstr;
489
$sth->execute or die $dbh->errstr;
490
while (my($id,@cols) = $sth->fetchrow_array) {
491
my $key = lc join $;,@cols;
492
$idhash->{$key} = $id;
493
$done->{$table,$id}++;