7
I<bp_seqpart.pl> - Takes one or more sequence files and splits them into a number of load balanced files.
11
bp_seqpart.pl -n <NUM_PARTS> [-h, -p <PREFIX>, -f <FORMAT>, -o <OUT_DIR>] <FILES...>
13
-n number of files to create through partitioning
15
-p prefix for all output file names; files are of the form <outdir>/<prefix>#.<format>
16
-f format of the files; defaults to FASTA, but you can specify any format supported by SeqIO from BioPerl
17
-o output directory in which to write the split sequence files
21
Script wrapping SeqIO that partitions multiple sequence files into nearly equal-sized parts for later parallel processing. Even if you have 10 input files, outputting to 10 files will rebalance them so that each output file contains a similar total length of sequence. IDs are ignored when deciding how to balance the sequences.
25
B<Matt Oates> - I<Matt.Oates@bristol.ac.uk>
31
User feedback is an integral part of the evolution of this and other
32
Bioperl modules. Send your comments and suggestions preferably to
33
the Bioperl mailing list. Your participation is much appreciated.
35
bioperl-l@bioperl.org - General discussion
36
http://bioperl.org/wiki/Mailing_lists - About the mailing lists
40
Report bugs to the Bioperl bug tracking system to help us keep track
41
of the bugs and their resolution. Bug reports can be submitted via
44
https://redmine.open-bio.org/projects/bioperl/
48
2012-04-03 - Matt Oates
53
B<Getopt::Long> Used to parse command line options.
54
B<Pod::Usage> Used for usage and help output.
55
B<Bio::SeqIO> Used to cut up sequences and parse FASTA.
57
use Getopt::Long; #Deal with command line options
58
use Pod::Usage; #Print a usage man page from the POD comments after __END__
59
use Bio::SeqIO; #Deal with sequence parsing, format and file IO
61
# Command Line Options
# Each variable holds its default; GetOptions below overwrites it when the
# corresponding flag is supplied on the command line.
my $help;               #Set by -h/--help: print the full POD manual and exit
my $prefix = 'part';    #Prefix used to name each output part
my $format = 'fasta';   #Sequence format we are using, default to fasta
my $outdir = '.';       #Use the current directory as default
my $num_splits;         #Number of files to split into (required, set by -n)
my @partitions;         #Details of each partition for the split

#Set command line flags and parameters.
#GetOptions prints its own diagnostic for a bad option and does not set $!,
#so the failure message deliberately does not append the OS error string.
GetOptions(
    "help|h!"        => \$help,
    "prefix|p=s"     => \$prefix,
    "format|f=s"     => \$format,
    "num-splits|n=i" => \$num_splits,
    "outdir|o=s"     => \$outdir,
) or die "Fatal Error: Problem parsing command-line options\n";

#Print out the full manual if it was asked for.
pod2usage(-exitstatus => 0, -verbose => 2) if $help;

#A missing -n is a usage error, so exit non-zero to let callers detect it.
pod2usage(
    -exitstatus => 2,
    -verbose    => 1,
    -msg        => 'Please specify the number of split parts with -n <N>',
) unless defined $num_splits;

#Zero or negative splits would leave no partition to write sequences into.
pod2usage(
    -exitstatus => 2,
    -verbose    => 1,
    -msg        => 'The number of split parts given with -n must be at least 1',
) if $num_splits < 1;
83
#Setup a bunch of empty partitions including some SeqIO file handles to write to
87
file => Bio::SeqIO->new(
88
-file => ">$outdir/$prefix$_.$format",
94
#Get sequences from all the files specified.
95
foreach my $file (@ARGV) {
96
#Open each input file in turn for reading
97
my $in = Bio::SeqIO->new(
101
#While there are still sequences to consume
102
while ( my $seq = $in->next_seq() ) {
103
#Sort the partitions by how full they are, so the emptiest one comes first
104
@partitions = sort {$a->{size} <=> $b->{size}} @partitions;
105
#Add the length of the current seq to the smallest partition size
106
my $length = $seq->length;
107
$partitions[0]{size} += $length;
108
#Increment the number of sequences held by the partition
109
$partitions[0]{length}++;
110
#Write this sequence to the partition's file
111
$partitions[0]{file}->write_seq($seq);
115
#Report some basic statistics after the job
117
foreach my $partition (@partitions) {
118
print STDERR "$outdir/$prefix$part.$format\n";
119
print STDERR "\tSequence count = $partition->{length}\n";
120
print STDERR "\tSequence characters = $partition->{size}\n";