6
split_seq - splits a sequence into equal sized chunks with an optional
11
split_seq -c 10000 [-o 1000] [-i] -f seq.in
15
The script will split sequences into chunks
19
-c Desired length of the resulting sequences.
20
-f Input file (must be FASTA format).
24
-o Overlapping range between the resulting sequences.
25
-i Create an index file with the resulting sequence files. This is
26
useful if you want to pass this list as input arguments into
27
another programs (i.e. CLUSTAL, HMMER, etc.).
33
User feedback is an integral part of the evolution of this and other
34
Bioperl modules. Send your comments and suggestions preferably to
35
the Bioperl mailing list. Your participation is much appreciated.
37
bioperl-l@bioperl.org - General discussion
38
http://bioperl.org/wiki/Mailing_lists - About the mailing lists
42
Report bugs to the Bioperl bug tracking system to help us keep track
43
of the bugs and their resolution. Bug reports can be submitted via the
46
https://redmine.open-bio.org/projects/bioperl/
50
Ewan Birney E<lt>birney-at-ebi.ac.ukE<gt>
51
Mauricio Herrera Cuadra E<lt>mauricio at open-bio.orgE<gt>
56
# Modules, pragmas and variables to use
60
use vars qw($opt_c $opt_o $opt_i $opt_f $index_file);
62
# Gets options from the command line
63
GetOptions qw(-c=i -o:i -i -f=s);
65
# If no mandatory options are given prints an error and exits
67
print "ERROR: No chunk size has been specified.\n" and exit();
69
print "ERROR: No FASTA file has been specified.\n" and exit();
72
# Declares offset size
73
my $offset = $opt_o ? $opt_o : "0";
75
# Opens the FASTA file
76
my $in = Bio::SeqIO->new(
80
print "==> Opening FASTA file:\t\t\t\t$opt_f\n";
82
# Reads the next sequence object
83
while (my $seq = $in->next_seq()) {
85
# Reads the ID for the sequence and prints it
87
print "--> The ID for this sequence is:\t\t$id\n";
89
# Reads the description for the sequence and prints it
90
my $desc = $seq->desc();
91
print "--> The description for this sequence is:\t$desc\n";
93
# Gets sequence length and prints it
94
my $seq_length = $seq->length();
95
print "--> The length of this sequence is:\t\t$seq_length\n";
97
# If the chunk size is bigger than the sequence length prints the error and exits
98
(print "ERROR: Specified chunk size is bigger than sequence length.\n" and exit()) if ($opt_c > $seq_length);
100
# Creates a directory for writing the resulting files
101
mkdir("split", 0755) unless -e "split" and -d "split";
103
# Creates the INDEX file if the option was given
105
$index_file = "$id.c$opt_c.o$offset.INDEX";
106
open(FH, ">", $index_file) or die("Unable to create file: $index_file ($!)");
109
# Loops through the sequence
110
for (my $i = 1; $i < $seq_length; $i += $opt_c) {
111
my $end = (($i + $opt_c) > $seq_length) ? ($seq_length + 1) : ($i + $opt_c);
112
my $seq_range = (($i + $opt_c) > $seq_length) ? "$i-".($end - 1) : "$i-$end";
114
$id .= "_$seq_range";
116
# Stores chunk into its corresponding FASTA file
117
my $out = Bio::SeqIO->new(
118
-file => ">split/$id.faa",
121
my $trunc_seq = $seq->trunc($i, $end - 1);
123
$out->write_seq($trunc_seq);
124
print "==> Sequence chunk:\t$seq_range\tstored in file:\tsplit/$id.faa\n";
126
# Prints the current file name into the INDEX file if the option was given
127
print FH "split/$id.faa\n" if $opt_i;
129
# Decreases the $i value with the offset value
133
# Closes the INDEX file if the option was given
135
print "==> INDEX stored in file:\t\t\t$index_file\n";