177
177
my $aln = Bio::SimpleAlign->new(-source => 'phylip');
179
# skip blank lines until we see header line
180
# if we see a non-blank line that isn't the seqcount and residuecount line
179
# First, parse up through the header.
180
# If we see a non-blank line that isn't the seqcount and residuecount line
181
181
# then bail out of next_aln (return)
182
HEADER: while ($entry = $self->_readline) {
183
next if $entry =~ /^\s?$/;
184
if ($entry =~ /\s*(\d+)\s+(\d+)/) {
182
while ($entry = $self->_readline) {
183
if ($entry =~ /^\s?$/) {
185
} elsif ($entry =~ /\s*(\d+)\s+(\d+)/) {
185
186
($seqcount, $residuecount) = ($1, $2);
189
$self->warn ("Failed to parse PHYLIP: Did not see a sequence count and residue count.");
190
return unless $seqcount and $residuecount;
192
# first alignment section
194
# First alignment section. We expect to see a name and (part of) a sequence.
193
195
my $idlen = $self->idlength;
196
my $interleaved = $self->interleaved;
197
while( $entry = $self->_readline) {
198
last if( $entry =~ /^\s?$/ && $interleaved );
200
# we've hit the next entry.
201
if( $entry =~ /^\s+(\d+)\s+(\d+)\s*$/) {
202
$self->_pushback($entry);
205
if( $self->longid && $entry =~ /\w/ ) {
207
$entry =~ /^\s*'([^']+)'\s+(.+)$/;
211
$entry =~ /^\s*([^\s]+)\s+(.+)$/;
215
# $name =~ s/[\s\/]/_/g; # not sure how wise it is to do this
216
$name =~ s/_+$//; # remove any trailing _'s
220
$count = scalar @names;
221
$hash{$count} = $str;
223
} elsif( $entry =~ /^\s+(.+)$/ ) {
227
$count = scalar @names;
228
$hash{$count} .= $str;
229
} elsif( $entry =~ /^(.{$idlen})\s*(.*)\s$/ ||
230
$entry =~ /^(.{$idlen})(\S{$idlen}\s+.+)\s$/ # Handle weirdness when id is too long
234
$name =~ s/[\s\/]/_/g;
235
$name =~ s/_+$//; # remove any trailing _'s
239
$count = scalar @names;
240
$hash{$count} = $str;
241
} elsif( $interleaved ) {
242
if( $entry =~ /^(\S+)\s+(.+)/ ||
243
$entry =~ /^(.{$idlen})(.*)\s$/ ) {
246
$name =~ s/[\s\/]/_/g;
247
$name =~ s/_+$//; # remove any trailing _'s
250
$count = scalar @names;
251
$hash{$count} = $str;
253
$self->debug("unmatched line: $entry");
256
$self->throw("Not a valid interleaved PHYLIP file!") if $count > $seqcount;
260
# interleaved sections
262
while( $entry = $self->_readline) {
263
# finish current entry
264
if($entry =~/\s*\d+\s+\d+/){
265
$self->_pushback($entry);
268
$count = 0, next if $entry =~ /^\s$/;
269
$entry =~ /\s*(.*)$/ && do {
273
$hash{$count} .= $str;
275
$self->throw("Not a valid interleaved PHYLIP file! [$count,$seqcount] ($entry)") if $count > $seqcount;
278
return if scalar @names < 1;
282
foreach $name ( @names ) {
284
if( $name =~ /(\S+)\/(\d+)-(\d+)/ ) {
291
$str = $hash{$count};
292
# $str =~ s/[^A-Za-z]//g;
293
#$end = length($str);
296
$self->throw("Length of sequence [$seqname] is not [$residuecount] it is ".CORE::length($hash{$count})."! ")
297
unless CORE::length($hash{$count}) == $residuecount;
299
$seq = Bio::LocatableSeq->new('-seq' => $hash{$count},
300
'-display_id' => $seqname,
302
(defined $end) ? ('-end' => $end) : (),
303
'-alphabet' => $self->alphabet,
308
return $aln if $aln->num_sequences;
198
while ($entry = $self->_readline) {
199
if ($entry =~ /^\s?$/) { # eat the newlines
203
# Names can be in a few different formats:
204
# 1. they can be traditional phylip: 10 chars long, period. If this is the case, that name can have spaces.
205
# 2. they can be hacked with a long ID, as passed in with the flag -longid.
206
# 3. if there is a long ID, the name can have spaces as long as it is wrapped in single quotes.
207
if ($self->longid()) { # 2 or 3
208
if ($entry =~ /^'(.+)'\s+(.+)$/) { # 3. name has single quotes.
211
} else { # 2. name does not have single quotes, so should not have spaces.
212
# therefore, the first part of the line is the name and the rest is the seq.
213
# make sure that the line does not lead with extra spaces.
215
($name, $str) = split (/\s+/,$entry, 2);
217
} else { # 1. traditional phylip.
218
$entry =~ /^(.{10})\s+(.+)$/;
221
$name =~ s/\s+$//; # eat any trailing spaces
225
#clean sequence of spaces:
228
# are we sequential? If so, we should keep adding to the sequence until we've got all the residues.
229
if (($self->interleaved) == 0) {
230
while (length($str) < $residuecount) {
231
$entry = $self->_readline;
234
if ($entry =~ /^\s*$/) { # we ran into a newline before we got a complete sequence: bail!
235
$self->warn("Failed to parse PHYLIP: Sequence $name was shorter than expected: " . length($str) . " instead of $residuecount.");
240
$hash{$count} = $str;
243
# if we've read as many seqs as we're supposed to, move on.
244
if ($count == $seqcount) {
249
# if we are interleaved, we're going to keep seeing chunks of sequence until we get all of it.
250
if ($self->interleaved) {
251
while (length($hash{$seqcount-1}) < $residuecount) {
253
while ($entry = $self->_readline) {
254
if ($entry =~ /^\s*$/) { # eat newlines
255
if ($count != 0) { # there was a newline at an unexpected place!
256
$self->warn("Failed to parse PHYLIP: Interleaved file is missing a segment: saw $count, expected $seqcount.");
260
} else { # start taking in chunks
262
$hash{$count} .= $entry;
265
if ($count >= $seqcount) { # we've read all of the sequences for this chunk, so move on.
271
if ((scalar @names) != $seqcount) {
272
$self->warn("Failed to parse PHYLIP: Did not see the correct number of seqs: saw " . scalar(@names) . ", expected $seqcount.");
275
for ($count=0; $count<$seqcount; $count++) {
276
$str = $hash{$count};
277
my $seqname = $names[$count];
278
if (length($str) != $residuecount) {
279
$self->warn("Failed to parse PHYLIP: Sequence $seqname was the wrong length: " . length($str) . " instead of $residuecount.");
281
$seq = Bio::LocatableSeq->new('-seq' => $hash{$count},
282
'-display_id' => $seqname);
315
290
Title : write_aln