1
# $Id: table.pm,v 1.4.4.1 2006/10/02 23:10:30 sendu Exp $
3
# BioPerl module for Bio::SeqIO::table
5
# Cared for by Hilmar Lapp <hlapp at gmx.net>
9
# (c) Hilmar Lapp, hlapp at gmx.net, 2005.
10
# (c) GNF, Genomics Institute of the Novartis Research Foundation, 2005.
12
# You may distribute this module under the same terms as perl itself.
13
# Refer to the Perl Artistic License (see the license accompanying this
14
# software package, or see http://www.perl.com/language/misc/Artistic.html)
15
# for the terms under which you may use, modify, and redistribute this module.
17
# THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
18
# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
19
# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
22
# POD documentation - main docs before the code
26
Bio::SeqIO::table - sequence input/output stream from a delimited table
30
#It is probably best not to use this object directly, but
31
#rather go through the SeqIO handler system. Go:
33
$stream = Bio::SeqIO->new(-file => $filename, -format => 'table');
35
while ( my $seq = $stream->next_seq() ) {
36
# do something with $seq
41
This class transforms records in a table-formatted text file into
44
A table-formatted text file of sequence records for the purposes of
45
this module is defined as a text file with each row corresponding to a
46
sequence, and the attributes of the sequence being in different
47
columns. Columns are delimited by a common delimiter, for instance tab
50
The module permits specifying which columns hold which type of
51
annotation. The semantics of certain attributes, if present, are
52
pre-defined, e.g., accession number and sequence. Additional
53
attributes may be added to the annotation bundle.
59
User feedback is an integral part of the evolution of this and other
60
Bioperl modules. Send your comments and suggestions preferably to one
61
of the Bioperl mailing lists. Your participation is much appreciated.
63
bioperl-l@bioperl.org - General discussion
64
http://bioperl.org/wiki/Mailing_lists - About the mailing lists
68
Report bugs to the Bioperl bug tracking system to help us keep track
69
the bugs and their resolution.
71
Bug reports can be submitted via email or the web:
73
http://bugzilla.open-bio.org/
75
=head1 AUTHOR - Hilmar Lapp
77
Email hlapp at gmx.net
81
The rest of the documentation details each of the object
82
methods. Internal methods are usually preceded with a _
86
# Let the code begin...
88
package Bio::SeqIO::table;
92
use Bio::Seq::SeqFactory;
93
use Bio::Annotation::Collection;
94
use Bio::Annotation::SimpleValue;
96
use base qw(Bio::SeqIO);
101
Usage : $stream = Bio::SeqIO->new(-file => $filename, -format => 'table')
102
Function: Returns a new seqstream
103
Returns : A Bio::SeqIO stream for a table format
104
Args : Named parameters:
106
-file name of file to read
107
-fh filehandle to attach to
108
-comment leading character(s) introducing a comment line
109
-header the number of header lines to skip; the first
110
non-comment header line will be used to obtain
111
column names; column names will be used as the
112
default tags for attaching annotation.
113
-delim the delimiter for columns as a regular expression;
114
consecutive occurrences of the delimiter will
116
-display_id the one-based index of the column containing
117
the display ID of the sequence
118
-accession_number the one-based index of the column
119
containing the accession number of the sequence
120
-seq the one-based index of the column containing
121
the sequence string of the sequence
122
-species the one-based index of the column containing the
123
species for the sequence record; if not a
124
number, will be used as the static species
125
common to all records
126
-annotation if provided and a scalar (but see below), a
127
flag whether or not all additional columns are
128
to be preserved as annotation, the tags used
129
will either be 'colX' if there is no column
130
header and where X is the one-based column
131
index, and otherwise the column headers will be
134
if a reference to an array, or a square
135
bracket-enclosed string of comma-delimited
136
values, only those columns (one-based index)
137
will be preserved as annotation, tags as before;
139
if a reference to a hash, or a curly
140
braces-enclosed string of comma-delimited key
141
and value pairs in alternating order, the keys
142
are one-based column indexes to be preserved,
143
and the values are the tags under which the
144
annotation is to be attached; if not provided or
145
supplied as undef, no additional annotation will
147
-colnames a reference to an array of column labels, or a
148
string of comma-delimited labels, denoting the
149
columns to be converted into annotation; this is
150
an alternative to -annotation and will be
151
ignored if -annotation is also supplied with a
153
-trim flag determining whether or not all values should
154
be trimmed of leading and trailing white space
157
Additional arguments may be used to e.g. set factories and
158
builders involved in the sequence object creation (see the
165
my($self,@args) = @_;
167
# chained initialization
168
$self->SUPER::_initialize(@args);
181
$self->_rearrange([qw(COMMENT
193
# store options and apply defaults
194
$self->comment_char(defined($cmtchars) ? $cmtchars : "#")
195
if (!defined($self->comment_char)) || defined($cmtchars);
196
$self->delimiter(defined($delim) ? $delim : "\t")
197
if (!defined($self->delimiter)) || defined($delim);
198
$self->header($header) if defined($header);
199
$self->trim_values($trim) if defined($trim);
203
$attrs->{-display_id} = $display_id if defined($display_id);
204
$attrs->{-accession_number} = $accnr if defined($accnr);
205
$attrs->{-seq} = $seq if defined($seq);
206
if (defined($taxon)) {
207
if (ref($taxon) || ($taxon =~ /^\d+$/)) {
208
# either a static object, or a column reference
209
$attrs->{-species} = $taxon;
211
# static species as a string
212
$attrs->{-species} = Bio::Species->new(
213
-classification => [reverse(split(' ',$taxon))]);
216
$self->attribute_map($attrs);
218
# annotation columns, if any
219
if ($useann && !ref($useann)) {
220
# it's a scalar; check whether this is in fact an array or
221
# hash as a string rather than just a flag
222
if ($useann =~ /^\[(.*)\]$/) {
223
$useann = [split(/[,;]/,$1)];
224
} elsif ($useann =~ /^{(.*)}$/) {
225
$useann = {split(/[,;]/,$1)};
226
} # else it is probably indeed just a flag
230
if (ref($useann) eq "ARRAY") {
231
my $has_header = ($self->header > 0);
233
foreach my $i (@$useann) {
234
$ann_map->{$i} = $has_header ? undef : "col$i";
237
# no special handling necessary
240
$self->annotation_map($ann_map);
242
$self->keep_annotation($useann || $colnames);
243
# annotation columns, if any
244
if ($colnames && !ref($colnames)) {
245
# an array as a string
246
$colnames =~ s/^\[(.*)\]$/$1/;
247
$colnames = [split(/[,;]/,$colnames)];
249
$self->annotation_columns($colnames) if ref($colnames);
252
# make sure we have a factory defined
253
if(!defined($self->sequence_factory)) {
254
$self->sequence_factory(
255
Bio::Seq::SeqFactory->new(-verbose => $self->verbose(),
256
-type => 'Bio::Seq::RichSeq'));
263
Usage : $seq = $stream->next_seq()
264
Function: returns the next sequence in the stream
265
Returns : Bio::Seq::RichSeq object
273
# skip until not a comment and not an empty line
274
my $line_ok = $self->_next_record();
276
# if there is a header but we haven't read past it yet then do so now
277
if ($line_ok && (! $self->_header_skipped) && $self->header) {
278
$line_ok = $self->_parse_header();
279
$self->_header_skipped(1);
282
# return if we reached end-of-file
283
return unless $line_ok;
285
# otherwise, parse the record
288
my @cols = $self->_get_row_values();
289
# trim leading and trailing whitespace and quotes if desired
290
if ($self->trim_values) {
291
for(my $i = 0; $i < scalar(@cols); $i++) {
293
# trim off whitespace
294
$cols[$i] =~ s/^\s+//;
295
$cols[$i] =~ s/\s+$//;
296
# trim off double quotes
303
# assign values for columns in the attribute map
304
my $attrmap = $self->_attribute_map;
306
foreach my $attr (keys %$attrmap) {
307
if ((!ref($attrmap->{$attr})) && ($attrmap->{$attr} =~ /^\d+$/)) {
308
# this is a column index, add to instantiation parameters
309
$params{$attr} = $cols[$attrmap->{$attr}];
311
# not a column index; we assume it's a static value
312
$params{$attr} = $attrmap->{$attr};
316
# add annotation columns to the annotation bundle
317
my $annmap = $self->_annotation_map;
318
if ($annmap && %$annmap) {
319
my $anncoll = Bio::Annotation::Collection->new();
320
foreach my $col (keys %$annmap) {
321
next unless $cols[$col]; # skip empty columns!
322
$anncoll->add_Annotation(
323
Bio::Annotation::SimpleValue->new(-value => $cols[$col],
324
-tagname=> $annmap->{$col}));
326
$params{'-annotation'} = $anncoll;
329
# ask the object builder to add the slots that we've gathered
330
my $builder = $self->sequence_builder();
331
$builder->add_slot_value(%params);
332
# and instantiate the object
333
my $seq = $builder->make_object();
342
Usage : $obj->comment_char($newval)
343
Function: Get/set the leading character(s) designating a line as
346
Returns : value of comment_char (a scalar)
347
Args : on set, new value (a scalar or undef, optional)
355
return $self->{'comment_char'} = shift if @_;
356
return $self->{'comment_char'};
362
Usage : $obj->header($newval)
363
Function: Get/set the number of header lines to skip before the
364
rows containing actual sequence records.
366
If set to zero or undef, means that there is no header and
367
therefore also no column headers.
370
Returns : value of header (a scalar)
371
Args : on set, new value (a scalar or undef, optional)
379
return $self->{'header'} = shift if @_;
380
return $self->{'header'};
386
Usage : $obj->delimiter($newval)
387
Function: Get/set the column delimiter. This will in fact be
388
treated as a regular expression. Consecutive occurrences
389
will not be collapsed to a single one.
392
Returns : value of delimiter (a scalar)
393
Args : on set, new value (a scalar or undef, optional)
401
return $self->{'delimiter'} = shift if @_;
402
return $self->{'delimiter'};
407
Title : attribute_map
408
Usage : $obj->attribute_map($newval)
409
Function: Get/set the map of sequence object initialization
410
attributes (keys) to one-based column index.
412
Attributes will usually need to be prefixed by a dash, just
413
as if they were passed to the new() method of the sequence
417
Returns : value of attribute_map (a reference to a hash)
418
Args : on set, new value (a reference to a hash or undef, optional)
426
# internally we store zero-based maps - so we need to convert back
430
# allow for and protect against undef
431
return delete $self->{'_attribute_map'} unless defined($arg);
432
# copy to avoid side-effects
433
my $attr_map = {%$arg};
434
foreach my $key (keys %$attr_map) {
435
if ((!ref($attr_map->{$key})) && ($attr_map->{$key} =~ /^\d+$/)) {
439
$self->{'_attribute_map'} = $attr_map;
441
# there may not be a map
442
return unless exists($self->{'_attribute_map'});
443
# we need to copy in order not to override the stored map!
444
my %attr_map = %{$self->{'_attribute_map'}};
445
foreach my $key (keys %attr_map) {
446
if ((!ref($attr_map{$key})) && ($attr_map{$key} =~ /^\d+$/)) {
453
=head2 annotation_map
455
Title : annotation_map
456
Usage : $obj->annotation_map($newval)
457
Function: Get/set the mapping between one-based column indexes
458
(keys) and annotation tags (values).
460
Note that the map returned by this method may change after
461
the first next_seq() call if the file contains a column
462
header and no annotation keys have been predefined in the
463
map, because upon reading the column header line the tag
464
names will be set automatically.
466
Note also that the map may reference columns that are used
467
as well in the sequence attribute map.
470
Returns : value of annotation_map (a reference to a hash)
471
Args : on set, new value (a reference to a hash or undef, optional)
479
# internally we store zero-based maps - so we need to convert back
483
# allow for and protect against undef
484
return delete $self->{'_annotation_map'} unless defined($arg);
485
# copy to avoid side-effects
486
my $ann_map = {%$arg};
487
# make sure we sort the keys numerically or otherwise we may
488
# clobber a key with a higher index
489
foreach my $key (sort { $a <=> $b } keys(%$ann_map)) {
490
$ann_map->{$key-1} = $ann_map->{$key};
491
delete $ann_map->{$key};
493
$self->{'_annotation_map'} = $ann_map;
494
# also make a note that we want to keep annotation
495
$self->keep_annotation(1);
497
# there may not be a map
498
return unless exists($self->{'_annotation_map'});
499
# we need to copy in order not to override the stored map!
500
my %ann_map = %{$self->{'_annotation_map'}};
501
# here we need to sort numerically in reverse order ...
502
foreach my $key (sort { $b <=> $a } keys(%ann_map)) {
503
$ann_map{$key+1} = $ann_map{$key};
504
delete $ann_map{$key};
509
=head2 keep_annotation
511
Title : keep_annotation
512
Usage : $obj->keep_annotation($newval)
513
Function: Get/set flag whether or not to keep values from
514
additional columns as annotation.
516
Additional columns are all those columns in the input file
517
that aren't referenced in the attribute map.
520
Returns : value of keep_annotation (a scalar)
521
Args : on set, new value (a scalar or undef, optional)
529
return $self->{'keep_annotation'} = shift if @_;
530
return $self->{'keep_annotation'};
533
=head2 annotation_columns
535
Title : annotation_columns
536
Usage : $obj->annotation_columns($newval)
537
Function: Get/set the names (labels) of the columns to be used for
540
This is an alternative to using annotation_map. In order to
541
have any effect, it must be set before the first call of
542
next_seq(), and obviously there must be a header line (or
543
row) too giving the column labels.
546
Returns : value of annotation_columns (a reference to an array)
547
Args : on set, new value (a reference to an array of undef, optional)
552
sub annotation_columns{
555
return $self->{'annotation_columns'} = shift if @_;
556
return $self->{'annotation_columns'};
562
Usage : $obj->trim_values($newval)
563
Function: Get/set whether or not to trim leading and trailing
564
whitespace off all column values.
566
Returns : value of trim_values (a scalar)
567
Args : on set, new value (a scalar or undef, optional)
575
return $self->{'trim_values'} = shift if @_;
576
return $self->{'trim_values'};
579
=head1 Internal methods
581
All methods with a leading underscore are not meant to be part of the
582
'official' API. They are for use by this module only, consider them
583
private unless you are a developer trying to modify this module.
587
=head2 _attribute_map
589
Title : _attribute_map
590
Usage : $obj->_attribute_map($newval)
591
Function: Get only. Same as attribute_map, but zero-based indexes.
593
Note that any changes made to the returned map will change
594
the map used by this instance. You should know what you are
595
doing if you modify the returned value (or if you call this
596
method in the first place).
599
Returns : value of _attribute_map (a reference to a hash)
608
return $self->{'_attribute_map'};
611
=head2 _annotation_map
613
Title : _annotation_map
614
Usage : $obj->_annotation_map($newval)
615
Function: Get only. Same as annotation_map, but with zero-based indexes.
617
Note that any changes made to the returned map will change
618
the map used by this instance. You should know what you are
619
doing if you modify the returned value (or if you call this
620
method in the first place).
623
Returns : value of _annotation_map (a reference to a hash)
632
return $self->{'_annotation_map'};
635
=head2 _header_skipped
637
Title : _header_skipped
638
Usage : $obj->_header_skipped($newval)
639
Function: Get/set the flag whether the header was already
640
read (and skipped) or not.
642
Returns : value of _header_skipped (a scalar)
643
Args : on set, new value (a scalar or undef, optional)
651
return $self->{'_header_skipped'} = shift if @_;
652
return $self->{'_header_skipped'};
659
Function: Navigates the underlying file to the next record.
661
For row-based records in delimited text files, this will
662
skip all empty lines and lines with a leading comment
665
This method is here is to serve as a hook for other formats
666
that conceptually also represent tables but aren't
667
formatted as row-based text files.
670
Returns : TRUE if the navigation was successful and FALSE
671
otherwise. Unsuccessful navigation will usually be treated
672
as an end-of-file condition.
681
my $cmtcc = $self->comment_char;
682
my $line = $self->_readline();
684
# skip until not a comment and not an empty line
685
while (defined($line)
686
&& (($cmtcc && ($line =~ /^\s*$cmtcc/))
687
|| ($line =~ /^\s*$/))) {
688
$line = $self->_readline();
691
return $self->{'_line'} = $line;
696
Title : _parse_header
698
Function: Parse the table header and navigate past it.
700
This method is called if the number of header rows has been
701
specified equal to or greater than one, and positioned at
702
the first header line (row). By default the first header
703
line (row) is used for setting column names, but additional
704
lines (rows) may be skipped too. Empty lines and comment
705
lines do not count as header lines (rows).
707
This method will call _next_record() to navigate to the
708
next header line (row), if there is more than one header
709
line (row). Upon return, the file is presumed to be
710
positioned at the first record after the header.
712
This method is here is to serve as a hook for other formats
713
that conceptually also represent tables but aren't
714
formatted as row-based text files.
716
Note however that the only methods used to access file
717
content or navigate the position are _get_row_values() and
718
_next_record(), so it should usually suffice to override
722
Returns : TRUE if navigation past the header was successful and FALSE
723
otherwise. Unsuccessful navigation will usually be treated
724
as an end-of-file condition.
733
# the first header line contains the column headers, see whether
735
if ($self->keep_annotation) {
736
my @colnames = $self->_get_row_values();
737
# trim leading and trailing whitespace if desired
738
if ($self->trim_values) {
739
# trim off whitespace
740
@colnames = map { $_ =~ s/^\s+//; $_ =~ s/\s+$//; $_; } @colnames;
741
# trim off double quotes
742
@colnames = map { $_ =~ s/^"//; $_ =~ s/"$//; $_; } @colnames;
744
# build or complete annotation column map
745
my $annmap = $self->annotation_map || {};
747
# check whether columns have been defined by name rather than index
748
if (my $anncols = $self->annotation_columns) {
749
# first sanity check: all column names must map
750
my %colmap = map { ($_,1); } @colnames;
751
foreach my $col (@$anncols) {
752
if (!exists($colmap{$col})) {
753
$self->throw("no such column labeled '$col'");
756
# now map to the column indexes
757
%colmap = map { ($_,1); } @$anncols;
758
for (my $i = 0; $i < scalar(@colnames); $i++) {
759
if (exists($colmap{$colnames[$i]})) {
760
$annmap->{$i+1} = $colnames[$i];
764
# no columns specified, default to all non-attribute columns
765
for (my $i = 0; $i < scalar(@colnames); $i++) {
766
$annmap->{$i+1} = $colnames[$i];
768
# subtract all attribute-referenced columns
769
foreach my $attrcol (values %{$self->attribute_map}) {
770
if ((!ref($attrcol)) && ($attrcol =~ /^\d+$/)) {
771
delete $annmap->{$attrcol};
776
# fill in where the tag names weren't pre-defined
777
for (my $i = 0; $i < scalar(@colnames); $i++) {
778
if (exists($annmap->{$i+1}) && ! defined($annmap->{$i+1})) {
779
$annmap->{$i+1} = $colnames[$i];
783
$self->annotation_map($annmap);
786
# now read past the header
787
my $header_lines = $self->header;
789
while (defined($line_ok) && ($header_lines > 0)) {
790
$line_ok = $self->_next_record();
797
=head2 _get_row_values
799
Title : _get_row_values
801
Function: Get the values for the current line (or row) as an array in
802
the order of columns.
804
This method is here is to serve as a hook for other formats
805
that conceptually also represent tables but aren't
806
formatted as row-based text files.
809
Returns : An array of column values for the current row.
817
my $delim = $self->delimiter;
818
my $line = $self->{'_line'};
820
my @cols = split(/$delim/,$line);