3
# PROGRAM : clustalw.pl
4
# PURPOSE : Demonstrate possible uses of Bio::Tools::Run::Alignment::Clustalw.pm
5
# AUTHOR : Peter Schattner schattner@alum.mit.edu
6
# CREATED : Oct 06 2000
7
# REVISION : $Id: clustalw.pl,v 1.5 2001/06/02 21:17:20 peter Exp $
11
# You will need to have installed clustalw and to ensure that Clustalw.pm can find it.
12
# This can be done in (at least) two ways:
13
# export PATH=$PATH:/home/peter/clustalw1.8 or
14
# 1. define an environmental variable CLUSTALDIR:
15
# export CLUSTALDIR=/home/peter/clustalw1.8 or
16
# 2. include a definition of an environmental variable CLUSTALDIR in every script that will
18
# BEGIN {$ENV{CLUSTALDIR} = '/home/peter/clustalw1.8/'; }
22
# We are going to demonstrate 3 possible applications of Clustalw.pm:
23
# 1. Test effect of varying clustalw alignment parameter(s) on resulting alignment
24
# 2. Test effect of changing the order that sequences are added to the alignment
25
# on the resulting alignment
26
# 3. Test effect of incorpotating an "anchor point" in the alignment process
29
# Before we can do any tests, we need to set up the environment, create the factory
30
# and read in the (unaligned) sequences.
32
# Modify (and un-comment) the following line as required to point to clustalw program directory on your system
34
# $ENV{CLUSTALDIR} = '/home/peter/clustalw1.8/';
38
use Bio::Tools::Run::Alignment::Clustalw;
44
# set some default values
45
my $infile = 't/cysprot1a.fa';
46
my @params = ('quiet' => 1 );
47
my $do_only = '123'; # string listing examples to be executed. Default is to execute
48
# all tests (ie 1,2 and 3)
49
my $param = 'ktuple'; # parameter to be varied in example 1
50
my $startvalue = 1; # initial value for parameter $param
51
my $stopvalue = 3; # final value for parameter $param
52
my $regex = 'W[AT]F'; # regular expression for 'anchoring' alignment in example 3
53
my $extension = 30; # distance regexp anchor should be extended in each direction
54
# for local alignment in example 3
55
my $helpflag = 0; # Flag to show usage info.
58
my @argv = @ARGV; # copy ARGV before GetOptions() massacres it.
60
&GetOptions("h!" => \$helpflag, "help!" => \$helpflag,
64
"start=i" => \$startvalue,
65
"stop=i" => \$stopvalue,
66
"ext=i" => \$extension,
67
"regex=s" => \$regex,) ;
69
if ($helpflag) { &clustalw_usage(); exit 0;}
71
# create factory & set user-specified global clustalw parameters
72
foreach my $argv (@argv) {
73
unless ($argv =~ /^(.*)=>(.*)$/) { next;}
74
push (@params, $1 => $2);
76
my $factory = Bio::Tools::Run::Alignment::Clustalw->new(@params);
79
# put unaligned sequences in a Bio::Seq array
80
my $str = Bio::SeqIO->new(-file=> $infile, '-format' => 'Fasta');
81
my ($paramvalue, $aln, $subaln, @consensus, $seq_num, $string, $strout, $id);
83
while ( my $seq = $str->next_seq() ) { push (@seq_array, $seq) ;}
85
# Do each example that has digit present in variable $do_only
87
/1/ && &vary_params();
88
/2/ && &vary_align_order();
89
/3/ && &anchored_align();
93
#################################################
94
# vary_params(): Example demonstrating varying of clustalw parameter
99
print "Beginning parameter-varying example... \n";
101
# Now we'll create several alignments, 1 for each value of the selected parameter.
102
# We also compute a simple consensus string for each alignment.
103
# (In the default case, we vary the "ktuple" parameter, creating 3 alignments,
104
# using ktuple values from 1 to 3.)
107
for ($paramvalue = $startvalue; $paramvalue < ($stopvalue + 1); $paramvalue++) {
108
$factory->$param($paramvalue); # set parameter value
109
print "Performing alignment with $param = $paramvalue \n";
110
$aln = $factory->align(\@seq_array);
111
$string = $aln->consensus_string(); # Get consensus of alignment
112
# convert '?' to 'X' at non-consensus positions
114
$consensus[$index] = Bio::Seq->new(-id=>"$param=$paramvalue",-seq=>$string);
117
# Compare consensus strings for alignments with different $param values by
118
# making an alignment of the different consensus strings
119
# $factory->ktuple(1); # set ktuple parameter
120
print "Performing alignment of $param consensus sequences \n";
121
$aln = $factory->align(\@consensus);
122
$strout = Bio::AlignIO->newFh('-format' => 'msf');
129
#################################################
130
# vary_align_order():
132
# For our second example, we'll test the effect of changing the order
133
# that sequences are added to the alignment
135
sub vary_align_order {
137
print "\nBeginning alignment-order-changing example... \n";
139
@consensus = (); # clear array
140
for ($seq_num = 0; $seq_num < scalar(@seq_array); $seq_num++) {
141
my $obj_out = shift @seq_array; # remove one Seq object from array and save
142
$id = $obj_out->id();
143
# align remaining sequences
144
print "Performing alignment with sequence $id left out \n";
145
$subaln = $factory->align(\@seq_array);
146
# add left-out sequence to subalignment
147
$aln = $factory->profile_align($subaln,$obj_out);
148
$string = $aln->consensus_string(); # Get consensus of alignment
149
# convert '?' to 'X' for non-consensus positions
151
$consensus[$seq_num] = Bio::Seq->new(-id=>"$id left out",-seq=>$string);
152
push @seq_array, $obj_out; # return Seq object for next (sub) alignment
155
# Compare consensus strings for alignments created in different orders
156
# $factory->ktuple(1); # set ktuple parameter
157
print "\nPerforming alignment of consensus sequences for different reorderings \n";
158
print "Each consensus is labeled by the sequence which was omitted in the initial alignment \n";
159
$aln = $factory->align(\@consensus);
160
$strout = Bio::AlignIO->newFh('-format' => 'msf');
167
#################################################
170
# For our last example, we'll test a way to perform a local alignment by
171
# "anchoring" the alignment to a regular expression. This is similar
172
# to the approach taken in the recent dbclustal program.
173
# In principle, we could write a script to search for a good regular expression
174
# to use. Instead, here we'll simply choose one manually after looking at the previous
179
my @local_array = ();
180
my @seqs_not_matched = ();
182
print "\n Beginning anchored-alignment example... \n";
185
for ($seq_num = 0; $seq_num < scalar(@seq_array); $seq_num++) {
186
my $seqobj = $seq_array[$seq_num];
187
my $seq = $seqobj->seq();
188
my $id = $seqobj->id();
189
# if $regex is not found in the sequence, save sequence id name and set array value =0
191
unless ($seq =~/$regex/) {
192
$local_array[$seq_num] = 0;
193
push (@seqs_not_matched, $id) ;
195
# find positions of start and of subsequence to be aligned
196
my $match_start_pos = length($`);
197
my $match_stop_pos = length($`) + length($&);
198
my $start = ($match_start_pos - $extension) > 1 ? ($match_start_pos - $extension) +1 : 1;
199
my $stop = ($match_stop_pos + $extension) < length($seq) ?
200
($match_stop_pos + $extension) : length($seq);
201
my $string = $seqobj->subseq($start, $stop);
203
$local_array[$seq_num] = Bio::Seq->new(-id=>$id, -seq=>$string);
205
@local_array = grep $_ , @local_array; # remove array entries with no match
207
# Perform alignment on the local segments of the sequences which match "anchor"
208
$aln = $factory->align(\@local_array);
209
my $consensus = $aln->consensus_string(); # Get consensus of local alignment
211
if (scalar(@seqs_not_matched) ) {
212
print " Sequences not matching $regex : @seqs_not_matched \n"
214
print " All sequences match $regex : @seqs_not_matched \n"
216
print "Consensus sequence of local alignment: $consensus \n";
227
#-----------------------
228
# Prints usage information for general parameters.
230
print STDERR <<"QQ_PARAMS_QQ";
232
Command-line accessible script variables and commands:
233
-------------------------------
234
-h : Display this usage info and exit.
235
-in <str> : File containing input sequences in fasta format (default = $infile) .
236
-do <str> : String listing examples to be executed. Default is to execute
237
all tests (ie default = '123')
238
-param <str> : Parameter to be varied in example 1. Any clustalw parameter
239
which takes inteer values can be varied (default = 'ktuple')
240
-start <int> : Initial value for varying parameter in example 1 (default = 1)
241
-stop <int> : Final value for varying parameter (default = 3)
242
-regex <str> : Regular expression for 'anchoring' alignment in example 3 (default = $regex)
243
-ext <int> : Distance regexp anchor should be extended in each direction
244
for local alignment in example 3 (default = 30)
246
In addition, any valid Clustalw parameter can be set using the syntax "parameter=>value" as
249
So a typical command lines might be:
250
> clustalw.pl -param=pairgap -start=2 -stop=3 -do=1 "ktuple=>3"
252
> clustalw.pl -ext=10 -regex='W[AST]F' -do=23 -in='t/cysprot1a.fa'