2
#####################################################
9
# (a.vincent.0312@gmail.com)
11
# FastAlign is a perl script which uses the heuristic method
12
# of tfasty36 for find similarity between a query sequence
13
# in amino acids and a sequence in nucleotides. It provides
14
# a more intuitive output to find exon-intron junctions.
15
# The query string is in amino acids and the hit string is
16
# in nucleotides. There are extra nucleotides at the end of
17
# the hit string (option -diff and by default = 10), that
18
# allow to verify if the intron start with common rules
19
# (5'-GTGCGA-... for group II intron and after an exonic T
20
# for group I intron).
22
# The FASTA version can be changed by the user by changing
23
# the line with tfasty36 for tfastyXX.
25
# If you have Emboss, you can genarate a graphic with option
28
# For complete help: type perl fastalign.pl -help
29
# Last Update: 01/06/13
30
#######################################################
35
Copyright (C) 2013 Antony Vincent
39
This program is free software: you can redistribute it and/or modify
40
it under the terms of the GNU General Public License as published by
41
the Free Software Foundation, either version 3 of the License, or
42
(at your option) any later version.
44
This program is distributed in the hope that it will be useful,
45
but WITHOUT ANY WARRANTY; without even the implied warranty of
46
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
47
GNU General Public License for more details.
49
You should have received a copy of the GNU General Public License
50
along with this program. If not, see <http://www.gnu.org/licenses/>.
61
## Set the default variables
74
'graph=s' => \$graphic,
81
or die "Incorrect usage! Try perl fastalign.pl -help for an exhaustif help.\n";
86
print "Two options are required:\n";
87
print " -seq: Your sequence in amino acids\n";
88
print " -db: The sequence in nucleotides (Could be a whole genome or a partial sequence...)\n";
90
print "There are few miscellaneous options:\n";
91
print " -i: Minimum identity percentage (default = 75)\n";
92
print " -l: Minimum match length (default = 50)\n";
93
print " -diff: Difference between the hit string and the alignement (default = 10)\n";
94
print " -out: Name of the output file (default = output.txt)\n";
95
print " -graph: If this option = 1, a graph will be generated (default = no)\n";
101
open (WRITE, ">>$out.txt"); ## Open the output file
102
print WRITE " Fasta\n";
104
print WRITE " Align\n\n";
105
print WRITE "Date:", $date, "\n";
106
print WRITE "PARAMETERS\n";
107
print WRITE "Minimum identity =", $identity, "\n";
108
print WRITE "Minimum length =", $length, "\n";
109
print WRITE "Diff =", $diff, "\n\n";
113
open (WRITE, ">>$out.txt"); ## Open the output file
114
open (WRITE2, ">>lindna.lnp"); ## Open the lindna config file
116
## start the lindna header
117
print WRITE2 "start";
123
my $seqio_obj = Bio::SeqIO->new(-file => "$library", -format => "fasta" );
124
my $seq_obj = $seqio_obj->next_seq;
125
my $count_obj = $seq_obj->length;
126
print WRITE2 "$count_obj";
128
print WRITE2 "group";
133
print "No graphic generated \n";
137
my $fasta = "tfasty36"; # <------ You can change this line for newest fasta algorithm
139
my $command = "$fasta $query $library";
141
open $fh,"$command |" || die("cannot run fasta cmd of $command: $!\n");
143
my $searchio = Bio::SearchIO->new(-format => 'fasta', -fh => $fh);
144
print WRITE "Fasta algorithm:", $fasta, "\n\n";
145
## start the parsing part of the script
147
while( my $result = $searchio->next_result ) {
148
## $result is a Bio::Search::Result::ResultI compliant object
149
while( my $hit = $result->next_hit ) {
150
## $hit is a Bio::Search::Hit::HitI compliant object
151
while( my $hsp = $hit->next_hsp ) {
152
## $hsp is a Bio::Search::HSP::HSPI compliant object
153
if( $hsp->length('total') > $length ) {
154
if ( $hsp->percent_identity >= $identity ) {
156
## Generals informations
157
print WRITE "Rank=", $hsp->rank, "\n",
158
"Query=", $result->query_name, "\n",
159
"Hit=", $hit->name, "\n" ,
160
"Length=", $hsp->length('total'), "\n",
161
"Percent_id=", $hsp->percent_identity, "\n",
162
"Strand=", $hsp->strand('hit'), "\n";
170
## Search for nucleotide sequences
172
my $start_hit = $hsp->start('hit'), "\n";
173
my $end_hit = $hsp->end('hit') , "\n";
174
my $in = Bio::SeqIO->new(-file => "$library" , '-format' => 'fasta');
176
while ( my $seq = $in->next_seq() ) {#1
178
## looking for query position
179
my $start_query = $hsp->start('query'), "\n";
180
my $end_query = $hsp->end('query') , "\n";
182
my $seqobj = Bio::PrimarySeq->new ( -seq => $hsp->query_string);
183
my $polypeptide_3char = Bio::SeqUtils->seq3($seqobj);
185
## modify the homology string
186
my $homo = $hsp->homology_string;
194
print WRITE "Query($start_query,$end_query)\n";
195
print WRITE "Hit($start_hit,$end_hit)\n\n";
196
print WRITE $polypeptide_3char, "\n";
197
print WRITE $homo, "\n";
198
print WRITE $seq->subseq($start_hit,$end_hit+$diff), "\n";
203
## write in lindna file
204
print WRITE2 "label", "\n", "Block", "\t", "$start_hit", "\t",
205
"$end_hit", "\t", "3", "\t", "H", "\n";
206
print WRITE2 "Exon", $hsp->rank, "\n";
207
print WRITE2 "endlabel";
211
{print "No lindna file generated\n";}
222
print WRITE2 "endgroup";
223
system ("lindna -infile lindna.lnp -ruler y -blocktype filled -graphout svg");