1
#!/usr/bin/perl -- # -*-Perl-*-
3
# $Id: flanks.pl,v 1.1.2.1 2002/03/10 21:38:29 heikki Exp $
5
# Heikki Lehvaslaiho <heikki@ebi.ac.uk>
6
# Finding flanking sequences for a variant.
8
# Requires BioPerl > 0.7
11
# v. 1.1 9 Aug 2001 interface change, more info in fasta header
12
# v. 2.0 23 Nov 2001 new code from the flanks CGI program
13
# support for EMBL-like positions
22
use constant VERSION => '2.0';
24
my $flank; # Will default to 100
25
my ($file, $pos, $strand);
26
my $in_format = 'EMBL'; # format of the local file on disk
27
my $out_format = 'FASTA'; # output format, going into STDOUT
29
if (@ARGV != 2 and @ARGV != 3 and @ARGV != 4) {
30
print "Usage: flanks (accession or filename) position [length]\n";
38
$flank ||= 100; # defaults to 100
40
&extract($file, $pos, $flank);
47
my ($file, $ids, $flank) = @_;
49
my $IN_FORMAT = 'EMBL'; # format of the local file on disk
50
my $OUT_FORMAT = 'FASTA'; # output format, going into STDOUT
51
my $strand = 1; # default for the forward strand
54
if (-e $file ) { # local file
55
my $in = Bio::SeqIO->new('-file' => $file,
56
'-format' => $IN_FORMAT);
57
$seq = $in->next_seq();
59
elsif ($file =~ /\./) { # sequence version from GenBank
61
my $gb = new Bio::DB::GenBank;
62
$seq = $gb->get_Seq_by_version($file);
64
} else { # plain accession mumber from more reliable EMBL
66
my $gb = new Bio::DB::EMBL;
67
$seq = $gb->get_Seq_by_acc($file);
71
print STDERR "Could not find sequence [$file]" && return unless $seq;
73
my $out = Bio::SeqIO->new('-format' => $OUT_FORMAT);
76
foreach my $idpos (split /\s+/, $ids) {
78
my ($id, $pos_range, $start, $end, $allele_len);
79
my $inbetween = 0; # handle 23^24 notation like plain integer (24)
80
# but set flag and make corrections when needed
82
if ($idpos =~ /:/ ) { # id and position separator
83
($id, $pos_range) = split /:/, $idpos;
89
$strand = -1 if $pos_range =~ /-$/; # opposite strand
90
$pos_range = $1 if $pos_range =~ /(.+)-/; # remove trailing '-'
92
if ($pos_range =~ /\^/) { # notation 23^24 used
93
($start, $end) = split /\^/, $pos_range;
94
print STDERR $id, ": Give adjacent nucleotides flanking '^' character, not [",
95
$start, "] and [", $end, "]\n" and next
96
unless $end == $start + 1;
99
} else { # notation 23..24 used
100
($start, $end) = split /\.\./, $pos_range;
102
$end ||= $start; # notation 23 used
103
print STDERR $id, ": Start can not be larger than end. Not [",
104
$start, "] and [", $end, "]\n" and next
106
$allele_len = $end - $start;
109
next unless defined $start && $start =~ /\d+/ && $start != 0;
110
print STDERR "Position '$start' not in sequence '$file'\n", and next
111
if $start < 1 or $start > $seq->length;
112
print STDERR "End position '$end' not in sequence '$file'\n", and next
113
if $end < 1 or $end > $seq->length;
115
# determine nucleotide positions
117
my $five_start = $start - $flank;
118
$five_start = 1 if $five_start < 1; # not outside the sequence
120
my $three_end = $start + $allele_len + $flank;
121
$three_end = $seq->length if $start + $allele_len + $flank > $seq->length;
122
$three_end-- if $inbetween;
124
# extract the sequnces
125
my $five_prime = lc $seq->subseq($five_start , $start - 1); # left flank
126
my $snp = uc $seq->subseq($start, $end); # allele (always > 0 length)
127
$snp = lc $snp if $inbetween;
129
my $three_prime; # right flank
130
if ($end < $seq->length) { # make sure we are not beyond reference sequece
131
$three_prime = lc $seq->subseq($end + 1, $three_end);
136
# allele positions in local, extracted coordinates
137
my $locpos = length($five_prime) + 1;
140
$loc_end = "..". ($locpos+$allele_len);
143
$loc_end = '^'. ($locpos+1) if $inbetween;
145
# build FASTA id and description line
146
my $fastaid = uc($id). "_". uc($file).
147
" oripos=$pos_range strand=$strand allelepos=$locpos$loc_end";
149
#build BioPerl sequence objects
151
my $five_prime_seq = new Bio::PrimarySeq(-alphabet=>'dna',-seq=>$five_prime);
152
my $snp_seq = new Bio::PrimarySeq(-alphabet=>'dna',-seq=>$snp);
153
my $three_prime_seq = new Bio::PrimarySeq(-alphabet=>'dna',-seq=>$three_prime);
155
my $str = $three_prime_seq->revcom->seq. " ".
156
$snp_seq->revcom->seq. " ". $five_prime_seq->revcom->seq;
158
$out_seq = new Bio::PrimarySeq (-id => $fastaid,
162
my $str = $five_prime. " ". $snp. " ". $three_prime;
164
$out_seq = new Bio::PrimarySeq (-id => $fastaid,
168
$out->write_seq($out_seq); # print sequence out