2
# Author: Jason Stajich <jason-at-bioperl-dot-org>
7
bp_mask_by_search - mask sequence(s) based on its alignment results
11
bp_mask_by_search.pl -f blast genomefile blastfile.bls > maskedgenome.fa
15
Mask sequence based on significant alignments of another sequence.
16
You need to provide the report file and the entire sequence data which
17
you want to mask. By default this will assume you have done a TBLASTN
18
(or TFASTY) and try and mask the hit sequence assuming you've provided
19
the sequence file for the hit database. If you would like to do the
20
reverse and mask the query sequence specify the -t/--type query flag.
22
This is going to read in the whole sequence file into memory so for
23
large genomes this may fall over. I'm using DB_File to prevent
24
keeping everything in memory, one solution is to split the genome into
25
pieces (BEFORE you run the DB search though, you want to use the exact
26
file you BLASTed with as input to this program).
28
Below the double dash (--) options are of the form
29
--format=fasta or --format fasta
33
By -f/--format I mean either are acceptable options. The =s or =n
34
or =c specify these arguments expect a 'string'
37
-f/--format=s Search report format (fasta,blast,axt,hmmer,etc)
38
-sf/--sformat=s Sequence format (fasta,genbank,embl,swissprot)
39
--hardmask (booelean) Hard mask the sequence
40
with the maskchar [default is lowercase mask]
41
--maskchar=c Character to mask with [default is N], change
42
to 'X' for protein sequences
43
-e/--evalue=n Evalue cutoff for HSPs and Hits, only
44
mask sequence if alignment has specified evalue
47
--outfile=file Output file to save the masked sequence to.
48
-t/--type=s Alignment seq type you want to mask, the
49
'hit' or the 'query' sequence. [default is 'hit']
50
--minlen=n Minimum length of an HSP for it to be used
51
in masking [default 0]
52
-h/--help See this help information
54
=head1 AUTHOR - Jason Stajich
56
Jason Stajich, jason-at-bioperl-dot-org.
68
# assuming tblastn or tfasty type alignment
75
my $hardmask = 0; # mask with $maskchar instead of lowercase
76
my $maskchar = 'N'; # if we hard mask, mask with this cahr
79
'f|format:s' => \$format,
80
'sf|sformat:s'=> \$sformat,
81
'hardmask' => \$hardmask,
82
'maskchar:s' => \$maskchar,
83
'e|evalue:s' => \$evalue,
84
'o|out|outfile:s' => \$outfile,
86
'minlen:s' => \$minlen,
87
'h|help' => sub { system('perldoc', $0);
90
if( $type !~ /^(hit|query)/i ) {
91
die("type must be query or hit[default] not $type") ;
95
if(length($maskchar) > 1 ) {
96
die("expected a mask character, not a string (you gave $maskchar)");
98
my $genomefile = shift || die('need a file containing the genome');
99
my $reportfile = shift;
101
# this could be problem for large genomes, figure out a
102
# better way to do this later on
103
# or force people to split it up
104
my $genomeparser = new Bio::SeqIO(-file => $genomefile,
107
unlink('/tmp/genome.idx');
108
tie(%seqs,'DB_File','/tmp/genome.idx');
109
while( my $seq = $genomeparser->next_seq ) {
110
# should we pre-force to upper case?
111
$seqs{$seq->display_id} = $seq->seq();
114
my $parser = new Bio::SearchIO(-file => $reportfile,
117
while( my $r = $parser->next_result ) {
118
while( my $h = $r->next_hit ) {
119
last if( defined $evalue && $h->significance > $evalue );
120
my $hname = $h->name;
121
if( ! $seqs{$hname} ) {
122
die("Cannot find sequence $hname in genome seq");
124
while( my $hsp = $h->next_hsp ) {
125
last if( defined $evalue && $hsp->evalue > $evalue );
126
next if( $hsp->length('total') < $minlen);
127
my ($s,$len) = ( $hsp->$type()->start,
128
$hsp->$type()->length);
131
substr($seqs{$hname}, $s,$len, $maskchar x $len);
133
substr($seqs{$hname}, $s,$len,
134
lc(substr($seqs{$hname}, $s,$len)));
142
$out = new Bio::SeqIO(-file => ">$outfile",
143
-format => $sformat);
145
$out = new Bio::SeqIO(-format => $sformat);
148
while( my ($seqname,$seq) = each %seqs ) {
149
$out->write_seq(Bio::Seq->new(-seq => $seq,
150
-display_id => $seqname,
151
-description=> 'MASKED'));
154
unlink('/tmp/genome.idx');