583
583
Bioperl modules. Send your comments and suggestions preferably to the
584
584
Bioperl mailing lists Your participation is much appreciated.
586
bioperl-l@bioperl.org - General discussion
587
http://bio.perl.org/MailList.html - About the mailing lists
586
bioperl-l@bioperl.org - General discussion
587
http://bioperl.org/wiki/Mailing_lists - About the mailing lists
589
589
=head2 Reporting Bugs
591
591
report bugs to the Bioperl bug tracking system to help us keep track
592
the bugs and their resolution. Bug reports can be submitted via
592
the bugs and their resolution. Bug reports can be submitted via the
595
bioperl-bugs@bio.perl.org
596
http://bugzilla.bioperl.org/
595
http://bugzilla.open-bio.org/
598
597
=head1 AUTHOR - Chris Mungall
610
609
# Let the code begin...
612
611
package Bio::SeqFeature::Tools::Unflattener;
616
614
# Object preamble - inherits from Bio::Root::Root
618
615
use Bio::Location::Simple;
619
616
use Bio::SeqFeature::Generic;
623
@ISA = qw(Bio::Root::Root);
620
use base qw(Bio::Root::Root);
666
663
my @probs = $self->get_problems;
667
664
if (!$self->{_problems_reported} &&
668
665
scalar(@probs)) {
670
"WARNING: There are UNREPORTED PROBLEMS.\n".
667
"WARNING: There are UNREPORTED PROBLEMS.\n".
671
668
"You may wish to use the method report_problems(), \n",
672
"or ignore_problems() on the Unflattener object\n";
669
"or ignore_problems() on the Unflattener object\n");
1143
1144
# use magic to guess the group tag
1144
1145
my @sfs_with_locus_tag =
1145
1146
grep {$_->has_tag("locus_tag")} @flat_seq_features;
1147
my @sfs_with_gene_tag =
1148
grep {$_->has_tag("gene")} @flat_seq_features;
1149
my @sfs_with_product_tag =
1150
grep {$_->has_tag("product")} @flat_seq_features;
1146
1151
if (@sfs_with_locus_tag) {
1147
1152
if ($group_tag && $group_tag ne 'locus_tag') {
1148
1153
$self->throw("You have explicitly set group_tag to be '$group_tag'\n".
1151
1156
"You can resolve this by either NOT setting -group_tag\n".
1152
1157
"OR you can unset -use_magic to regain control");
1154
# I know what's best for you.
1155
# You want to be using /locus_tag=foo here
1160
# use /locus_tag instead of /gene tag for grouping
1157
1161
# see GenBank entry AE003677 (version 3) for an example
1158
1162
$group_tag = 'locus_tag';
1159
if ($self->verbose) {
1160
print "Set group tag to: $group_tag\n";
1163
if ($self->verbose > 0) {
1164
warn "Set group tag to: $group_tag\n";
1168
# on rare occasions, records will have no /gene or /locus_tag
1169
# but it WILL have /product tags. These serve the same purpose
1170
# for grouping. For an example, see AY763288 (also in t/data)
1171
if (@sfs_with_locus_tag==0 &&
1172
@sfs_with_gene_tag==0 &&
1173
@sfs_with_product_tag>0 &&
1175
$group_tag = 'product';
1176
if ($self->verbose > 0) {
1177
warn "Set group tag to: $group_tag\n";
1164
1182
if (!$group_tag) {
1278
# PSEUDOGENES, PSEUDOEXONS AND PSEUDOINTRONS
1279
# these are indicated with the /pseudo tag
1280
# these are mapped to a different type; they should NOT
1281
# be treated as normal genes
1282
foreach my $sf (@all_seq_features) {
1283
if ($sf->has_tag('pseudo')) {
1284
my $type = $sf->primary_tag;
1285
# SO type is typically the same as the normal
1286
# type but preceeded by "pseudo"
1287
if ($type eq 'misc_RNA') {
1288
$sf->primary_tag("pseudotranscript");
1291
$sf->primary_tag("pseudo$type");
1295
# now some of the post-processing that follows which applies to
1296
# genes will NOT be applied to pseudogenes; this is deliberate
1297
# for example, gene models are normalised to be gene-transcript-exon
1298
# for pseudogenes we leave them as pseudogene-pseudoexon
1260
1300
# --- MAGIC ---
1261
1301
my $need_to_infer_exons = 0;
1262
1302
my $need_to_infer_mRNAs = 0;
1276
1316
$_->has_tag($group_tag)} @flat_seq_features);
1278
1318
scalar(grep {$_->primary_tag eq 'CDS'} @flat_seq_features);
1320
scalar(grep {$_->primary_tag =~ /RNA/} @flat_seq_features);
1280
1321
# Are there any CDS features in the record?
1284
1325
# - a pc gene model should contain at the least a CDS
1286
1327
# Are there any mRNA features in the record?
1328
if ($n_mrnas == 0) {
1288
1330
# looks like structure_type == 1
1289
1331
$structure_type = 1;
1290
1332
$need_to_infer_mRNAs = 1;
1292
elsif (!$n_mrnas_attached_to_gene) {
1293
# The files _does_ have at least one mRNA feature,
1294
# but none of them are part of a group, i.e. they
1334
elsif ($n_mrnas_attached_to_gene == 0) {
1336
# $n_mrnas_attached_to_gene = 0
1338
# The entries _do_ contain mRNA features,
1339
# but none of them are part of a group/gene, i.e. they
1295
1340
# are 'floating'
1297
1342
# this is an annoying weird file that has some floating
1298
1343
# mRNA features;
1299
1344
# eg ftp.ncbi.nih.gov/genomes/Schizosaccharomyces_pombe/
1346
if ($self->verbose) {
1347
my @floating_mrnas =
1348
grep {$_->primary_tag eq 'mRNA' &&
1349
!$_->has_tag($group_tag)} @flat_seq_features;
1350
printf STDERR "Unattached mRNAs:\n";
1351
foreach my $mrna (@floating_mrnas) {
1352
$self->_write_sf_detail($mrna);
1354
printf STDERR "Don't know how to deal with these; filter at source?\n";
1301
1357
foreach (@flat_seq_features) {
1302
1358
if ($_->primary_tag eq 'mRNA') {
1352
if ($self->verbose) {
1411
if ($self->verbose > 0) {
1412
printf STDERR "GROUPS:\n";
1354
1413
foreach my $group (@groups) {
1355
1414
$self->_write_group($group, $group_tag);
1360
# >>>>>>>>> <<<<<<<<<<<<<
1361
1419
# --------- FINISHED GROUPING -------------
1362
# >>>>>>>>> <<<<<<<<<<<<<
1422
# TYPE CONTAINMENT HIERARCHY (aka partonomy)
1364
1423
# set the containment hierarchy if desired
1365
1424
# see docs for structure_type() method
1366
1425
if ($structure_type) {
1367
1426
if ($structure_type == 1) {
1368
1427
$self->partonomy(
1376
1435
$self->throw("structure_type $structure_type is currently unknown");
1435
1492
$seq->remove_SeqFeatures;
1436
1493
$seq->add_SeqFeature(@top_sfs);
1438
# >>>>>>>>> <<<<<<<<<<<<<
1439
1495
# --------- FINISHED UNFLATTENING -------------
1440
# >>>>>>>>> <<<<<<<<<<<<<
1442
1497
# lets see if there are any post-unflattening tasks we need to do
1444
1501
# INFERRING mRNAs
1445
1502
if ($need_to_infer_mRNAs) {
1446
if ($self->verbose) {
1447
print "** INFERRING mRNA from CDS\n";
1503
if ($self->verbose > 0) {
1504
printf STDERR "** INFERRING mRNA from CDS\n";
1449
1506
$self->infer_mRNA_from_CDS(-seq=>$seq);
1541
1600
my $thresh = $self->error_threshold;
1542
1601
my @bad_problems = grep {$_->[0] > $thresh} @problems;
1543
1602
if (@bad_problems) {
1544
print STDERR "PROBLEM:\n";
1603
printf STDERR "PROBLEM:\n";
1545
1604
$self->_write_hier(\@top_sfs);
1546
1605
# TODO - allow more fine grained control over this
1547
1606
$self->{_problems_reported} = 1;
1993
2052
if (%unresolved) {
1994
2053
my %childh = map {$_=>1} keys %unresolved;
1995
2054
my %parenth = map {$_->[0]=>1} map {@$_} values %unresolved;
1996
if ($self->verbose) {
1997
printf "MATCHING %d CHILDREN TO %d PARENTS\n",
2055
if ($self->verbose > 0) {
2056
printf STDERR "MATCHING %d CHILDREN TO %d PARENTS\n",
1998
2057
scalar(keys %childh), scalar(keys %parenth);
2000
2059
# 99.99% of the time in genbank genomic record of structure type 0, we
2018
# The graph %container
2020
2076
# DEBUGGING CODE
2021
if ($self->verbose && scalar(keys %unresolved)) {
2022
print "UNRESOLVED PAIRS:\n";
2077
if ($self->verbose > 0 && scalar(keys %unresolved)) {
2078
printf STDERR "UNRESOLVED PAIRS:\n";
2023
2079
foreach my $childsf (keys %unresolved) {
2024
2080
my @poss = @{$unresolved{$childsf}};
2025
2081
foreach my $p (@poss) {
2026
2082
my $parentsf = $p->[0];
2027
2083
$childsf = $idxsf{$childsf};
2028
my @clabels = $childsf->get_tagset_values(qw(protein_id label product));
2029
my @plabels = $parentsf->get_tagset_values(qw(transcript_id label product));
2030
printf(" PAIR: $clabels[0] => $plabels[0] (of %d)\n",
2084
my @clabels = ($childsf->get_tagset_values(qw(protein_id label product)), "?");
2085
my @plabels = ($parentsf->get_tagset_values(qw(transcript_id label product)), "?");
2087
(" PAIR: $clabels[0] => $plabels[0] (of %d)\n",
2034
2091
} # -- end of verbose
2053
2110
$unresolved_problem_reported = 1;
2055
2112
foreach my $pair (@$new_pairs) {
2056
if ($self->verbose) {
2057
printf " resolved pair @$pair\n";
2113
if ($self->verbose > 0) {
2114
printf STDERR " resolved pair @$pair\n";
2059
2116
$container{$pair->[0]} = $pair->[1];
2060
2117
delete $unresolved{$pair->[0]};
2073
2130
foreach my $sf (@sfs) {
2074
2131
my $container_sf = $container{$sf};
2075
2132
if ($container_sf) {
2077
# make $sf nested inside $container_sf
2079
# this will throw an exception if $sf is
2080
# not spatially within $container_sf
2133
# make $sf nested inside $container_sf
2135
# first check if the container spatially contains the containee
2136
if ($container_sf->contains($sf)) {
2081
2138
$container_sf->add_SeqFeature($sf);
2084
# probably a spatial containment problem
2086
"bioperl add_SeqFeature says:$@",
2141
# weird case - the container does NOT spatially
2142
# contain the containee;
2143
# we expand and throw a warning
2145
# for an example of this see ZFP91-CNTF dicistronic gene
2146
# in NCBI chrom 11 build 34.3
2148
"Container feature does not spatially contain ".
2149
"subfeature. Perhaps this is a dicistronic gene? ".
2150
"I am expanding the parent feature",
2153
$container_sf->add_SeqFeature($sf, 'EXPAND');
2092
2157
push(@top, $sf);
2132
2197
my $verbose = $self->verbose;
2133
2198
#################################print "I";
2135
printf "find_best_matches: (/%d)\n", scalar(@$pairs);
2200
printf STDERR "find_best_matches: (/%d)\n", scalar(@$pairs);
2138
2203
my %selected_children = map {($_->[0]=>1)} @$pairs;
2231
2296
my $group_tag = shift || 'gene';
2233
2298
my $f = $group->[0];
2235
2300
if ($f->has_tag($group_tag)) {
2236
2301
($label) = $f->get_tag_values($group_tag);
2238
printf(" GROUP [%s]:%s\n",
2241
map { $_->primary_tag } @$group));
2303
if( $self->verbose > 0 ) {
2304
printf STDERR (" GROUP [%s]:%s\n",
2307
map { $_->primary_tag } @$group));
2245
2312
sub _write_sf {
2246
2313
my $self = shift;
2247
2314
my $sf = shift;
2248
printf "TYPE:%s\n", $sf->primary_tag;
2315
printf STDERR "TYPE:%s\n", $sf->primary_tag;
2252
2319
sub _write_sf_detail {
2253
2320
my $self = shift;
2254
2321
my $sf = shift;
2255
printf "TYPE:%s\n", $sf->primary_tag;
2322
printf STDERR "TYPE:%s\n", $sf->primary_tag;
2256
2323
my @locs = $sf->location->each_Location;
2257
printf " %s,%s [%s]\n", $_->start, $_->end, $_->strand foreach @locs;
2324
printf STDERR " %s,%s [%s]\n", $_->start, $_->end, $_->strand foreach @locs;
2262
2329
my $self = shift;
2263
2330
my @sfs = @{shift || []};
2264
2331
my $indent = shift || 0;
2265
foreach my $sf (@sfs) {
2267
if ($sf->has_tag('product')) {
2268
($label) = $sf->get_tag_values('product');
2270
printf "%s%s $label\n", ' ' x $indent, $sf->primary_tag;
2271
my @sub_sfs = $sf->sub_SeqFeature;
2272
$self->_write_hier(\@sub_sfs, $indent+1);
2332
if( $self->verbose > 0 ) {
2333
foreach my $sf (@sfs) {
2335
if ($sf->has_tag('product')) {
2336
($label) = $sf->get_tag_values('product');
2338
printf STDERR "%s%s $label\n", ' ' x $indent, $sf->primary_tag;
2339
my @sub_sfs = $sf->sub_SeqFeature;
2340
$self->_write_hier(\@sub_sfs, $indent+1);
2390
2459
$sf->isa("Bio::SeqFeatureI") || $self->throw("$sf NOT A SeqFeatureI");
2391
2460
$sf->isa("Bio::FeatureHolderI") || $self->throw("$sf NOT A FeatureHolderI");
2393
# so far, we only infer exons from mRNA
2394
2462
my $type = $sf->primary_tag;
2395
2463
next unless $type eq 'mRNA' or $type =~ /RNA/;
2397
2465
# an mRNA from genbank will have a discontinuous location,
2398
2466
# with each sub-location being equivalent to an exon
2399
2467
my @locs = $sf->location;
2400
2469
if ($sf->location->isa("Bio::Location::SplitLocationI")) {
2401
2470
@locs = $sf->location->each_Location;
2476
$self->throw("ASSERTION ERROR: sf has no location objects");
2404
2479
# make exons from locations
2407
2482
my $subsf = Bio::SeqFeature::Generic->new(-location=>$_,
2408
2483
-primary_tag=>'exon');
2484
## Provide seq_id to new feature:
2485
$subsf->seq_id($sf->seq_id) if $sf->seq_id;
2486
$subsf->source_tag($sf->source_tag) if $sf->source_tag;
2487
## Transfer /locus_tag and /gene tag values to inferred
2488
## features. TODO: Perhaps? this should not be done
2489
## indiscriminantly but rather by virtue of the setting
2491
foreach my $tag (grep /gene|locus_tag/, $sf->get_all_tags) {
2492
my @vals = $sf->get_tag_values($tag);
2493
$subsf->add_tag_value($tag, @vals);
2409
2496
my $locstr = 'exon::'.$self->_locstr($subsf);
2411
2498
# re-use feature if type and location the same
2421
2508
# PARANOID CHECK
2423
$self->_check_order_is_consistent(@subsfs);
2425
print "Unordered features:\n";
2426
$self->_write_sf_detail($_) foreach @subsfs;
2427
$self->throw("ASSERTION ERROR: inconsistent order");
2509
$self->_check_order_is_consistent($sf->location->strand,@subsfs);
2431
2512
$sf->location(Bio::Location::Simple->new());
2514
# we allow the exons to define the boundaries of the transcript
2432
2515
$sf->add_SeqFeature($_, 'EXPAND') foreach @subsfs;
2526
2612
my @mrnas = ();
2527
2613
foreach my $cds (@cdsl) {
2531
$self->_check_order_is_consistent($cds->location->each_Location);
2533
$self->_write_sf_detail($cds);
2534
$self->throw("inconsistent order");
2615
if ($self->verbose > 0) {
2616
print " Inferring mRNA from CDS $cds\n";
2618
$self->_check_order_is_consistent($cds->location->strand,$cds->location->each_Location);
2537
2620
my $loc = Bio::Location::Split->new;
2538
2621
foreach my $cdsexonloc ($cds->location->each_Location) {
2547
2630
Bio::SeqFeature::Generic->new(-location=>$loc,
2548
2631
-primary_tag=>'mRNA');
2551
$self->_check_order_is_consistent($mrna->location->each_Location);
2553
$self->throw("inconsistent order");
2633
## Provide seq_id to new feature:
2634
$mrna->seq_id($cds->seq_id) if $cds->seq_id;
2635
$mrna->source_tag($cds->source_tag) if $cds->source_tag;
2637
$self->_check_order_is_consistent($mrna->location->strand,$mrna->location->each_Location);
2639
# make the mRNA hold the CDS; no EXPAND option,
2640
# the CDS cannot be wider than the mRNA
2555
2641
$mrna->add_SeqFeature($cds);
2556
2643
# mRNA steals children of CDS
2557
2644
foreach my $subsf ($cds->get_SeqFeatures) {
2558
2645
$mrna->add_SeqFeature($subsf);
2699
# _check_order_is_consistent($strand,$ranges) RETURNS BOOL
2701
# note: the value of this test is moot - there are many valid,
2702
# if unusual cases where it would flag an anomaly. for example
2703
# transpliced genes such as mod(mdg4) in dmel on AE003744, and
2704
# the following spliced gene on NC_001284:
2706
# mRNA complement(join(20571..20717,21692..22086,190740..190761,
2707
# 140724..141939,142769..142998))
2709
# /note="trans-splicing, RNA editing"
2710
# /db_xref="GeneID:814567"
2712
# note how the exons are not in order
2713
# this will flag a level-3 warning, the user of this module
2714
# can ignore this and deal appropriately with the resulting
2612
2716
sub _check_order_is_consistent {
2613
2717
my $self = shift;
2719
my $parent_strand = shift; # this does nothing..?
2614
2720
my @ranges = @_;
2615
2721
return unless @ranges;
2723
join(" ",map{sprintf("[%s,%s]",$_->start,$_->end)} @ranges);
2616
2724
my $strand = $ranges[0]->strand;
2617
2725
for (my $i=1; $i<@ranges;$i++) {
2618
2726
if ($ranges[$i]->strand != $strand) {
2619
return 1; # mixed ranges - autopass
2727
$self->problem(1,"inconsistent strands. Trans-spliced gene? Range: $rangestr");
2729
# mixed ranges - autopass
2730
# some mRNAs have exons on both strands; for
2731
# example, the dmel mod(mdg4) gene which is
2732
# trans-spliced (in actual fact two mRNAs)
2622
2736
for (my $i=1; $i<@ranges;$i++) {
2623
2737
my $rangeP = $ranges[$i-1];
2624
2738
my $range = $ranges[$i];
2625
# if ($strand < 0) {
2626
# if ($rangeP->end < $range->start) {
2631
2739
if ($rangeP->start > $range->end) {
2740
# failed - but still get one more chance..
2742
$self->problem(2,"Ranges not in correct order. Strange ensembl genbank entry? Range: $rangestr");
2748
# sometimes (eg ensembl flavour genbank files)
2749
# exons on reverse strand listed in reverse order
2750
# eg join(complement(R1),...,complement(Rn))
2752
for (my $i=1; $i<@ranges;$i++) {
2753
my $rangeP = $ranges[$i-1];
2754
my $range = $ranges[$i];
2755
if ($rangeP->end < $range->start) {
2756
$self->problem(3,"inconsistent order. Range: $rangestr");
2636
2761
return 1; # pass