~ubuntu-branches/ubuntu/trusty/bioperl/trusty

Viewing changes to scripts/Bio-DB-GFF/bp_genbank2gff3.pl

Committer: Package Import Robot
Author(s): Charles Plessy
Date: 2013-09-22 13:39:48 UTC
mfrom: (3.1.11 sid)
Revision ID: package-import@ubuntu.com-20130922133948-c6z62zegjyp7ztou

Tags: 1.6.922-1

http://bugs.debian.org/722910

* New upstream release.
* Replaces and Breaks grinder (<< 0.5.3-3~) because of overlapping contents.
Closes: #722910
* Stop Replacing and Breaking bioperl ( << 1.6.9 ): not needed anymore.

files added:
.travis.yml

Bio/DB/IndexedBase.pm

Bio/DB/Taxonomy/greengenes.pm

Bio/DB/Taxonomy/silva.pm

Bio/Seq/SimulatedRead.pm

Bio/SeqFeature/Amplicon.pm

Bio/SeqFeature/SubSeq.pm

Bio/Tools/AmpliconSearch.pm

README.md

examples/align/FastAlign.pl

scripts/Bio-DB-GFF/bp_bulk_load_gff.pl

scripts/Bio-DB-GFF/bp_fast_load_gff.pl

scripts/Bio-DB-GFF/bp_genbank2gff.pl

scripts/Bio-DB-GFF/bp_genbank2gff3.pl

scripts/Bio-DB-GFF/bp_generate_histogram.pl

scripts/Bio-DB-GFF/bp_load_gff.pl

scripts/Bio-DB-GFF/bp_meta_gff.pl

scripts/Bio-DB-GFF/bp_process_gadfly.pl

scripts/Bio-DB-GFF/bp_process_sgd.pl

scripts/Bio-DB-GFF/bp_process_wormbase.pl

scripts/Bio-DB-SeqFeature-Store/bp_seqfeature_delete.pl

scripts/Bio-DB-SeqFeature-Store/bp_seqfeature_gff3.pl

scripts/Bio-DB-SeqFeature-Store/bp_seqfeature_load.pl

scripts/DB-HIV/bp_hivq.pl

scripts/DB/bp_biofetch_genbank_proxy.pl

scripts/DB/bp_bioflat_index.pl

scripts/DB/bp_biogetseq.pl

scripts/DB/bp_flanks.pl

scripts/das/bp_das_server.pl

scripts/index/bp_fetch.pl

scripts/index/bp_index.pl

scripts/index/bp_seqret.pl

scripts/popgen/bp_composite_LD.pl

scripts/popgen/bp_heterogeneity_test.pl

scripts/searchio/bp_fastam9_to_table.pl

scripts/searchio/bp_filter_search.pl

scripts/searchio/bp_hmmer_to_table.pl

scripts/searchio/bp_parse_hmmsearch.pl

scripts/searchio/bp_search2table.pl

scripts/seq/bp_extract_feature_seq.pl

scripts/seq/bp_make_mrna_protein.pl

scripts/seq/bp_seqconvert.pl

scripts/seq/bp_seqcut.pl

scripts/seq/bp_seqpart.pl

scripts/seq/bp_seqretsplit.pl

scripts/seq/bp_split_seq.pl

scripts/seq/bp_translate_seq.pl

scripts/seq/bp_unflatten_seq.pl

scripts/seqstats/bp_aacomp.pl

scripts/seqstats/bp_chaos_plot.pl

scripts/seqstats/bp_gccalc.pl

scripts/seqstats/bp_oligo_count.pl

scripts/taxa/bp_classify_hits_kingdom.pl

scripts/taxa/bp_local_taxonomydb_query.pl

scripts/taxa/bp_query_entrez_taxa.pl

scripts/taxa/bp_taxid4species.pl

scripts/taxa/bp_taxonomy2tree.pl

scripts/tree/bp_blast2tree.pl

scripts/tree/bp_nexus2nh.pl

scripts/tree/bp_tree2pag.pl

scripts/utilities/bp_dbsplit.pl

scripts/utilities/bp_download_query_genbank.pl

scripts/utilities/bp_mask_by_search.pl

scripts/utilities/bp_mrtrans.pl

scripts/utilities/bp_mutate.pl

scripts/utilities/bp_netinstall.pl

scripts/utilities/bp_nrdb.pl

scripts/utilities/bp_pairwise_kaks.pl

scripts/utilities/bp_remote_blast.pl

scripts/utilities/bp_revtrans-motif.pl

scripts/utilities/bp_search2BSML.pl

scripts/utilities/bp_search2alnblocks.pl

scripts/utilities/bp_search2gff.pl

scripts/utilities/bp_search2tribe.pl

scripts/utilities/bp_seq_length.pl

scripts/utilities/bp_sreformat.pl

t/Cluster

t/Cluster/UniGene.t

t/LocalDB/Fasta.t

t/LocalDB/Qual.t

t/LocalDB/Taxonomy

t/LocalDB/Taxonomy/greengenes.t

t/LocalDB/Taxonomy/silva.t

t/Seq/SimulatedRead.t

t/SeqFeature/Amplicon.t

t/SeqFeature/Collection.t

t/SeqFeature/Computation.t

t/SeqFeature/FeaturePair.t

t/SeqFeature/Gene.t

t/SeqFeature/Generic.t

t/SeqFeature/SubSeq.t

t/Tools/AmpliconSearch.t

t/data/NC_000007-ribosomal-slippage.gb

t/data/NC_002058_multDBLINK_bug3375.gb

t/data/bad_dbfa/shotdb.fa

t/data/badfasta.fa

t/data/bug3331.mlc

t/data/codeml45.mlc

t/data/codeml45b.mlc

t/data/dbfa/mixed_alphabet.fasta

t/data/exonerate.output.negativescore.works

t/data/fastq/zero_qual.fastq

t/data/forward_primer.fa

t/data/forward_reverse_primers.fa

t/data/hmmpfam_HSPdashline.txt

t/data/hmmpfam_multiresult.out

t/data/msout/bad_msout_infile1

t/data/msout/bad_msout_infile2

t/data/multiresult_blastn+.bls

t/data/nhmmer-3.1.out

t/data/pfamOutput-bug3376.out

t/data/rpsblast_no_hits.bls

t/data/singlescore.gbk

t/data/spaced_fasta.fa

t/data/taxonomy

t/data/taxonomy/greengenes_taxonomy_16S_candiv_gg_2011_1.txt

t/data/taxonomy/silva_SSURef_108_tax_silva_trunc.fasta

t/data/test-3.0-1.meme

t/data/test-3.0-2.meme

t/data/test-4.9.meme

t/data/tiny.stk

t/data/yn00_45.mlc

travis_scripts

travis_scripts/dependency_installs

files removed:
.pc/fix_berkeleydb3_temp_file_creation

.pc/fix_berkeleydb3_temp_file_creation/Bio

.pc/fix_berkeleydb3_temp_file_creation/Bio/DB

.pc/fix_berkeleydb3_temp_file_creation/Bio/DB/SeqFeature

.pc/fix_berkeleydb3_temp_file_creation/Bio/DB/SeqFeature/Store

.pc/fix_berkeleydb3_temp_file_creation/Bio/DB/SeqFeature/Store/berkeleydb3.pm

Bio/Biblio

Bio/Biblio.pm

Bio/Biblio/Article.pm

Bio/Biblio/BiblioBase.pm

Bio/Biblio/Book.pm

Bio/Biblio/BookArticle.pm

Bio/Biblio/IO

Bio/Biblio/IO.pm

Bio/Biblio/IO/medline2ref.pm

Bio/Biblio/IO/medlinexml.pm

Bio/Biblio/IO/pubmed2ref.pm

Bio/Biblio/IO/pubmedxml.pm

Bio/Biblio/Journal.pm

Bio/Biblio/JournalArticle.pm

Bio/Biblio/MedlineArticle.pm

Bio/Biblio/MedlineBook.pm

Bio/Biblio/MedlineBookArticle.pm

Bio/Biblio/MedlineJournal.pm

Bio/Biblio/MedlineJournalArticle.pm

Bio/Biblio/Organisation.pm

Bio/Biblio/Patent.pm

Bio/Biblio/Person.pm

Bio/Biblio/Proceeding.pm

Bio/Biblio/Provider.pm

Bio/Biblio/PubmedArticle.pm

Bio/Biblio/PubmedBookArticle.pm

Bio/Biblio/PubmedJournalArticle.pm

Bio/Biblio/Ref.pm

Bio/Biblio/Service.pm

Bio/Biblio/TechReport.pm

Bio/Biblio/Thesis.pm

Bio/Biblio/WebResource.pm

Bio/DB/Biblio

Bio/DB/Biblio/biofetch.pm

Bio/DB/Biblio/eutils.pm

Bio/DB/Biblio/soap.pm

Bio/DB/BiblioI.pm

Bio/DB/EUtilities.pm

Bio/FeatureIO

Bio/FeatureIO.pm

Bio/FeatureIO/bed.pm

Bio/FeatureIO/gff.pm

Bio/FeatureIO/gtf.pm

Bio/FeatureIO/interpro.pm

Bio/FeatureIO/ptt.pm

Bio/FeatureIO/vecscreen_simple.pm

Bio/Ontology/SimpleGOEngine/GraphAdaptor02.pm

Bio/Root/Test

Bio/Root/Test/Warn.pm

Bio/Search/HSP/hmmer3HSP.pm

Bio/SeqFeature/Annotated.pm

Bio/Tools/EUtilities

Bio/Tools/EUtilities.pm

Bio/Tools/EUtilities/EUtilDataI.pm

Bio/Tools/EUtilities/EUtilParameters.pm

Bio/Tools/EUtilities/History.pm

Bio/Tools/EUtilities/HistoryI.pm

Bio/Tools/EUtilities/Info

Bio/Tools/EUtilities/Info.pm

Bio/Tools/EUtilities/Info/FieldInfo.pm

Bio/Tools/EUtilities/Info/LinkInfo.pm

Bio/Tools/EUtilities/Link

Bio/Tools/EUtilities/Link.pm

Bio/Tools/EUtilities/Link/LinkSet.pm

Bio/Tools/EUtilities/Link/UrlLink.pm

Bio/Tools/EUtilities/Query

Bio/Tools/EUtilities/Query.pm

Bio/Tools/EUtilities/Query/GlobalQuery.pm

Bio/Tools/EUtilities/Summary

Bio/Tools/EUtilities/Summary.pm

Bio/Tools/EUtilities/Summary/DocSum.pm

Bio/Tools/EUtilities/Summary/Item.pm

Bio/Tools/EUtilities/Summary/ItemContainerI.pm

debian/patches/fix_berkeleydb3_temp_file_creation

examples/biblio

examples/biblio/biblio-eutils-example.pl

examples/biblio/biblio-soap-example.pl

examples/biblio/biblio_soap.pl

scripts/Bio-DB-EUtilities

scripts/Bio-DB-EUtilities/einfo.PLS

scripts/Bio-DB-GFF/bulk_load_gff.PLS

scripts/Bio-DB-GFF/fast_load_gff.PLS

scripts/Bio-DB-GFF/genbank2gff.PLS

scripts/Bio-DB-GFF/genbank2gff3.PLS

scripts/Bio-DB-GFF/generate_histogram.PLS

scripts/Bio-DB-GFF/load_gff.PLS

scripts/Bio-DB-GFF/meta_gff.PLS

scripts/Bio-DB-GFF/process_gadfly.PLS

scripts/Bio-DB-GFF/process_sgd.PLS

scripts/Bio-DB-GFF/process_wormbase.PLS

scripts/Bio-DB-SeqFeature-Store/bp_seqfeature_delete.PLS

scripts/Bio-DB-SeqFeature-Store/bp_seqfeature_gff3.PLS

scripts/Bio-DB-SeqFeature-Store/bp_seqfeature_load.PLS

scripts/DB-HIV/hivq.PLS

scripts/DB/biofetch_genbank_proxy.PLS

scripts/DB/bioflat_index.PLS

scripts/DB/biogetseq.PLS

scripts/DB/flanks.PLS

scripts/biblio

scripts/biblio/TAG

scripts/biblio/biblio.PLS

scripts/das/das_server.pl

scripts/index/bp_fetch.PLS

scripts/index/bp_index.PLS

scripts/index/bp_seqret.PLS

scripts/popgen/composite_LD.PLS

scripts/popgen/heterogeneity_test.PLS

scripts/searchio/fastam9_to_table.PLS

scripts/searchio/filter_search.PLS

scripts/searchio/hmmer_to_table.PLS

scripts/searchio/parse_hmmsearch.PLS

scripts/searchio/search2table.PLS

scripts/seq/extract_feature_seq.PLS

scripts/seq/make_mrna_protein.PLS

scripts/seq/seqconvert.PLS

scripts/seq/seqretsplit.PLS

scripts/seq/split_seq.PLS

scripts/seq/translate_seq.PLS

scripts/seq/unflatten_seq.PLS

scripts/seqstats/aacomp.PLS

scripts/seqstats/chaos_plot.PLS

scripts/seqstats/gccalc.PLS

scripts/seqstats/oligo_count.PLS

scripts/taxa/classify_hits_kingdom.PLS

scripts/taxa/local_taxonomydb_query.PLS

scripts/taxa/query_entrez_taxa.PLS

scripts/taxa/taxid4species.PLS

scripts/taxa/taxonomy2tree.PLS

scripts/tree/blast2tree.PLS

scripts/tree/nexus2nh.PLS

scripts/tree/tree2pag.PLS

scripts/utilities/bp_mrtrans.PLS

scripts/utilities/bp_netinstall.PLS

scripts/utilities/bp_nrdb.PLS

scripts/utilities/bp_sreformat.PLS

scripts/utilities/dbsplit.PLS

scripts/utilities/download_query_genbank.PLS

scripts/utilities/mask_by_search.PLS

scripts/utilities/mutate.PLS

scripts/utilities/pairwise_kaks.PLS

scripts/utilities/remote_blast.PLS

scripts/utilities/revtrans-motif.PLS

scripts/utilities/search2BSML.PLS

scripts/utilities/search2alnblocks.PLS

scripts/utilities/search2gff.PLS

scripts/utilities/search2tribe.PLS

scripts/utilities/seq_length.PLS

t/Biblio

t/Biblio/Biblio.t

t/Biblio/References.t

t/Biblio/biofetch.t

t/Biblio/eutils.t

t/LocalDB/DBFasta.t

t/LocalDB/DBQual.t

t/RemoteDB/EUtilities.t

t/SeqFeature/FeatureIO.t

t/SeqFeature/SeqFeatAnnotated.t

t/SeqFeature/SeqFeatCollection.t

t/SeqFeature/SeqFeature.t

t/SeqFeature/SeqFeaturePrimer.t

t/Tools/EUtilities

t/Tools/EUtilities/EUtilParameters.t

t/Tools/EUtilities/egquery.t

t/Tools/EUtilities/einfo.t

t/Tools/EUtilities/elink_acheck.t

t/Tools/EUtilities/elink_lcheck.t

t/Tools/EUtilities/elink_llinks.t

t/Tools/EUtilities/elink_ncheck.t

t/Tools/EUtilities/elink_neighbor.t

t/Tools/EUtilities/elink_neighbor_history.t

t/Tools/EUtilities/elink_scores.t

t/Tools/EUtilities/epost.t

t/Tools/EUtilities/esearch.t

t/Tools/EUtilities/espell.t

t/Tools/EUtilities/esummary.t

t/data/1.bed

t/data/directives.gff3

t/data/eutils

t/data/eutils/egquery.xml

t/data/eutils/einfo.xml

t/data/eutils/einfo_dbs.xml

t/data/eutils/elink_acheck.xml

t/data/eutils/elink_acheck_corr.xml

t/data/eutils/elink_dball.xml

t/data/eutils/elink_lcheck.xml

t/data/eutils/elink_lcheck_corr.xml

t/data/eutils/elink_llinks.xml

t/data/eutils/elink_llinks_corr.xml

t/data/eutils/elink_multidb.xml

t/data/eutils/elink_multidb_corr.xml

t/data/eutils/elink_ncheck.xml

t/data/eutils/elink_ncheck_corr.xml

t/data/eutils/elink_neighbor.xml

t/data/eutils/elink_neighbor_corr.xml

t/data/eutils/elink_nhist.xml

t/data/eutils/elink_nhist_corr.xml

t/data/eutils/elink_scores.xml

t/data/eutils/epost.xml

t/data/eutils/esearch1.xml

t/data/eutils/esearch2.xml

t/data/eutils/espell.xml

t/data/eutils/esummary1.xml

t/data/eutils/esummary2.xml

t/data/hybrid1.gff3

t/data/knownGene.gff3

t/data/stress_test_medline.xml

t/data/stress_test_pubmed.xml

t/data/test.meme

t/data/test.meme2

t/data/test.ptt

t/data/vecscreen_simple.test_output

t/lib/Array

t/lib/Array/Compare.pm

t/lib/Sub

t/lib/Sub/Uplevel.pm

t/lib/Test

t/lib/Test/Builder

t/lib/Test/Builder.pm

t/lib/Test/Builder/Module.pm

t/lib/Test/Builder/Tester

t/lib/Test/Builder/Tester.pm

t/lib/Test/Builder/Tester/Color.pm

t/lib/Test/Exception.pm

t/lib/Test/Harness

t/lib/Test/Harness.pm

t/lib/Test/Harness/Assert.pm

t/lib/Test/Harness/Iterator.pm

t/lib/Test/Harness/Point.pm

t/lib/Test/Harness/Results.pm

t/lib/Test/Harness/Straps.pm

t/lib/Test/Harness/TAP.pod

t/lib/Test/Harness/Util.pm

t/lib/Test/More.pm

t/lib/Test/Simple.pm

t/lib/Test/Tutorial.pod

t/lib/Test/Warn.pm

t/lib/Tree

t/lib/Tree/DAG_Node.pm

files modified:
.pc/applied-patches

.pc/fix_examples_perl_location/examples/db/getGenBank.pl

Bio/Align/Utilities.pm

Bio/AlignIO.pm

Bio/AlignIO/fasta.pm

Bio/AlignIO/largemultifasta.pm

Bio/AlignIO/meme.pm

Bio/AlignIO/phylip.pm

Bio/AlignIO/stockholm.pm

Bio/Assembly/IO.pm

Bio/Assembly/IO/ace.pm

Bio/Assembly/IO/bowtie.pm

Bio/Assembly/IO/sam.pm

Bio/Assembly/Tools/ContigSpectrum.pm

Bio/Cluster/SequenceFamily.pm

Bio/ClusterIO.pm

Bio/DB/Ace.pm

Bio/DB/CUTG.pm

Bio/DB/Fasta.pm

Bio/DB/Flat/BinarySearch.pm

Bio/DB/GenBank.pm

Bio/DB/HIV/HIVQueryHelper.pm

Bio/DB/NCBIHelper.pm

Bio/DB/Qual.pm

Bio/DB/Query/HIVQuery.pm

Bio/DB/Query/WebQuery.pm

Bio/DB/RandomAccessI.pm

Bio/DB/SeqFeature/Store.pm

Bio/DB/SeqFeature/Store/DBI/Pg.pm

Bio/DB/SeqFeature/Store/DBI/SQLite.pm

Bio/DB/SeqFeature/Store/DBI/mysql.pm

Bio/DB/SeqFeature/Store/LoadHelper.pm

Bio/DB/SeqFeature/Store/berkeleydb.pm

Bio/DB/SeqFeature/Store/berkeleydb3.pm

Bio/DB/SeqFeature/Store/memory.pm

Bio/DB/SeqI.pm

Bio/DB/SeqVersion/gi.pm

Bio/DB/SwissProt.pm

Bio/DB/TFBS/transfac_pro.pm

Bio/DB/Taxonomy.pm

Bio/DB/Taxonomy/entrez.pm

Bio/DB/Taxonomy/flatfile.pm

Bio/DB/Taxonomy/list.pm *

Bio/DB/WebDBSeqI.pm

Bio/FeatureHolderI.pm

Bio/Index/Blast.pm

Bio/Index/Hmmer.pm

Bio/Index/Qual.pm

Bio/Index/Stockholm.pm

Bio/LiveSeq/SeqI.pm

Bio/LocatableSeq.pm

Bio/Location/Simple.pm

Bio/Map/Mappable.pm

Bio/MapIO.pm

Bio/Matrix/IO.pm

Bio/Nexml/Factory.pm

Bio/Ontology/SimpleGOEngine/GraphAdaptor.pm

Bio/OntologyIO.pm

Bio/Perl.pm

Bio/PhyloNetwork.pm

Bio/PopGen/Genotype.pm

Bio/PopGen/IO.pm

Bio/PopGen/IO/hapmap.pm

Bio/PopGen/IO/phase.pm

Bio/PopGen/Individual.pm

Bio/PopGen/Marker.pm

Bio/PopGen/Population.pm

Bio/PopGen/Statistics.pm

Bio/PopGen/Utilities.pm

Bio/PrimarySeq.pm

Bio/PrimarySeqI.pm

Bio/Restriction/IO.pm

Bio/Root/Build.pm

Bio/Root/IO.pm

Bio/Root/Root.pm

Bio/Root/Test.pm

Bio/Root/Utilities.pm

Bio/Root/Version.pm

Bio/Search/HSP/GenericHSP.pm

Bio/Search/HSP/HMMERHSP.pm

Bio/Search/HSP/HSPI.pm

Bio/Search/Hit/ModelHit.pm

Bio/SearchIO.pm

Bio/SearchIO/blast.pm

Bio/SearchIO/blasttable.pm

Bio/SearchIO/blastxml.pm

Bio/SearchIO/cross_match.pm

Bio/SearchIO/exonerate.pm

Bio/SearchIO/hmmer.pm

Bio/SearchIO/hmmer2.pm

Bio/SearchIO/hmmer3.pm

Bio/SearchIO/psl.pm

Bio/Seq.pm

Bio/Seq/EncodedSeq.pm

Bio/Seq/Meta.pm

Bio/Seq/Meta/Array.pm

Bio/Seq/PrimaryQual.pm

Bio/Seq/PrimedSeq.pm

Bio/Seq/SeqFactory.pm

Bio/Seq/SeqFastaSpeedFactory.pm

Bio/SeqFeature/AnnotationAdaptor.pm

Bio/SeqFeature/Computation.pm

Bio/SeqFeature/FeaturePair.pm

Bio/SeqFeature/Generic.pm

Bio/SeqFeature/Lite.pm

Bio/SeqFeature/PositionProxy.pm

Bio/SeqFeature/Primer.pm

Bio/SeqFeature/SiRNA/Pair.pm

Bio/SeqFeature/Similarity.pm

Bio/SeqFeature/Tools/Unflattener.pm

Bio/SeqI.pm

Bio/SeqIO.pm

Bio/SeqIO/FTHelper.pm

Bio/SeqIO/agave.pm

Bio/SeqIO/bsml.pm

Bio/SeqIO/chadoxml.pm

Bio/SeqIO/embl.pm

Bio/SeqIO/entrezgene.pm

Bio/SeqIO/fasta.pm

Bio/SeqIO/fastq.pm

Bio/SeqIO/game/gameWriter.pm

Bio/SeqIO/gbxml.pm

Bio/SeqIO/genbank.pm

Bio/SeqIO/interpro.pm

Bio/SeqIO/msout.pm

Bio/SeqIO/phd.pm

Bio/SeqIO/qual.pm

Bio/SeqIO/raw.pm

Bio/SeqIO/scf.pm

Bio/SeqIO/strider.pm

Bio/SeqIO/tab.pm

Bio/SeqUtils.pm

Bio/SimpleAlign.pm

Bio/Structure/IO.pm

Bio/Taxon.pm

Bio/Tools/Alignment/Consed.pm

Bio/Tools/Analysis/DNA/ESEfinder.pm

Bio/Tools/Analysis/Protein/ELM.pm

Bio/Tools/Analysis/Protein/NetPhos.pm

Bio/Tools/GFF.pm

Bio/Tools/Gel.pm

Bio/Tools/Geneid.pm

Bio/Tools/GuessSeqFormat.pm

Bio/Tools/IUPAC.pm

Bio/Tools/Match.pm

Bio/Tools/Phylo/Gerp.pm

Bio/Tools/Phylo/PAML.pm

Bio/Tools/Phylo/PAML/Codeml.pm

Bio/Tools/Primer3.pm

Bio/Tools/Protparam.pm

Bio/Tools/Run/StandAloneBlast.pm

Bio/Tools/Run/WrapperBase.pm

Bio/Tools/SeqPattern.pm

Bio/Tree/Node.pm

Bio/Tree/Statistics.pm

Bio/Tree/Tree.pm

Bio/Tree/TreeFunctionsI.pm

Bio/TreeIO.pm

Bio/TreeIO/newick.pm

Bio/TreeIO/svggraph.pm

Bio/Variation/IO.pm

Bio/Variation/IO/xml.pm

Build.PL *

Changes

INSTALL

MANIFEST

META.json

META.yml

README

debian/changelog

debian/compat

debian/control

debian/copyright

debian/patches/series

examples/db/getGenBank.pl

t/Align/AlignUtil.t

t/Align/SimpleAlign.t

t/AlignIO/AlignIO.t

t/AlignIO/fasta.t

t/AlignIO/meme.t

t/AlignIO/phylip.t

t/AlignIO/stockholm.t

t/Annotation/Annotation.t

t/Annotation/AnnotationAdaptor.t

t/Assembly/ContigSpectrum.t

t/Assembly/IO/bowtie.t

t/Assembly/IO/sam.t

t/Assembly/core.t

t/ClusterIO/SequenceFamily.t

t/LocalDB/Flat.t

t/LocalDB/Index/Index.t

t/LocalDB/SeqFeature.t

t/Map/Map.t

t/Map/Physical.t

t/Ontology/IO/go.t

t/Ontology/IO/obo.t

t/PopGen/MK.t

t/PopGen/PopGen.t

t/RemoteDB/BioFetch.t

t/RemoteDB/CUTG.t

t/RemoteDB/EMBL.t

t/RemoteDB/GenBank.t

t/RemoteDB/GenPept.t

t/RemoteDB/HIV/HIVQuery.t

t/RemoteDB/SeqVersion.t

t/RemoteDB/SwissProt.t

t/RemoteDB/Taxonomy.t

t/Root/RootIO.t

t/SearchIO/Tiling.t *

t/SearchIO/blast.t

t/SearchIO/blasttable.t

t/SearchIO/exonerate.t

t/SearchIO/hmmer.t

t/Seq/EncodedSeq.t

t/Seq/LocatableSeq.t

t/Seq/MetaSeq.t

t/Seq/PrimaryQual.t

t/Seq/PrimarySeq.t

t/Seq/PrimedSeq.t

t/Seq/Quality.t

t/Seq/Seq.t

t/SeqFeature/Primer.t

t/SeqFeature/Unflattener.t

t/SeqIO/SeqBuilder.t

t/SeqIO/SeqIO.t

t/SeqIO/Splicedseq.t

t/SeqIO/embl.t

t/SeqIO/fasta.t

t/SeqIO/fastq.t

t/SeqIO/genbank.t

t/SeqIO/msout.t

t/SeqTools/CodonTable.t

t/SeqTools/SeqUtils.t

t/Tools/Analysis/Protein/Domcut.t

t/Tools/Analysis/Protein/HNN.t

t/Tools/GFF.t

t/Tools/Genewise.t

t/Tools/Genomewise.t

t/Tools/IUPAC.t

t/Tools/Match.t

t/Tools/Phylo/Gerp.t

t/Tools/Phylo/PAML.t

t/Tools/Primer3.t

t/Tools/Run/RemoteBlast.t

t/Tools/Run/WBCommandExts.t

t/Tools/Run/WrapperBase.t

t/Tree/Tree.t

t/Tree/TreeStatistics.t

t/data/bootstrap.tre

t/data/msout/msout_infile1

t/data/msout/msout_infile2

t/data/msout/msout_infile3

t/data/msout/msout_infile4

t/data/primedseq.fa

t/data/taxdump/names.dmp

t/data/taxdump/nodes.dmp

t/data/testaln.stockholm

t/data/traits.tab

Show diffs side-by-side

added added

removed removed

scripts/Bio-DB-GFF/bp_genbank2gff3.pl

#!/usr/bin/perl

=pod

=head1 NAME

bp_genbank2gff3.pl -- Genbank-E<gt>gbrowse-friendly GFF3

=head1 SYNOPSIS

bp_genbank2gff3.pl [options] filename(s)

# process a directory containing GenBank flatfiles

perl bp_genbank2gff3.pl --dir path_to_files --zip

# process a single file, ignore explicit exons and introns

perl bp_genbank2gff3.pl --filter exon --filter intron file.gbk.gz

# process a list of files

perl bp_genbank2gff3.pl *gbk.gz

# process data from URL, with Chado GFF model (-noCDS), and pipe to database loader

curl ftp://ftp.ncbi.nih.gov/genomes/Saccharomyces_cerevisiae/CHR_X/NC_001142.gbk \

| perl bp_genbank2gff3.pl -noCDS -in stdin -out stdout \

| perl gmod_bulk_load_gff3.pl -dbname mychado -organism fromdata

Options:

--noinfer -r don't infer exon/mRNA subfeatures

--conf -i path to the curation configuration file that contains user preferences

for Genbank entries (must be YAML format)

(if --manual is passed without --conf, user will be prompted to

create the file if any manual input is saved)

--sofile -l path to the so.obo file to use for feature type mapping

(--sofile live will download the latest online revision)

--manual -m when trying to guess the proper SO term, if more than

one option matches the primary tag, the converter will

wait for user input to choose the correct one

(only works with --sofile)

--dir -d path to a list of genbank flatfiles

--outdir -o location to write GFF files (can be 'stdout' or '-' for pipe)

--zip -z compress GFF3 output files with gzip

--summary -s print a summary of the features in each contig

--filter -x genbank feature type(s) to ignore

--split -y split output to separate GFF and fasta files for

each genbank record

--nolump -n separate file for each reference sequence

(default is to lump all records together into one

output file for each input file)

--ethresh -e error threshold for unflattener

set this high (>2) to ignore all unflattener errors

--[no]CDS -c Keep CDS-exons, or convert to alternate gene-RNA-protein-exon

model. --CDS is default. Use --CDS to keep default GFF gene model,

use --noCDS to convert to g-r-p-e.

--format -f Input format (SeqIO types): GenBank, Swiss or Uniprot, EMBL work

(GenBank is default)

--GFF_VERSION 3 is default, 2 and 2.5 and other Bio::Tools::GFF versions available

--quiet don't talk about what is being processed

--typesource SO sequence type for source (e.g. chromosome; region; contig)

--help -h display this message

=head1 DESCRIPTION

This script uses Bio::SeqFeature::Tools::Unflattener and

Bio::Tools::GFF to convert GenBank flatfiles to GFF3 with gene

containment hierarchies mapped for optimal display in gbrowse.

The input files are assumed to be gzipped GenBank flatfiles for refseq

contigs. The files may contain multiple GenBank records. Either a

single file or an entire directory can be processed. By default, the

DNA sequence is embedded in the GFF but it can be saved into separate

fasta file with the --split(-y) option.

If an input file contains multiple records, the default behaviour is

to dump all GFF and sequence to a file of the same name (with .gff

appended). Using the 'nolump' option will create a separate file for

each genbank record. Using the 'split' option will create separate

GFF and Fasta files for each genbank record.

=head2 Notes

=head3 'split' and 'nolump' produce many files

In cases where the input files contain many GenBank records (for

example, the chromosome files for the mouse genome build), a very

large number of output files will be produced if the 'split' or

'nolump' options are selected. If you do have lists of files E<gt> 6000,

use the --long_list option in bp_bulk_load_gff.pl or

bp_fast_load_gff.pl to load the gff and/or fasta files.

=head3 Designed for RefSeq

This script is designed for RefSeq genomic sequence entries. It may

work for third party annotations but this has not been tested.

But see below, Uniprot/Swissprot works, EMBL and possibly EMBL/Ensembl

if you don't mind some gene model unflattener errors (dgg).

100

101

=head3 G-R-P-E Gene Model

102

103

Don Gilbert worked this over with needs to produce GFF3 suited to

104

loading to GMOD Chado databases. Most of the changes I believe are

105

suited for general use. One main chado-specific addition is the

106

--[no]cds2protein flag (spelled --[no]CDS in this script's option parser; see Options above)

107

108

My favorite GFF is to set the above as ON by default (disable with --nocds2prot)

109

For general use it probably should be OFF, enabled with --cds2prot.

110

111

This writes GFF with an alternate, but useful Gene model,

112

instead of the consensus model for GFF3

113

114

[ gene > mRNA> (exon,CDS,UTR) ]

115

116

This alternate is

117

118

gene > mRNA > polypeptide > exon

119

120

means the only feature with dna bases is the exon. The others

121

specify only location ranges on a genome. Exon of course is a child

122

of mRNA and protein/peptide.

123

124

The protein/polypeptide feature is an important one, having all the

125

annotations of the GenBank CDS feature, protein ID, translation, GO

126

terms, Dbxrefs to other proteins.

127

128

UTRs, introns, CDS-exons are all inferred from the primary exon bases

129

inside/outside appropriate higher feature ranges. Other special gene

130

model features remain the same.

131

132

Several other improvements and bugfixes, minor but useful are included

133

134

* IO pipes now work:

135

curl ftp://ncbigenomes/... | bp_genbank2gff3 --in stdin --out stdout | gff2chado ...

136

137

* GenBank main record fields are added to source feature, e.g. organism, date,

138

and the sourcetype, commonly chromosome for genomes, is used.

139

140

* Gene Model handling for ncRNA, pseudogenes are added.

141

142

* GFF header is cleaner, more informative.

143

--GFF_VERSION flag allows choice of v2 as well as default v3

144

145

* GFF ##FASTA inclusion is improved, and

146

CDS translation sequence is moved to FASTA records.

147

148

* FT -> GFF attribute mapping is improved.

149

150

* --format choice of SeqIO input formats (GenBank default).

151

Uniprot/Swissprot and EMBL work and produce useful GFF.

152

153

* SeqFeature::Tools::TypeMapper has a few FT -> SOFA additions

154

and more flexible usage.

155

156

=head1 TODO

157

158

=head2 Are these additions desired?

159

160

* filter input records by taxon (e.g. keep only organism=xxx or taxa level = classYYY)

161

* handle Entrezgene, other non-sequence SeqIO structures (really should change

162

those parsers to produce consistent annotation tags).

163

164

=head2 Related bugfixes/tests

165

166

These items from Bioperl mail were tested (sample data generating

167

errors), and found corrected:

168

169

From: Ed Green <green <at> eva.mpg.de>

170

Subject: genbank2gff3.pl on new human RefSeq

171

Date: 2006-03-13 21:22:26 GMT

172

-- unspecified errors (sample data works now).

173

174

From: Eric Just <e-just <at> northwestern.edu>

175

Subject: genbank2gff3.pl

176

Date: 2007-01-26 17:08:49 GMT

177

-- bug fixed in genbank2gff3 for multi-record handling

178

179

This error is for a /trans_splice gene that is hard to handle, and

180

unflattener/genbank2gff3 doesn't handle it

181

182

From: Chad Matsalla <chad <at> dieselwurks.com>

183

Subject: genbank2gff3.PLS and the unflatenner - Inconsistent order?

184

Date: 2005-07-15 19:51:48 GMT

185

186

=head1 AUTHOR

187

188

Sheldon McKay (mckays@cshl.edu)

189

190

191

192

=head2 AUTHOR of hacks for GFF2Chado loading

193

194

Don Gilbert (gilbertd@indiana.edu)

195

196

197

=cut

198

199

use strict;

200

use warnings;

201

202

use lib "$ENV{HOME}/bioperl-live";

203

# chad put this here to enable situations when this script is tested

204

# against bioperl compiled into blib along with other programs using blib

205

BEGIN {

206

unshift(@INC,'blib/lib');

207

};

208

use Pod::Usage;

209

use Bio::Root::RootI;

210

use Bio::SeqIO;

211

use File::Spec;

212

use Bio::SeqFeature::Tools::Unflattener;

213

use Bio::SeqFeature::Tools::TypeMapper;

214

use Bio::SeqFeature::Tools::IDHandler;

215

use Bio::Location::SplitLocationI;

216

use Bio::Location::Simple;

217

use Bio::Tools::GFF;

218

use Getopt::Long;

219

use List::Util qw(first);

220

use Bio::OntologyIO;

221

use YAML qw(Dump LoadFile DumpFile);

222

use File::Basename;

223

224

use vars qw/$split @filter $zip $outdir $help $ethresh

225

$ONTOLOGY %FEATURES %DESCENDANTS @RETURN $MANUAL @GFF_LINE_FEAT

226

$CONF $YAML $TYPE_MAP $SYN_MAP $noinfer $SO_FILE

227

$file @files $dir $summary $nolump

228

$source_type %proteinfa %exonpar $didheader $verbose $DEBUG $GFF_VERSION

229

$gene_id $rna_id $tnum $ncrna_id $rnum %method %id %seen/;

230

231

# Where "--sofile live" fetches the Sequence Ontology OBO file from.
use constant SO_URL =>
    'http://song.cvs.sourceforge.net/viewvc/*checkout*/song/ontology/so.obo';

# The 26 lower-case ASCII letters, in order, as an array reference.
use constant ALPHABET => [ 'a' .. 'z' ];

# Hash reference giving each lower-case letter its zero-based position
# in ALPHABET (a => 0 ... z => 25).
use constant ALPHABET_TO_NUMBER => do {
    my %position_of;
    @position_of{ 'a' .. 'z' } = ( 0 .. 25 );
    \%position_of;
};

use constant {
    # Number of letters in ALPHABET.
    ALPHABET_DIVISOR => 26,

    # GM_* status codes; NOTE(review): their consumers are outside this
    # section -- presumably they classify a feature against the gene model.
    GM_NEW_TOPLEVEL => 2,
    GM_NEW_PART     => 1,
    GM_DUP_PART     => 0,
    GM_NOT_PART     => -1,
};

# Options cycle in multiples of 2 because of left side/right side pairing.
# You can make this number odd, but displayed matches will still round up
use constant OPTION_CYCLE => 6;

249

250

# ---------------------------------------------------------------------
# Defaults, then command-line parsing.
# ---------------------------------------------------------------------
$GFF_VERSION = 3;    # allow v2 ...
$verbose     = 1;    # chatty by default; pass --quiet to turn messages off

# dgg: change the gene model to Gene/mRNA/Polypeptide/exons...
my $CDSkeep      = 1;               # default should be ON (prior behavior), see gene_features()
my $PROTEIN_TYPE = 'polypeptide';   # for noCDSkeep;
# protein = flybase chado usage; GMOD Perls use 'polypeptide' with software support

my $FORMAT   = "GenBank";   # swiss ; embl; genbank ; ** guess from SOURCEID **
my $SOURCEID = $FORMAT;     # "UniProt" "GenBank" "EMBL" should work
# other Bio::SeqIO formats may work. TEST: EntrezGene < problematic tags; InterPro KEGG

# Tag-name translation table.
# NOTE(review): consumers are outside this section -- presumably maps
# GenBank qualifier names to GFF3 attribute names; confirm at the use site.
my %TAG_MAP = (
    db_xref => 'Dbxref',
    name    => 'Name',
    note    => 'Note',     # also pull GO: ids into Ontology_term
    synonym => 'Alias',
    symbol  => 'Alias',    # is symbol still used?
    # protein_id => 'Dbxref', also seen Dbxref tags: EC_number
    # translation: handled in gene_features
);

$| = 1;    # autoflush the currently selected output handle

my $quiet = !$verbose;
my $ok = GetOptions(
    'd|dir|input:s'     => \$dir,
    'z|zip'             => \$zip,
    'h|help'            => \$help,
    's|summary'         => \$summary,
    'r|noinfer'         => \$noinfer,
    'i|conf=s'          => \$CONF,
    # "-l" is the short form the POD documents for --sofile; it was
    # missing from the spec, so "-l file" was rejected as unknown.
    'l|sofile=s'        => \$SO_FILE,
    'm|manual'          => \$MANUAL,
    'o|outdir|output:s' => \$outdir,
    'x|filter:s'        => \@filter,
    'y|split'           => \$split,
    "ethresh|e=s"       => \$ethresh,
    'c|CDS!'            => \$CDSkeep,
    'f|format=s'        => \$FORMAT,
    'typesource=s'      => \$source_type,
    'GFF_VERSION=s'     => \$GFF_VERSION,
    'quiet!'            => \$quiet,    # swap quiet to verbose
    'DEBUG!'            => \$DEBUG,
    'n|nolump'          => \$nolump,
);

# Lump all records of an input file into one output file unless the user
# asked for per-record output.  Written as a plain expression: "my ... unless"
# (a my() with a statement modifier) has undefined behaviour per perlsyn.
my $lump = ( $nolump || $split ) ? undef : 1;
$verbose = !$quiet;

# look for help request
pod2usage(2) if $help || !$ok;

# keep SOURCEID as-is and change FORMAT for SeqIO types;
# note SeqIO uses file.suffix to guess type; not useful here
$SOURCEID = $FORMAT;
# Case-insensitive so the spellings used in the POD ("Uniprot") and on the
# command line ("uniprot", "TrEMBL") are all routed to the swiss parser.
$FORMAT = "swiss" if $FORMAT =~ /UniProt|trembl/i;
$verbose = 1 if ($DEBUG);

307

308

# Set up the helper objects used for the conversion.
my $unflattener = Bio::SeqFeature::Tools::Unflattener->new();    # for ensembl genomes (-trust_grouptag=>1);
if ($ethresh) {
    # user-supplied --ethresh overrides the unflattener's own error threshold
    $unflattener->error_threshold($ethresh);
}
if ($DEBUG) {
    $unflattener->verbose(1);
}
# $unflattener->group_tag('gene') if($FORMAT =~ /embl/i) ; #? ensembl only?
# ensembl parsing is still problematic, forget this

my $tm  = Bio::SeqFeature::Tools::TypeMapper->new();
my $idh = Bio::SeqFeature::Tools::IDHandler->new();

# dgg: fall back to "region" when --typesource was not given
# (should really parse from FT.source contents below)
$source_type = "region" unless $source_type;

# Filled in further down: feature-type => SO term map, and synonym map.
#my $FTSOmap = $tm->FT_SO_map();
my ( $FTSOmap, $FTSOsynonyms );

324

325

# "--sofile live": fetch the current so.obo from SO_URL into a temporary
# file and point $SO_FILE at it.  If the download fails, $SO_FILE is
# cleared so the script carries on without an ontology file.
if (defined($SO_FILE) && $SO_FILE eq 'live') {
    print "\nDownloading the latest SO file from ".SO_URL."\n\n";
    use LWP::UserAgent;    # NB: "use" is compile-time; loaded even if this branch never runs
    my $ua = LWP::UserAgent->new(timeout => 30);
    my $request = HTTP::Request->new(GET => SO_URL);
    my $response = $ua->request($request);

    # Ask the response object whether it succeeded instead of pattern
    # matching "200" anywhere in the status text (a failure message that
    # happens to contain "200" used to be treated as success).
    if ($response->is_success) {
        use File::Temp qw/ tempfile /;
        # UNLINK => 1: remove the temporary file when the program exits;
        # previously it was left behind on every run.
        my ($fh, $fn) = tempfile(UNLINK => 1);
        print {$fh} $response->content
            or die "Could not write downloaded SO file to $fn: $!\n";
        # Close explicitly so buffered data is on disk (and write errors
        # are reported) before the ontology parser opens the file by name.
        close $fh
            or die "Could not close downloaded SO file $fn: $!\n";
        $SO_FILE = $fn;
    }
    else {
        print "Couldn't download SO file online...skipping validation.\n"
            . "HTTP Status was " . $response->status_line . "\n";
        # Always clear the setting, whether or not the message could be printed.
        undef $SO_FILE;
    }
}

344

345

# Build the feature-type => SO term lookup ($FTSOmap) and, when an
# ontology file is available, the synonym => [term names] lookup
# ($FTSOsynonyms).  Both are then published via $TYPE_MAP / $SYN_MAP.
if ($SO_FILE) {
    my $obo_reader = Bio::OntologyIO->new( -format => "obo", -file => $SO_FILE );
    $ONTOLOGY = $obo_reader->next_ontology();

    my ( %name_of, %names_for_synonym );
    foreach my $term ( $ONTOLOGY->get_all_terms ) {
        my $term_name = $term->name;

        # every ontology term maps to itself
        $name_of{$term_name} = $term_name;

        # each synonym collects the names of all terms that declare it
        foreach my $synonym ( $term->each_synonym ) {
            push @{ $names_for_synonym{$synonym} }, $term_name;
        }
    }

    $FTSOmap      = \%name_of;
    $FTSOsynonyms = \%names_for_synonym;

    # Fill gaps from TypeMapper's built-in table; entries read from the
    # ontology take precedence.
    my $builtin_map = $tm->FT_SO_map();
    foreach my $ft_key ( keys %$builtin_map ) {
        $FTSOmap->{$ft_key} ||= $builtin_map->{$ft_key};
    }
}
else {
    # No ontology file: use TypeMapper's built-in table, taking (and
    # removing) the first element where a value is a reference.
    my $builtin_map = $tm->FT_SO_map();
    foreach my $ft_key ( keys %$builtin_map ) {
        my $so_type = $builtin_map->{$ft_key};
        $FTSOmap->{$ft_key} = ref($so_type) ? shift @$so_type : $so_type;
    }
}

$TYPE_MAP = $FTSOmap;
$SYN_MAP  = $FTSOsynonyms;

380

381

382

# #convert $FTSOmap undefined to valid SO : moved to TypeMapper->map_types( -undefined => "region")

383

384

# stringify filter list if applicable ($filter stays undef when no
# --filter was given).  Written as a ternary: "my ... if COND" is a my()
# with a statement modifier, whose behaviour perlsyn leaves undefined.
my $filter = @filter ? join( ' ', @filter ) : undef;

# determine input files
my $stdin = 0;    # dgg: let dir == stdin == '-' for pipe use
if ( $dir && ( $dir eq '-' || $dir eq 'stdin' ) ) {
    # read a single stream from standard input
    $stdin = 1;
    $dir   = '';
    @files = ('stdin');
}
elsif ($dir) {
    die "$dir is not a directory\n" unless -d $dir;

    # Lexical directory handle instead of the global bareword DIR.
    opendir my $dir_handle, $dir or die "could not open $dir for reading: $!";
    # keep every directory entry whose name contains ".gb"
    @files = map { "$dir/$_" } grep { /\.gb.*/ } readdir $dir_handle;
    closedir $dir_handle;
}
else {
    # no --dir: the remaining command-line arguments are the input files
    @files = @ARGV;
    $dir   = '';
}

# we should have some files by now
pod2usage(2) unless @files;

409

410

411

# Decide where output goes: standard output, an explicit directory
# (created when missing), or by default the input directory / current
# directory.
my $stdout = 0;    # dgg: let outdir == stdout == '-' for pipe use
if ( !$outdir ) {
    $outdir = $dir || '.';
}
elsif ( $outdir eq '-' || $outdir eq 'stdout' ) {

    # Everything goes down one stream, so these options are switched off
    # after warning about each one that was requested.
    warn("std. output chosen: cannot split\n")  if $split;
    warn("std. output chosen: cannot zip\n")    if $zip;
    warn("std. output chosen: cannot nolump\n") if $nolump;

    $stdout = 1;
    $lump   = 1;
    $split  = 0;
    $zip    = 0;    # unless we pipe stdout thru gzip
}
elsif ( !-e $outdir ) {
    mkdir($outdir) or die "could not create directory $outdir: $!\n";
}

424

425

for my $file ( @files ) {

426

# dgg ; allow 'stdin' / '-' input ?

427

chomp $file;

428

die "$! $file" unless($stdin || -e $file);

429

print "# Input: $file\n" if($verbose);

430

431

my ($lump_fh, $lumpfa_fh, $outfile, $outfa);

432

if ($stdout) {

433

$lump_fh= *STDOUT; $lump="stdout$$";

434

$outfa= "stdout$$.fa"; # this is a temp file ... see below

435

open $lumpfa_fh, ">$outfa" or die "Could not create a lump outfile called ($outfa) because ($!)\n";

436

437

} elsif ( $lump ) {

438

my ($vol,$dirs,$fileonly) = File::Spec->splitpath($file);

439

$lump = File::Spec->catfile($outdir, $fileonly.'.gff');

440

($outfa = $lump) =~ s/\.gff/\.fa/;

441

open $lump_fh, ">$lump" or die "Could not create a lump outfile called ($lump) because ($!)\n";

442

open $lumpfa_fh, ">$outfa" or die "Could not create a lump outfile called ($outfa) because ($!)\n";

443

444

}

445

446

# open input file, unzip if req'd

447

if ($stdin) {

448

*FH= *STDIN;

449

} elsif ( $file =~ /\.gz/ ) {

450

open FH, "gunzip -c $file |";

451

}

452

else {

453

open FH, "<$file";

454

}

455

456

my $in = Bio::SeqIO->new(-fh => \*FH, -format => $FORMAT, -debug=>$DEBUG);

457

my $gffio = Bio::Tools::GFF->new( -noparse => 1, -gff_version => $GFF_VERSION );

458

459

while ( my $seq = $in->next_seq() ) {

460

my $seq_name = $seq->accession_number;

461

my $end = $seq->length;

462

my @to_print;

463

464

# arrange disposition of GFF output

465

$outfile = $lump || File::Spec->catfile($outdir, $seq_name.'.gff');

466

my $out;

467

468

if ( $lump ) {

469

$outfile = $lump;

470

$out = $lump_fh;

471

}

472

else {

473

$outfile = File::Spec->catfile($outdir, $seq_name.'.gff');

474

open $out, ">$outfile";

475

}

476

477

# filter out unwanted features

478

my $source_feat= undef;

479

my @source= filter($seq); $source_feat= $source[0];

480

481

($source_type,$source_feat)=

482

getSourceInfo( $seq, $source_type, $source_feat ) ;

483

# always; here we build main prot $source_feat; # if @source;

484

485

# abort if there are no features

486

warn "$seq_name has no features, skipping\n" and next

487

if !$seq->all_SeqFeatures;

488

489

490

$FTSOmap->{'source'} = $source_type;

491

## $FTSOmap->{'CDS'}= $PROTEIN_TYPE; # handle this in gene_features

492

493

# construct a GFF header

494

# add: get source_type from attributes of source feature? chromosome=X tag

495

# also combine 1st ft line here with source ft from $seq ..

496

my($header,$info)= gff_header($seq_name, $end, $source_type, $source_feat);

497

print $out $header;

498

print "# working on $info\n" if($verbose);

499

500

# unflatten gene graphs, apply SO types, etc; this also does TypeMapper ..

501

unflatten_seq($seq);

502

503

# Note that we use our own get_all_SeqFeatures function

504

# to rescue cloned exons

505

506

@GFF_LINE_FEAT = ();

507

for my $feature ( get_all_SeqFeatures($seq) ) {

508

509

my $method = $feature->primary_tag;

510

next if($SOURCEID =~/UniProt|swiss|trembl/i && $method ne $source_type);

511

512

$feature->seq_id($seq->id) unless($feature->seq_id);

513

$feature->source_tag($SOURCEID);

514

515

# dgg; need to convert some Genbank to GFF tags: note->Note; db_xref->Dbxref;

516

## also, pull any GO:000 ids from /note tag and put into Ontology_term

517

maptags2gff($feature);

518

519

# current gene name. The unflattened gene features should be in order so any

520

# exons, CDSs, etc that follow will belong to this gene

521

my $gene_name;

522

if ( $method eq 'gene' || $method eq 'pseudogene' ) {

523

@to_print= print_held($out, $gffio, \@to_print);

524

$gene_id = $gene_name= gene_name($feature);

525

} else {

526

$gene_name= gene_name($feature);

527

}

528

529

#?? should gene_name from /locus_tag,/gene,/product,/transposon=xxx

530

# be converted to or added as Name=xxx (if not ID= or as well)

531

## problematic: convert_to_name ($feature); # drops /locus_tag,/gene, tags

532

convert_to_name($feature);

533

534

## dgg: extended to protein|polypeptide

535

## this test ($feature->has_tag('gene') ||) is not good: repeat_regions over genes

536

## in yeast have that genbank tag; why?

537

## these include pseudogene ...

538

539

## Note we also have mapped types to SO, so these RNA's are now transcripts:

540

# pseudomRNA => "pseudogenic_transcript",

541

# pseudotranscript" => "pseudogenic_transcript",

542

# misc_RNA=>'processed_transcript',

543

544

warn "#at: $method $gene_id/$gene_name\n" if $DEBUG;

545

546

547

|| ( $gene_id && $gene_name eq $gene_id ) ) {

548

549

my $action = gene_features($feature, $gene_id, $gene_name); # -1, 0, 1, 2 result

550

if ($action == GM_DUP_PART) {

551

# ignore, this is dupl. exon with new parent ...

552

553

} elsif ($action == GM_NOT_PART) {

554

add_generic_id( $feature, $gene_name, "nocount");

555

my $gff = $gffio->gff_string($feature);

556

push @GFF_LINE_FEAT, $feature;

557

#print $out "$gff\n" if $gff;

558

559

} elsif ($action > 0) {

560

# hold off print because exon etc. may get 2nd, 3rd parents

561

@to_print= print_held($out, $gffio, \@to_print) if ($action == GM_NEW_TOPLEVEL);

562

push(@to_print, $feature);

563

}

564

565

}

566

567

# otherwise handle as generic feats with IDHandler labels

568

else {

569

add_generic_id( $feature, $gene_name, "");

570

my $gff= $gffio->gff_string($feature);

571

push @GFF_LINE_FEAT, $feature;

572

#print $out "$gff\n" if $gff;

573

}

574

}

575

576

# don't like doing this after others; do after each new gene id?

577

@to_print= print_held($out, $gffio, \@to_print);

578

579

gff_validate(@GFF_LINE_FEAT);

580

581

for my $feature (@GFF_LINE_FEAT) {

582

my $gff= $gffio->gff_string($feature);

583

print $out "$gff\n" if $gff;

584

}

585

586

# deal with the corresponding DNA

587

my ($fa_out,$fa_outfile);

588

my $dna = $seq->seq;

589

if($dna || %proteinfa) {

590

$method{'RESIDUES'} += length($dna);

591

$dna =~ s/(\S{60})/$1\n/g;

592

$dna .= "\n";

593

594

if ($split) {

595

$fa_outfile = $outfile;

596

$fa_outfile =~ s/gff$/fa/;

597

open $fa_out, ">$fa_outfile" or die $!;

598

print $fa_out ">$seq_name\n$dna" if $dna;

599

foreach my $aid (sort keys %proteinfa) {

600

my $aa= delete $proteinfa{$aid};

601

$method{'RESIDUES(tr)'} += length($aa);

602

$aa =~ s/(\S{60})/$1\n/g;

603

print $fa_out ">$aid\n$aa\n";

604

}

605

606

}

607

else {

608

## problem here when multiple GB Seqs in one file; all FASTA needs to go at end of $out

609

## see e.g. Mouse: mm_ref_chr19.gbk has NT_082868 and NT_039687 parts in one .gbk

610

## maybe write this to temp .fa then cat to end of lumped gff $out

611

print $lumpfa_fh ">$seq_name\n$dna" if $dna;

612

foreach my $aid (sort keys %proteinfa) {

613

my $aa= delete $proteinfa{$aid};

614

$method{'RESIDUES(tr)'} += length($aa);

615

$aa =~ s/(\S{60})/$1\n/g;

616

print $lumpfa_fh ">$aid\n$aa\n";

617

}

618

}

619

620

%proteinfa=();

621

}

622

623

if ( $zip && !$lump ) {

624

system "gzip -f $outfile";

625

system "gzip -f $fa_outfile" if($fa_outfile);

626

$outfile .= '.gz';

627

$fa_outfile .= '.gz' if $split;

628

}

629

630

# print "\n>EOF\n" if($stdout); #?? need this if summary goes to stdout after FASTA

631

print "# GFF3 saved to $outfile" unless( !$verbose || $stdout || $lump);

632

print ($split ? "; DNA saved to $fa_outfile\n" : "\n") unless($stdout|| $lump);

633

634

# dgg: moved to after all inputs; here it prints cumulative sum for each record

635

#if ( $summary ) {

636

# print "# Summary:\n# Feature\tCount\n# -------\t-----\n";

637

638

# for ( keys %method ) {

639

# print "# $_ $method{$_}\n";

640

# }

641

# print "# \n";

642

# }

643

644

}

645

646

print "# GFF3 saved to $outfile\n" if( $verbose && $lump);

647

if ( $summary ) {

648

print "# Summary:\n# Feature\tCount\n# -------\t-----\n";

649

for ( keys %method ) {

650

print "# $_ $method{$_}\n";

651

}

652

print "# \n";

653

}

654

655

## FIXME for piped output w/ split FA files ...

656

close($lumpfa_fh) if $lumpfa_fh;

657

if (!$split && $outfa && $lump_fh) {

658

print $lump_fh "##FASTA\n"; # GFF3 spec

659

open $lumpfa_fh, $outfa or warn "reading FA $outfa: $!";

660

while( <$lumpfa_fh>) {

661

print $lump_fh $_;

662

} # is $lump_fh still open?

663

close($lumpfa_fh);

664

unlink($outfa);

665

}

666

667

668

if ( $zip && $lump ) {

669

system "gzip -f $lump";

670

}

671

672

close FH;

673

}

674

675

676

677

678

679

# Rank a feature type name for output ordering; lower ranks sort first.
#   1  name contains "gene"
#   2  name contains "RNA" or "transcript"
#   3  name contains "protein" or "peptide", and anything unrecognised
#   4  name contains "exon" or "CDS"
# The patterns are tried in this order, so a name matching several of
# them (e.g. "tRNA_gene") receives the rank of the first match.
sub typeorder {
    my ($type) = @_;

    my @rank_rules = (
        [ qr/gene/            => 1 ],
        [ qr/RNA|transcript/  => 2 ],
        [ qr/protein|peptide/ => 3 ],
        [ qr/exon|CDS/        => 4 ],
    );

    for my $rule (@rank_rules) {
        my ( $pattern, $rank ) = @$rule;
        return $rank if $type =~ $pattern;
    }

    return 3;    # default before exon (smallest part)
}

686

687

# Comparison routine for sort(): orders features by typeorder() rank of
# their primary tag and, for equal ranks, alphabetically by primary tag.
# Reads the two features from the package variables $a and $b, as sort()
# sets them.
sub sort_by_feattype {
    my ( $at, $bt ) = ( $a->primary_tag, $b->primary_tag );

    # "||" rather than "or": "return X or Y" parses as "(return X) or Y",
    # which would leave the tie-break unreachable.
    return ( typeorder($at) <=> typeorder($bt) )
        || ( $at cmp $bt );
    ## or ($a->name() cmp $b->name());
}

693

694

# Flush the held gene-model features: sort them by feature type and move
# them, in that order, onto the global @GFF_LINE_FEAT queue.
#   $out      - output handle (not written to here)
#   $gffio    - GFF writer, used to render each feature
#   $to_print - array ref of held features; emptied by this call
# Returns an empty list so callers can reset their hold list with
#   @to_print = print_held(...);
sub print_held {
    my ( $out, $gffio, $to_print ) = @_;

    return unless @$to_print;

    # put exons after mRNA, otherwise chado loader chokes
    @$to_print = sort sort_by_feattype @$to_print;

    while ( my $held = shift @$to_print ) {

        # The rendered line is not used here; the feature is only queued.
        my $rendered = $gffio->gff_string($held);
        push @GFF_LINE_FEAT, $held;
    }

    return ();
}

705

706

# Rename a feature's tags according to %TAG_MAP (source tag => new tag).
# Every value of a mapped tag is moved to the new tag.  For the 'note'
# tag, GO identifiers found in the values are additionally copied to an
# Ontology_term tag.
sub maptags2gff {
    my $f = shift;

    ## should copy/move locus_tag to Alias, if not ID/Name/Alias already
    # but see below /gene /locus_tag usage
    foreach my $old_tag ( keys %TAG_MAP ) {
        next unless $f->has_tag($old_tag);

        my $new_tag = $TAG_MAP{$old_tag};
        my @values  = $f->get_tag_values($old_tag);
        $f->remove_tag($old_tag);
        $f->add_tag_value( $new_tag, @values );

        next unless $old_tag eq 'note';

        ## ncbi syntax in CDS /note is now '[goid GO:0005886]' OR '[goid 0005624]'
        # Normalise the bare-number form on the local copies first, then
        # harvest every GO id.
        s/\[goid (\d+)/\[goid GO:$1/g for @values;
        my @go_ids = map { m/(GO:\d+)/g } @values;
        $f->add_tag_value( 'Ontology_term', @go_ids ) if @go_ids;
    }
}

728

729

730

# Render one annotation object as plain text.  Returns undef for an
# undefined argument and plain strings unchanged.
sub _annotation_as_text {
    my ($ann) = @_;
    return undef unless defined $ann;
    return $ann  unless ref $ann;
    return $ann->display_text if $ann->can('display_text');
    return $ann->value        if $ann->can('value');
    return "$ann";
}

# Determine the source type of a sequence record and make sure it has a
# source feature carrying the record-level information.
#   $seq         - sequence object being converted
#   $source_type - current source type
#   $sf          - existing source feature, or a false value to have one
#                  built and attached to $seq
# The source feature receives ID, Note, organism, comment1, date and
# Dbxref tags (each only under the conditions coded below), plus one tag
# per annotation listed in %AnnotTagMap.
# Returns ($source_type, $sf) in list context, $source_type otherwise.
sub getSourceInfo {
    my ( $seq, $source_type, $sf ) = @_;

    my $is_swiss = ( $SOURCEID =~ /UniProt|swiss|trembl/i );
    my $is_gene  = ( $SOURCEID =~ /entrezgene/i );
    my $is_rich  = ( ref($seq) =~ /RichSeq/ );
    my $seq_name = $seq->accession_number();

    unless ($sf) {    # make one
        # NOTE(review): "eneg" looks like a deliberate placeholder (the
        # alternatives "gene" / "region" are commented out) - confirm.
        $source_type =
              $is_swiss ? $PROTEIN_TYPE
            : $is_gene  ? "eneg"          # "gene" # "region" #
            : $is_rich  ? $seq->molecule
            :             $source_type;
        $sf = Bio::SeqFeature::Generic->direct_new();

        my $len = $seq->length();
        $len = 1 if ( $len < 1 );
        my $start = 1;    ##$start= $len if ($len<1);
        my $loc =
            $seq->can('location')
            ? $seq->location()
            : Bio::Location::Simple->new( -start => $start, -end => $len );
        $sf->location($loc);
        $sf->primary_tag($source_type);
        $sf->source_tag($SOURCEID);
        $sf->seq_id($seq_name);
        #? $sf->display_name($seq->id()); ## Name or Alias ?
        $sf->add_tag_value( Alias => $seq->id() );    # unless id == accession
        $seq->add_SeqFeature($sf);
    }

    if ( $sf->has_tag("chromosome") ) {
        $source_type = "chromosome";
        my ($chrname) = $sf->get_tag_values("chromosome");
        ## PROBLEM with Name <> ID, RefName for Gbrowse; use Alias instead
        ## e.g. Mouse chr 19 has two IDs in NCBI genbank now
        $sf->add_tag_value( Alias => $chrname );
    }

    # pull GB Comment, Description for source ft ...
    # add reference - can be long, not plain string...
    warn "# $SOURCEID:$seq_name fields = ", join(",", $seq->annotation->get_all_annotation_keys()),"\n" if $DEBUG;
    # GenBank fields: keyword,comment,reference,date_changed
    # Entrezgene fields 850293 =ALIAS_SYMBOL,RefSeq status,chromosome,SGD,dblink,Entrez Gene Status,OntologyTerm,LOCUS_SYNONYM

    # is this just for main $seq object or for all seqfeatures ?
    my %AnnotTagMap = (
        'gene_name'     => 'Alias',
        'ALIAS_SYMBOL'  => 'Alias',    # Entrezgene
        'LOCUS_SYNONYM' => 'Alias',    #?
        'symbol'        => 'Alias',
        'synonym'       => 'Alias',
        'dblink'        => 'Dbxref',
        'product'       => 'product',
        'Reference'     => 'reference',
        'OntologyTerm'  => 'Ontology_term',
        'comment'       => 'Note',
        'comment1'      => 'Note',
        # various map-type locations
        # gene accession tag is named per source db !??
        # 'Index terms' => keywords ??
    );

    # Description: first "desc" annotation, else the sequence description.
    # The annotation is fetched in list context; combining the call
    # directly with "||" would yield the number of annotations instead.
    my ($desc_ann) = $seq->annotation->get_Annotations("desc");
    my $desc = _annotation_as_text($desc_ann) || $seq->desc();

    # Date: "dates" annotation, else "update-date" annotation, else the
    # first date of a rich sequence.  Written as separate steps because
    # "A || B || $is_rich ? X : ()" parses as "(A || B || $is_rich) ? X : ()".
    my ($date_ann) = $seq->annotation->get_Annotations("dates");
    ($date_ann) = $seq->annotation->get_Annotations("update-date")
        unless $date_ann;
    my $date = _annotation_as_text($date_ann);
    ($date) = $seq->get_dates() if ( !$date && $is_rich );

    my ($comment) = $seq->annotation->get_Annotations("comment");
    my ($species) = $seq->annotation->get_Annotations("species");
    if (   !$species
        && $seq->can('species')
        && defined $seq->species()
        && $seq->species()->can('binomial') )
    {
        $species = $seq->species()->binomial();
    }

    # update source feature with main GB fields
    $sf->add_tag_value( ID => $seq_name ) unless $sf->has_tag('ID');
    $sf->add_tag_value( Note => $desc ) if($desc && ! $sf->has_tag('Note'));
    $sf->add_tag_value( organism => $species ) if($species && ! $sf->has_tag('organism'));
    $sf->add_tag_value( comment1 => $comment ) if(!$is_swiss && $comment && ! $sf->has_tag('comment1'));
    $sf->add_tag_value( date => $date ) if($date && ! $sf->has_tag('date'));

    $sf->add_tag_value( Dbxref => $SOURCEID.':'.$seq_name ) if $is_swiss || $is_gene;

    # Copy the mapped annotations onto the source feature, splitting each
    # rendered annotation on "," or ";" (followed by optional spaces).
    foreach my $atag ( sort keys %AnnotTagMap ) {
        my $gtag = $AnnotTagMap{$atag};
        next unless ($gtag);

        foreach my $ann ( $seq->annotation->get_Annotations($atag) ) {
            next unless ref $ann;
            my @values =
                  $ann->can('get_all_values') ? split( /[,;] */, join ";", $ann->get_all_values )
                : $ann->can('display_text')   ? split( /[,;] */, $ann->display_text )
                : $ann->can('value')          ? split( /[,;] */, $ann->value )
                :                               ();
            $sf->add_tag_value( $gtag => $_ ) foreach @values;
        }
    }

    return (wantarray) ? ( $source_type, $sf ) : $source_type;    #?
}

839

840

841

# Attach a gene-model feature to the current gene by adding ID / Parent /
# Derives_from tags, and report how the caller should treat it.
#   $f          - the feature
#   $gene_id    - ID of the gene currently being assembled (may be empty)
#   $genelinkID - gene name derived from this feature's own tags
# Returns one of the GM_* constants:
#   GM_NEW_TOPLEVEL - feature starts a new top-level model
#   GM_NEW_PART     - feature was linked into the current model
#   GM_NOT_PART     - feature does not belong to the current model
#   GM_DUP_PART     - exon object already seen
# Updates the file-level state $tnum, $rnum, $rna_id, $ncrna_id, %method,
# %exonpar and %seen.
sub gene_features {
    my ( $f, $gene_id, $genelinkID ) = @_;

    my $type = $f->primary_tag;
    $method{$type}++;

    # --- gene: start a new model and reset the transcript bookkeeping
    if ( $type =~ /gene/ ) {
        $f->add_tag_value( ID => $gene_id ) unless ( $f->has_tag('ID') );    # check is same value!?
        $tnum     = $rnum   = 0;
        $ncrna_id = $rna_id = '';
        return GM_NEW_TOPLEVEL;
    }

    # --- mRNA: child of the gene, numbered .t01, .t02, ...
    if ( $type =~ /mRNA/ ) {
        return GM_NOT_PART unless $gene_id;
        return GM_NOT_PART if ( $genelinkID && $genelinkID ne $gene_id );

        ( $rna_id = $gene_id ) =~ s/gene/mRNA/;
        $rna_id .= '.t0' . ++$tnum;
        $f->add_tag_value( ID     => $rna_id );
        $f->add_tag_value( Parent => $gene_id );
        return GM_NEW_PART;
    }

    # --- other RNA / transcript
    ## misc_RNA here; missing exons ... flattener problem?
    # all of {t,nc,sn}RNA can have gene models now
    ## but problem in Worm chr: mRNA > misc_RNA > CDS with same locus tag
    ## CDS needs to use mRNA, not misc_RNA, rna_id ...
    ## also need to fix cases where tRNA,... lack a 'gene' parent: make this one top-level
    if ( $type =~ /RNA|transcript/ ) {
        if ($gene_id) {
            return GM_NOT_PART if ( $genelinkID && $genelinkID ne $gene_id );

            ( $ncrna_id = $gene_id ) =~ s/gene/ncRNA/;
            $ncrna_id .= '.r0' . ++$rnum;
            $f->add_tag_value( Parent => $gene_id );
            $f->add_tag_value( ID     => $ncrna_id );
            return GM_NEW_PART;
        }

        # No current gene: this RNA becomes the top-level feature.
        unless ( $f->has_tag('ID') ) {
            if ($genelinkID) {
                $f->add_tag_value( ID => $genelinkID );
            }
            else {
                $idh->generate_unique_persistent_id($f);
            }
        }
        ($ncrna_id) = $f->get_tag_values('ID');
        return GM_NEW_TOPLEVEL;
        # this feat now acts as gene-top-level; need to print @to_print to flush prior exons?
    }

    # --- exon: can belong to any kind of RNA
    if ( $type =~ /exon/ ) {
        return GM_NOT_PART unless ( $rna_id || $ncrna_id );
        return GM_NOT_PART if ( $genelinkID && $genelinkID ne $gene_id );

        ## we are getting duplicate Parents here, which chokes chado loader, with reason...
        ## problem is when mRNA and ncRNA have same exons, both ids are active, called twice
        ## check all Parents
        for my $expar ( $rna_id, $ncrna_id ) {
            next unless ($expar);

            # Skip a parent this exon already lists.
            if ( $exonpar{$expar} and $f->has_tag('Parent') ) {
                next if grep { $expar eq $_ } $f->get_tag_values('Parent');
            }
            $exonpar{$expar}++;
            $f->add_tag_value( Parent => $expar );
            # last; #? could be both
        }

        # now we can skip cloned exons
        # dgg note: multiple parents get added and printed for each unique exon
        return GM_DUP_PART if ++$seen{$f} > 1;
        return GM_NEW_PART;
    }

    # --- CDS / protein: hangs off the current mRNA
    if ( $type =~ /CDS|protein|polypeptide/ ) {
        return GM_NOT_PART unless $rna_id;    ## ignore $ncrna_id ??
        return GM_NOT_PART if ( $genelinkID && $genelinkID ne $gene_id );    #??

        ( my $pro_id = $rna_id ) =~ s/\.t/\.p/;

        if ( !$CDSkeep && $type =~ /CDS/ ) {

            # Convert the CDS into a protein feature derived from the mRNA.
            $f->primary_tag($PROTEIN_TYPE);

            ## duplicate problem is Location ..
            # Collapse a split location to its overall start..end span.
            if ( $f->location->isa("Bio::Location::SplitLocationI") ) {
                my ( $span_start, $span_end ) = ( -1, 0 );
                foreach my $part ( $f->location->each_Location ) {
                    $span_start = $part->start
                        if ( $span_start < 0 || $span_start > $part->start );
                    $span_end = $part->end if ( $span_end < $part->end );
                }
                $f->location( Bio::Location::Simple->new(
                        -start => $span_start, -end => $span_end, -strand => $f->strand) );
            }

            $f->add_tag_value( Derives_from => $rna_id );
        }
        else {
            $f->add_tag_value( Parent => $rna_id );
        }

        $f->add_tag_value( ID => $pro_id );
        move_translation_fasta( $f, $pro_id );
        return GM_NEW_PART;
    }

    # --- region: retyped and attached to the gene
    if ( $type =~ /region/ ) {
        $f->primary_tag('gene_component_region');
        $f->add_tag_value( Parent => $gene_id );
        return GM_NEW_PART;
    }

    # --- anything else: plain child of the current gene, if there is one
    return GM_NOT_PART unless $gene_id;
    $f->add_tag_value( Parent => $gene_id );

    ## return GM_DUP_PART if /exon/ && ++$seen{$f} > 1;

    return GM_NEW_PART;
}

952

953

## was generic_features > add_generic_id

954

# Make sure a non-gene-model feature has an ID tag.
#   $f       - the feature
#   $ft_name - candidate ID (gene name derived by the caller), may be empty
#   $flags   - option string; "nocount" suppresses the %method tally
# An existing ID is kept.  Otherwise the ID is, in order of preference:
# "<primary_tag>:<value>" when the feature has a tag named like its
# primary tag, $ft_name, or an ID generated by the ID handler.
# For CDS features the translation is then moved to the FASTA store.
sub add_generic_id {
    my ( $f, $ft_name, $flags ) = @_;

    my $method = $f->primary_tag;
    $method{$method}++ unless ( $flags =~ /nocount/ );    ## double counts GM_NOT_PART from above

    unless ( $f->has_tag('ID') ) {
        if ( $f->has_tag($method) ) {
            my ($name) = $f->get_tag_values($method);
            $f->add_tag_value( ID => "$method:$name" );
        }
        elsif ($ft_name) {    # is this unique ?
            $f->add_tag_value( ID => $ft_name );
        }
        else {
            $idh->generate_unique_persistent_id($f);
        }
    }

    move_translation_fasta( $f, ( $f->get_tag_values("ID") )[0] )
        if ( $method =~ /CDS/ );

    # return $io->gff_string($f);
}

978

979

# Move a feature's protein translation out of its tags and into the
# global %proteinfa store, keyed by $ft_id.  The translation tag is
# replaced by the marker "length.<n>".  Nothing happens when $ft_id is
# false, the feature has no translation tag, or the value is empty or
# already starts with "length".
sub move_translation_fasta {
    my ( $f, $ft_id ) = @_;

    return unless $ft_id && $f->has_tag('translation');

    my ($aa) = $f->get_tag_values("translation");
    return unless $aa && $aa !~ /^length/;

    $proteinfa{$ft_id} = $aa;
    $f->remove_tag("translation");
    $f->add_tag_value( "translation", "length." . length($aa) );    # hack for odd chado gbl problem
}

990

991

# Build the GFF header text for one sequence record.
#   $name        - sequence name / accession
#   $end         - sequence length
#   $source_type - type of the source feature (defaults to "region")
#   $source_feat - source feature, if any
# With a source feature, its organism / Note / date values are added as
# comment lines, and the whole header is blanked once a header has
# already been produced ($didheader).  Without one, a feature line for
# the sequence itself is appended.
# Returns ($header, $info) in list context, $header otherwise; $info is a
# short description such as "type:name, organism, ...".
sub gff_header {
    my ( $name, $end, $source_type, $source_feat ) = @_;
    $source_type ||= "region";

    my $info = "$source_type:$name";
    my $head = join '',
        "##gff-version $GFF_VERSION\n",
        "##sequence-region $name 1 $end\n",
        "# conversion-by bp_genbank2gff3.pl\n";

    if ($source_feat) {
        ## dgg: these header comment fields are not useful when have multi-records, diff organisms
        for my $key (qw(organism Note date)) {
            next unless $source_feat->has_tag($key);

            my ($value) = $source_feat->get_tag_values($key);
            next unless $value;

            $head .= "# $key $value\n";
            $info .= ", $value";
        }
        $head = "" if $didheader;
    }
    else {
        $head .= "$name\t$SOURCEID\t$source_type\t1\t$end\t.\t.\t.\tID=$name\n";
    }

    $didheader++;
    return (wantarray) ? ( $head, $info ) : $head;
}

1018

1019

# Rebuild the gene models of a sequence with the global $unflattener,
# then map its feature types through map_types().
# An unflattening failure is reported (details to STDERR, a "# ..." note
# to STDOUT) but does not stop processing.
# Returns 0 when the sequence ends up without features; otherwise the
# result of map_types().
sub unflatten_seq {
    my $seq = shift;

    ## print "# working on $source_type:", $seq->accession, "\n";
    my $uh_oh = "Possible gene unflattening error with " . $seq->accession_number .
        ": consult STDERR\n";

    eval {
        $unflattener->unflatten_seq( -seq       => $seq,
                                     -noinfer   => $noinfer,
                                     -use_magic => 1 );
    };

    # deal with unflattening errors
    # Copy $@ first so the method calls below cannot overwrite it.
    my $error = $@;
    if ($error) {
        warn $seq->accession_number . " Unflattening error:\n";
        warn "Details: $error\n";
        print "# " . $uh_oh;
    }

    return 0 if !$seq || !$seq->all_SeqFeatures;

    # map feature types to the sequence ontology
    ## $tm->map_types_to_SO( -seq => $seq );
    #$tm->map_types( -seq => $seq, -type_map => $FTSOmap, -undefined => "region" ); #dgg

    map_types(
        $tm,
        -seq       => $seq,
        -type_map  => $FTSOmap,
        -syn_map   => $FTSOsynonyms,
        -undefined => "region"
    );    #nml
}

1054

1055

# Collect the 'source' features of a sequence and, when a filter string
# is set, strip unwanted features from the sequence.
# A feature is dropped when its primary tag occurs (case-insensitively,
# as a substring) in the file-level $filter string.  The tag is compared
# literally, so characters special to regular expressions are harmless,
# and a feature with an empty primary tag is never dropped.
# Returns the list of 'source' features, including filtered-out ones.
sub filter {
    my $seq = shift;
    ## return unless $filter;
    my @feats;
    my @sources;    # dgg; pick source features here; only 1 always?

    if ($filter) {
        my $filter_lc = lc $filter;
        for my $f ( $seq->remove_SeqFeatures ) {
            my $m = $f->primary_tag;
            push @sources, $f if ( $m eq 'source' );    # dgg? but leave in @feats ?

            my $unwanted = length($m) && index( $filter_lc, lc $m ) >= 0;
            push @feats, $f unless $unwanted;
        }
        $seq->add_SeqFeature($_) foreach @feats;
    }
    else {
        for my $f ( $seq->get_SeqFeatures ) {
            my $m = $f->primary_tag;
            push @sources, $f if ( $m eq 'source' );    # dgg? but leave in @feats ?
        }
    }

    return @sources;
}

1076

1077

1078

# The default behaviour of Bio::FeatureHolderI:get_all_SeqFeatures

1079

# changed to filter out cloned features. We have to implement the old

1080

# method. These two subroutines were adapted from the v1.4 Bio::FeatureHolderI

1081

# Return every feature of $seq in depth-first order: each top-level
# feature is followed by all of its nested sub-features.
sub get_all_SeqFeatures {
    my $seq = shift;

    my @flat = map {
        my @descendants;
        _add_flattened_SeqFeatures( \@descendants, $_ );
        ( $_, @descendants );
    } $seq->get_SeqFeatures;

    return @flat;
}

1091

1092

# Derive a gene identifier from a feature's tags.
# The first tag present, in the order locus_tag, gene, ID, product,
# transposon, decides the result.  For product and transposon the value
# is used only when it contains no space (otherwise it is a description,
# not a name) and later tags are NOT consulted.
# Returns the identifier, or '' when none could be derived.
sub gene_name {
    my $g       = shift;
    my $gene_id = '';    # zero it;

    # [ tag, true when a value containing a space must be rejected ]
    # ID is for non-Genbank > Entrezgene.
    ## See Unflattener comment:
    # on rare occasions, records will have no /gene or /locus_tag
    # but it WILL have /product tags. These serve the same purpose
    # for grouping. For an example, see AY763288 (also in t/data)
    # eg. product=tRNA-Asp ; product=similar to crooked neck protein
    ## dgg; also handle transposon=xxxx ID/name
    # ID=GenBank:repeat_region:NC_004353:1278337:1281302;transposon=HeT-A{}1685;Dbxref=FLYBASE:FBti0059746
    my @candidates = (
        [ locus_tag  => 0 ],
        [ gene       => 0 ],
        [ ID         => 0 ],
        [ product    => 1 ],
        [ transposon => 1 ],
    );

    for my $candidate (@candidates) {
        my ( $tag, $reject_descriptions ) = @$candidate;
        next unless $g->has_tag($tag);

        my ($value) = $g->get_tag_values($tag);
        $gene_id = $value unless $reject_descriptions && $value =~ / /;
        last;
    }

    return $gene_id;
}

1126

1127

# same list as gene_name .. change tag to generic Name

1128

# Give a feature a generic Name tag, taken from the first tag present in
# the order gene, locus_tag, product, transposon, ID.
#   gene / locus_tag     - the tag is removed and its value becomes Name
#   product / transposon - the value becomes Name only when it contains
#                          no space (otherwise it is a description, and
#                          no Name tag is added); the tag itself is kept
#   ID                   - the value is copied to Name
# Returns the name taken from gene, locus_tag, product or transposon, or
# '' otherwise (including the ID case).
sub convert_to_name {
    my $g       = shift;
    my $gene_id = '';    # zero it;

    if ( $g->has_tag('gene') ) {
        ($gene_id) = $g->get_tag_values('gene');
        $g->remove_tag('gene');
        $g->add_tag_value( 'Name', $gene_id );
    }
    elsif ( $g->has_tag('locus_tag') ) {
        ($gene_id) = $g->get_tag_values('locus_tag');
        $g->remove_tag('locus_tag');
        $g->add_tag_value( 'Name', $gene_id );
    }
    elsif ( $g->has_tag('product') ) {
        my ($name) = $g->get_tag_values('product');
        unless ( $name =~ / / ) {    # a description not name
            $gene_id = $name;
            ## $g->remove_tag('product');
            $g->add_tag_value( 'Name', $gene_id );
        }
    }
    elsif ( $g->has_tag('transposon') ) {
        my ($name) = $g->get_tag_values('transposon');
        unless ( $name =~ / / ) {    # a description not name
            $gene_id = $name;
            ## $g->remove_tag('transposon');
            $g->add_tag_value( 'Name', $gene_id );
        }
    }
    elsif ( $g->has_tag('ID') ) {
        my ($name) = $g->get_tag_values('ID');
        $g->add_tag_value( 'Name', $name );
    }

    return $gene_id;
}

1162

1163

1164

# Append all sub-features nested below $feat to @$arrayref, depth first
# ($feat itself is not added).  Children are obtained with
# get_SeqFeatures for a Bio::FeatureHolderI and with sub_SeqFeature for
# any other Bio::SeqFeatureI; anything else is reported with a warning
# and contributes nothing.
sub _add_flattened_SeqFeatures {
    my ( $arrayref, $feat ) = @_;

    my $child_accessor =
          $feat->isa("Bio::FeatureHolderI") ? 'get_SeqFeatures'
        : $feat->isa("Bio::SeqFeatureI")    ? 'sub_SeqFeature'
        :                                     undef;

    unless ($child_accessor) {
        warn ref($feat)." is neither a FeatureHolderI nor a SeqFeatureI. ".
            "Don't know how to flatten.";
        return;
    }

    foreach my $child ( $feat->$child_accessor ) {
        push @$arrayref, $child;
        _add_flattened_SeqFeatures( $arrayref, $child );
    }
}

1185

1186

# Map the primary tag of one feature, or of every feature of a sequence,
# to a Sequence Ontology term.
#   $self - object providing _rearrange, throw and typemap (the file
#           calls this function with the TypeMapper $tm)
# Named arguments:
#   -feature   a single feature to map (ignored when -seq is given)
#   -seq       a Bio::SeqI whose features are all mapped
#   -type_map  hash ref: primary tag => term name, [name, SO id] pair or
#              code ref; defaults to $self->typemap
#   -syn_map   hash ref: SO synonym => array ref of term names, used to
#              guess a term from the feature's /note when no direct
#              mapping exists
#   -undefined term to use when the mapping is the string 'undefined' or
#              when no term can be found
# For each feature the steps are: direct lookup in the type map; user
# rules from the config file when $CONF is set; and, if there is still no
# term and a synonym map was given, exact then fuzzy matching of the
# notes against the synonyms and term names.
sub map_types {

    my ( $self, @args ) = @_;

    my ( $sf, $seq, $type_map, $syn_map, $undefmap ) =
        $self->_rearrange( [ qw(FEATURE SEQ TYPE_MAP SYN_MAP UNDEFINED) ],
                           @args );

    if ( !$sf && !$seq ) {
        $self->throw("you need to pass in either -feature or -seq");
    }

    my @sfs = ($sf);
    if ($seq) {
        $seq->isa("Bio::SeqI") || $self->throw("$seq NOT A SeqI");
        @sfs = $seq->get_all_SeqFeatures;
    }
    $type_map = $type_map || $self->typemap;    # dgg: was type_map;

    foreach my $feat (@sfs) {

        $feat->isa("Bio::SeqFeatureI")    || $self->throw("$feat NOT A SeqFeatureI");
        $feat->isa("Bio::FeatureHolderI") || $self->throw("$feat NOT A FeatureHolderI");

        my $primary_tag = $feat->primary_tag;

        #if ($primary_tag =~ /^pseudo(.*)$/) {
        #    $primary_tag = $1;
        #    $feat->primary_tag($primary_tag);
        #}

        # --- step 1: direct lookup in the type map
        my $mtype = $type_map->{$primary_tag};
        if ($mtype) {
            if ( ref($mtype) ) {
                if ( ref($mtype) eq 'ARRAY' ) {
                    my $soID;
                    ( $mtype, $soID ) = @$mtype;

                    if ( $soID && ref($ONTOLOGY) ) {
                        my ($term) = $ONTOLOGY->find_terms( -identifier => $soID );
                        $mtype = $term->name if $term;
                    }
                    # if SO ID is undefined AND we have an ontology to search, we want to delete
                    # the feature type hash entry in order to force a fuzzy search
                    elsif ( !defined $soID && ref($ONTOLOGY) ) {
                        undef $mtype;
                        delete $type_map->{$primary_tag};
                    }
                    elsif ( $undefmap && $mtype eq 'undefined' ) {    # dgg
                        $mtype = $undefmap;
                    }

                    # Cache the resolved name so the next feature of this
                    # type takes the plain-scalar path.
                    $type_map->{$primary_tag} = $mtype if $mtype;
                }
                elsif ( ref($mtype) eq 'CODE' ) {
                    $mtype = $mtype->($feat);
                }
                else {
                    $self->throw('must be scalar or CODE ref');
                }
            }
            elsif ( $undefmap && $mtype eq 'undefined' ) {    # dgg
                $mtype = $undefmap;
            }
            $feat->primary_tag($mtype);
        }

        # --- step 2: user-supplied rules from the config file
        if ($CONF) {
            conf_read();
            my %perfect_matches;

            # The hashes are walked with keys() rather than each(): the
            # inner loop is left early via "next RULE", and an abandoned
            # each() iterator would resume part-way through the hash for
            # the next rule or feature.
            for my $p_tag ( keys %$YAML ) {
                my $rules = $YAML->{$p_tag};
                RULE:
                for my $rule (@$rules) {
                    for my $tags (@$rule) {
                        for my $tag ( keys %$tags ) {
                            my $values = $tags->{$tag};
                            for my $value (@$values) {
                                if ( $feat->has_tag($tag) ) {
                                    for ( $feat->get_tag_values($tag) ) {
                                        next RULE unless $_ =~ /\Q$value\E/;
                                    }
                                }
                                elsif ( $tag eq 'primary_tag' ) {
                                    next RULE unless $value eq
                                        $feat->primary_tag;
                                }
                                elsif ( $tag eq 'location' ) {
                                    next RULE unless $value eq
                                        $feat->start . '..' . $feat->end;
                                }
                                else { next RULE }
                            }
                        }
                    }
                    $perfect_matches{$p_tag}++;
                }
            }

            if ( scalar( keys %perfect_matches ) == 1 ) {
                $mtype = $_ for keys %perfect_matches;
            }
            elsif ( scalar( keys %perfect_matches ) > 1 ) {
                warn "There are conflicting rules in the config file for the" .
                    " following types: ";
                warn "\t$_\n" for keys %perfect_matches;
                warn "Until conflict resolution is built into the converter," .
                    " you will have to manually edit the config file to remove the" .
                    " conflict. Sorry :(. Skipping user preference for this entry";
                sleep(2);
            }
        }

        # --- step 3: guess from the feature's notes
        if ( !$mtype && $syn_map ) {
            if ( $feat->has_tag('note') ) {

                my @all_matches;

                my @note = $feat->each_tag_value('note');

                for my $k ( keys %$syn_map ) {

                    if ( $k =~ /"(.+)"/ ) {

                        my $syn = $1;

                        for my $note (@note) {

                            # look through the notes to see if the description
                            # is an exact match for synonyms
                            if ( $syn eq $note ) {

                                my @map = @{ $syn_map->{$k} };

                                my $best_guess = $map[0];

                                # Make sure there is a last bucket to put
                                # the exact match in: dereferencing
                                # $all_matches[-1] of an empty array is a
                                # fatal error.
                                push @all_matches, [] unless @all_matches;
                                unshift @{ $all_matches[-1] }, [$best_guess];

                                $mtype = $MANUAL
                                    ? manual_curation( $feat, $best_guess, \@all_matches )
                                    : $best_guess;

                                print '#' x 78 . "\nGuessing the proper SO term for GenBank"
                                    . " entry:\n\n" . GenBank_entry($feat) . "\nis:\t$mtype\n"
                                    . '#' x 78 . "\n\n";

                            }
                            else {
                                # check both primary tag and and note against
                                # SO synonyms for best matching description

                                SO_fuzzy_match( $k, $primary_tag, $note, $syn, \@all_matches );
                            }

                        }
                    }
                }

                #unless ($mtype) {
                for my $note (@note) {
                    for my $name ( values %$type_map ) {
                        # check primary tag against SO names for best matching
                        # descriptions //NML also need to check against
                        # definition && camel case split terms

                        SO_fuzzy_match( $name, $primary_tag, $note, $name, \@all_matches );
                    }
                }

                if ( scalar(@all_matches) && !$mtype ) {

                    my $top_matches = first { defined $_ } @{ $all_matches[-1] };

                    my $best_guess = $top_matches->[0];

                    # if guess has quotes, it is a synonym term. we need to
                    # look up the corresponding name term
                    # otherwise, guess is a name, so we can use it directly
                    if ( $best_guess =~ /"(.+)"/ ) {

                        $best_guess = $syn_map->{$best_guess}->[0];

                    }

                    @RETURN = @all_matches;
                    $mtype = $MANUAL
                        ? manual_curation( $feat, $best_guess, \@all_matches )
                        : $best_guess;

                    print '#' x 78 . "\nGuessing the proper SO term for GenBank"
                        . " entry:\n\n" . GenBank_entry($feat) . "\nis:\t$mtype\n"
                        . '#' x 78 . "\n\n";

                }
            }
            $mtype ||= $undefmap;
            $feat->primary_tag($mtype);
        }
    }

}

1388

1389

# SO_fuzzy_match($candidate, $primary_tag, $note, $SO_terms,
#                $best_matches_ref, $modifier)
#
# Scores how well the words of one SO name/synonym ($SO_terms) agree with the
# words of a feature (its primary tag plus one note) and files $candidate
# under that score.
#
#   $candidate        - value stored in the score table when anything matches
#   $primary_tag      - feature type; split on space/underscore, then into
#                       camelCase pieces
#   $note             - free text; split on space/underscore, with the
#                       characters ; : . , stripped from each word
#   $SO_terms         - SO name or synonym; split on space/underscore
#   $best_matches_ref - array ref used as a two-level table:
#                       ->[$plus][$minus] is a list of candidates
#   $modifier         - optional regex prefix (e.g. '(?i)'), defaults to ''
#
# Every (SO word, feature word) pair adds one to $plus when the SO word
# contains the feature word and one to $minus otherwise.  Nothing is stored
# when $plus is zero.
sub SO_fuzzy_match {

    my ($candidate, $primary_tag, $note, $SO_terms, $best_matches_ref,
        $modifier) = @_;

    $modifier ||= '';

    # unique feature words
    my %feat_terms;

    for my $chunk ( split(" |_", $primary_tag) ) {
        $feat_terms{$_} = 1
            for $chunk =~ /(?:[A-Z]|[a-z])(?:[A-Z]+|[a-z]*)(?=$|[A-Z]|[;:.,])/g;
    }

    for my $chunk ( split(" |_", $note) ) {
        (my $word = $chunk) =~ s/[;:.,]//g;
        $feat_terms{$word} = 1;
    }

    # unique SO words
    my %SO_terms = map { $_ => 1 } split(" |_", $SO_terms);

    # Doubled separators and punctuation-only words leave empty tokens
    # behind.  An empty feature word would build an empty pattern, which Perl
    # resolves to "the last successfully matched pattern", so the score would
    # depend on hash iteration order.  Empty tokens are not words: drop them.
    delete $feat_terms{''};
    delete $SO_terms{''};

    # fuzzy match works on a simple point system. When 2 words match,
    # the $plus counter adds one. When they don't, the $minus counter adds
    # one. This is used to sort similar matches together. Better matches
    # are found at the end of the array, near the top.

    # NML: can we improve best match by using synonym tags
    # EXACT,RELATED,NARROW,BROAD?

    my ($plus, $minus) = (0, 0);

    for my $st (keys %SO_terms) {
        for my $ft (keys %feat_terms) {
            if ($st =~ m/$modifier\Q$ft\E/) { $plus++ } else { $minus++ }
        }
    }

    push @{ $best_matches_ref->[$plus][$minus] }, $candidate if $plus;

}

1445

1446

# manual_curation($feat, $default_opt, $all_matches)
#
# Turns the score table built by SO_fuzzy_match into a numbered list of SO
# terms, shows it through options_cycle() and returns the chosen term name.
#
#   $feat        - feature being curated (passed on to options_cycle)
#   $default_opt - term returned when the user answers 'skip' or 'auto'
#   $all_matches - array ref, ->[$plus][$minus] = list of candidates;
#                  candidates containing double quotes are synonyms and are
#                  expanded to names through $SYN_MAP
#
# Answering 'auto' also clears $MANUAL.
sub manual_curation {

    my ($feat, $default_opt, $all_matches) = @_;

    # Work on a shallow copy: walking undefined rows autovivifies them.
    my @score_rows = @$all_matches;

    # convert all SO synonyms into names and filter
    # all matches into unique term list because
    # synonyms can map to multiple duplicate names
    my (@unique_SO_terms, %seen);

    for my $row (reverse @score_rows) {
        for my $cell (@$row) {
            for my $candidate (@$cell) {
                if ($candidate =~ /"(.+)"/) {
                    for my $name (@{ $SYN_MAP->{$candidate} }) {
                        next if $seen{$name}++;
                        push @unique_SO_terms, $name;
                    }
                }
                elsif (!$seen{$candidate}++) {
                    push @unique_SO_terms, $candidate;
                }
            }
        }
    }

    my $s = scalar(@unique_SO_terms);

    my $more =
        "[a]uto : automatic input (selects best guess for remaining entries)\r" .
        "[f]ind : search for other SO terms matching your query (e.g. f gene)\r" .
        "[i]nput : add a specific term\r" .
        "[r]eset : reset to the beginning of matches\r" .
        "[s]kip : skip this entry (selects best guess for this entry)\r"
    ;

    # paging commands are only offered when there is more than one page
    if ($s > OPTION_CYCLE) {
        $more .=
            "[n]ext : view the next ".OPTION_CYCLE." terms\r" .
            "[p]rev : view the previous ".OPTION_CYCLE." terms";
    }

    my $msg =
        "The converter found $s possible matches for the following GenBank entry: ";

    my $directions =
        "Type a number to select the SO term that best matches"
        . " the genbank entry, or use any of the following options:\r" . '_' x 76 . "\r$more";

    # lookup filtered list to pull out definitions
    my $choice = 0;
    my @options;

    for my $name (@unique_SO_terms) {
        for my $term ($ONTOLOGY->find_terms(-name => $name)) {
            my %term = (
                'name'    => [ $term->name ],
                'def'     => [ $term->definition ],
                'synonym' => [ $term->each_synonym ],
            );
            push @options,
                [ ++$choice, $term->name, ($term->definition || 'none'),
                  \%term, $term->each_synonym ];
        }
    }

    my $option = options_cycle(0, OPTION_CYCLE, $msg, $feat, $directions,
        $default_opt, @options);

    return $default_opt if $option eq 'skip';

    if ($option eq 'auto') {
        $MANUAL = 0;
        return $default_opt;
    }

    return $option;

}

1521

1522

# options_cycle($start, $stop, $msg, $feat, $directions, $best_guess, @opt)
#
# Prints one page of candidate SO terms next to the GenBank entry, then reads
# commands from STDIN until one of them produces a result.
#
#   $start, $stop - requested window into @opt (0-based, $stop exclusive);
#                   out-of-range requests are wrapped/clamped below
#   $msg          - headline shown above the entry
#   $feat         - feature being curated
#   $directions   - help text shown beside the entry ("\r" separated)
#   $best_guess   - default term, shown in the page header
#   @opt          - rows of [number, name, definition, \%term, @synonyms]
#
# Returns the chosen term name, the strings 'skip' or 'auto', or whatever a
# nested manual_curation()/options_cycle() call returns.  Falls off the end
# when STDIN is exhausted.
#
# The formats are compiled with string eval and refer to the lexicals
# $msg_copy, $entry, $dir_copy, $nL, $nmL, $descL, $nR, $nmR and $descR by
# name, so those variables must keep their names.
sub options_cycle {

    my ($start, $stop, $msg, $feat, $directions, $best_guess, @opt) = @_;

    #NML: really should only call GenBank_entry once. Will need to change
    #method to return array & shift off header
    my $entry = GenBank_entry($feat, "\r");

    my $total = scalar(@opt);

    ($start,$stop) = (0, OPTION_CYCLE)
        if ( ($start < 0) && ($stop > 0) );

    ($start,$stop) = (0, OPTION_CYCLE)
        if ( ( ($stop - $start) < OPTION_CYCLE ) && $stop < $total);

    ($start,$stop) = ($total - OPTION_CYCLE, $total) if $start < 0;
    ($start,$stop) = (0, OPTION_CYCLE) if $start >= $total;

    # With fewer than OPTION_CYCLE options the wrap-around above still leaves
    # $start negative, which would index @opt from its end.
    $start = 0 if $start < 0;

    $stop = $total if $stop > $total;

    my $dir_copy = $directions;
    my $msg_copy = $msg;
    my $format = "format STDOUT = \n" .
        '-' x 156 . "\n" .
        '^' . '<' x 77 . '| Available Commands:' . "\n" .
        '$msg_copy' . "\n" .
        '-' x 156 . "\n" .
        ' ' x 78 . "|\n" .
        '^' . '<' x 77 . '| ^' . '<' x 75 . '~' . "\n" .
        '$entry' . ' ' x 74 . '$dir_copy,' . "\n" .
        (' ' x 20 . '^' . '<' x 57 . '| ^' . '<' x 75 . '~' . "\n" .
        ' ' x 20 . '$entry,' . ' ' x 53 . '$dir_copy,' . "\n") x 1000 . ".\n";

    {
        # eval throws redefined warning that breaks formatting.
        # Turning off warnings just for the eval to fix this.
        no warnings 'redefine';
        eval $format;
    }

    write;

    print '-' x 156 . "\n" .
        'Showing results ' . ( $stop ? ( $start + 1 ) : $start ) .
        " - $stop of possible SO term matches: (best guess is \"$best_guess\")" .
        "\n" . '-' x 156 . "\n";

    # options are laid out two per row
    for (my $i = $start; $i < $stop; $i+=2) {

        my ($left, $right) = @opt[$i,$i+1];

        my ($nL, $nmL, $descL, $termL, @synL) = @$left;

        #odd numbered lists can cause fatal undefined errors, so check
        #to make sure we have data
        my ($nR, $nmR, $descR, $termR, @synR) = ref($right) ? @$right : (undef, undef, undef);

        my $format = "format STDOUT = \n";

        $format .=
            ' ' x 78 . "|\n" .

            '@>>: name: ^' . '<' x 64 . '~' . ' |' .
            ( ref($right) ? ('@>>: name: ^' . '<' x 64 . '~' ) : '' ) . "\n" .
            '$nL,' . ' ' x 7 . '$nmL,' .
            ( ref($right) ? (' ' x 63 . '$nR,' . ' ' x 7 . "\$nmR,") : '' ) . "\n" .

            ' ' x 11 . '^' . '<' x 61 . '...~' . ' |' .
            (ref($right) ? (' ^' . '<' x 61 . '...~') : '') . "\n" .
            ' ' x 11 . '$nmL,' .
            (ref($right) ? (' ' x 74 . '$nmR,') : '') . "\n" .
            #' ' x 78 . '|' . "\n" .

            ' def: ^' . '<' x 65 . ' |' .
            (ref($right) ? (' def: ^' . '<' x 64 . '~') : '') . "\n" .
            ' ' x 11 . '$descL,' .
            (ref($right) ? (' ' x 72 . '$descR,') : '') . "\n" .

            (' ^' . '<' x 65 . ' |' .
            (ref($right) ? (' ^' . '<' x 64 . '~') : '') . "\n" .
            ' ' x 11 . '$descL,' .
            (ref($right) ? (' ' x 72 . '$descR,') : '') . "\n") x 5 .

            ' ^' . '<' x 61 . '...~ |' .
            (ref($right) ? (' ^' . '<' x 61 . '...~') : '') . "\n" .
            ' ' x 11 . '$descL,' .
            (ref($right) ? (' ' x 72 . '$descR,') : '') . "\n" .

            ".\n";

        {
            # eval throws redefined warning that breaks formatting.
            # Turning off warnings just for the eval to fix this.
            no warnings 'redefine';
            eval $format;
        }
        write;

    }
    print '-' x 156 . "\nenter a command:";

    while (<STDIN>) {

        (my $input = $_) =~ s/\s+$//;

        if ($input =~ /^\d+$/) {
            if ( $input && defined $opt[$input-1] ) {
                return $opt[$input-1]->[1]
            } else {
                print "\nThat number is not an option. Please enter a valid number.\n:";
            }
        } elsif ( $input =~ /^n/i || $input =~ /next/i ) {
            return options_cycle($start + OPTION_CYCLE, $stop + OPTION_CYCLE, $msg,
                $feat, $directions, $best_guess, @opt)
        } elsif ( $input =~ /^p/i || $input =~ /prev/i ) {
            return options_cycle($start - OPTION_CYCLE, $stop - OPTION_CYCLE, $msg,
                $feat, $directions, $best_guess, @opt)
        } elsif ( $input =~ /^s/i || $input =~ /skip/i ) { return 'skip'
        } elsif ( $input =~ /^a/i || $input =~ /auto/i ) { return 'auto'
        } elsif ( $input =~ /^r/i || $input =~ /reset/i ) {
            return manual_curation($feat, $best_guess, \@RETURN );
        } elsif ( $input =~ /^f/i || $input =~ /find/i ) {

            my ($query, @query_results);

            if ($input =~ /(?:^f|find)\s+?(.*?)$/) { $query = $1;
            } else {

                #do a SO search
                print "Type your search query\n:";
                while (<STDIN>) { chomp($query = $_); last }
            }

            for (keys(%$TYPE_MAP), keys(%$SYN_MAP)) {
                SO_fuzzy_match($_, $query, '', $_, \@query_results, '(?i)');
            }

            return manual_curation($feat, $best_guess, \@query_results);

        } elsif ( $input =~ /^i/i || $input =~ /input/i ) {

            #NML fast input for later
            #my $query;
            #if ($input =~ /(?:^i|input)\s+?(.*?)$/) { $query = $1 };

            #manual input
            print "Type the term you want to use\n:";
            while (<STDIN>) {
                chomp(my $input = $_);

                if (! $TYPE_MAP->{$input}) {

                    print "\"$input\" doesn't appear to be a valid SO term. Are ".
                        "you sure you want to use it? (y or n)\n:";

                    while (<STDIN>) {

                        chomp(my $choice = $_);

                        if ($choice eq 'y') {
                            my $picked = _ask_save_preference($feat, $input);
                            return $picked if defined $picked;
                        } elsif ($choice eq 'n') {
                            return options_cycle($start, $stop, $msg, $feat,
                                $directions, $best_guess, @opt)
                        } else {
                            print "\nDidn't recognize that command. Please " .
                                "type y or n.\n:"
                        }
                    }

                } else {
                    my $picked = _ask_save_preference($feat, $input);
                    return $picked if defined $picked;
                }

            }
        } else {
            print "\nDidn't recognize that command. Please re-enter your choice.\n:"
        }
    }

}

# _ask_save_preference($feat, $term)
#
# Asks whether the hand-entered $term should be stored in the config file and
# reads y/n answers from STDIN.  'y' runs curation_save($feat, $term) first;
# both 'y' and 'n' return $term.  Returns undef when STDIN runs out before a
# valid answer, so the caller can keep reading in its own loop.
sub _ask_save_preference {

    my ($feat, $term) = @_;

    # The original built this string without printing it, so the user was
    # left at a silent prompt.
    print "\nWould you like to save your preference for " .
        "future use (so you don't have to redo manual " .
        "curation for this feature everytime you run " .
        "the converter)? (y or n)\n";

    while (<STDIN>) {

        chomp(my $choice = $_);

        if ($choice eq 'y') {
            curation_save($feat, $term);
            return $term;
        } elsif ($choice eq 'n') {
            return $term
        } else {
            print "\nDidn't recognize that command. Please " .
                "type y or n.\n:"
        }
    }

    return undef;
}

1752

1753

# GenBank_entry($f, $delimiter, $num)
#
# Renders a feature as GenBank-style text and returns it as one string.
#
#   $f         - feature object; primary_tag, start, end, all_tags and
#                each_tag_value are called on it
#   $delimiter - line terminator, defaults to "\n"
#   $num       - when true, the header carries the markers [1] and [2] and
#                the qualifier lines are appended by words_tag(); otherwise
#                each tag value becomes one /tag="value" line (or a bare
#                /tag line for the value '_no_value')
sub GenBank_entry {
    my ($f, $delimiter, $num) = @_;

    $delimiter ||= "\n";

    my $type = $f->primary_tag;

    # header line: type, padding, location
    my $entry = $num
        ? ' [1] ' . $type . ' ' x (12 - length $type) . ' [2] '
        : ' ' x 5 . $type . ' ' x (15 - length $type);

    $entry .= $f->start . '..' . $f->end . "$delimiter";

    if ($num) {
        words_tag($f, \$entry);
        return $entry;
    }

    for my $tag ($f->all_tags) {
        for my $val ( $f->each_tag_value($tag) ) {
            my $qualifier = $val eq '_no_value'
                ? "/$tag"
                : "/$tag=\"$val\"";
            $entry .= ' ' x 20 . $qualifier . $delimiter;
        }
    }

    return $entry;
}

1787

1788

1789

# gff_validate(@features)
#
# Indexes the features by their Parent and ID tag values, checks every
# parent/child link with parent_validate() (only when $SO_FILE is set) and
# finally resolves duplicate IDs with id_validate().
sub gff_validate {
    warn "Validating GFF file\n" if $DEBUG;
    my @feat = @_;

    my (%parent2child, %all_ids, %descendants, %reserved);

    # tag name => index that collects the features carrying each value
    my %index_for = ( 'Parent' => \%parent2child, 'ID' => \%all_ids );

    for my $f (@feat) {
        for my $tag_name ('Parent', 'ID') {
            next unless $f->has_tag($tag_name);
            for my $tag_value ($f->get_tag_values($tag_name)) {
                push @{ $index_for{$tag_name}->{$tag_value} }, $f;
            }
        }
    }

    if ($SO_FILE) {
        for my $parentID (keys %parent2child) {
            parent_validate($parentID, $parent2child{$parentID}, \%all_ids,
                \%descendants, \%reserved);
        }
    }

    id_validate(\%all_ids, \%reserved);
}

1810

1811

# parent_validate($parentID, $aChildren, $hAll, $hDescendants, $hReserved)
#
# Checks that every feature in @$aChildren may be a child of some feature
# that carries the ID $parentID.
#
#   $parentID     - the Parent tag value being checked
#   $aChildren    - array ref of features whose Parent tag holds $parentID
#   $hAll         - hash ref, ID => array ref of features with that ID
#   $hDescendants - cache, parent type => { descendant type name => count }
#   $hReserved    - out: {$parentID}{$parent}{parent|children} for every
#                   accepted parent/child pairing
#
# Children without an acceptable parent lose $parentID from their Parent tag,
# receive a validation_error tag and, if they have no ID, get one from
# gene_name().  Accepted parents are removed from $hAll->{$parentID}.
sub parent_validate {
    my ($parentID, $aChildren, $hAll, $hDescendants, $hReserved) = @_;

    my $aParents = $hAll->{$parentID};

    # No feature owns this ID.  $aParents is undef here, so it must be tested
    # before it is dereferenced (an rvalue @$undef dies under strict refs).
    unless ($aParents && scalar(@$aParents)) {
        for my $child (@$aChildren) {
            _detach_from_parent($child, $parentID,
                "feature tried to add Parent tag, but no Parent found with ID $parentID",
                $hAll);
        }
        return;
    }

    my $par = join(',', map { $_->primary_tag } @$aParents);
    warn scalar(@$aParents)." POSSIBLE PARENT(S): $par" if $DEBUG;

    #NML manual curation needs to happen here

    my %parentsToRemove;

    CHILD:
    for my $child (@$aChildren) {
        my $childType = $child->primary_tag;

        warn "WORKING ON $childType at ".$child->start.' to '.$child->end
            if $DEBUG;

        for (my $i = 0; $i < scalar(@$aParents); $i++) {
            my $parent = $aParents->[$i];
            my $parentType = $parent->primary_tag;

            warn "CHECKING $childType against $parentType" if $DEBUG;

            #cache descendants so we don't have to do repeat searches
            unless ($hDescendants->{$parentType}) {

                for my $term ($ONTOLOGY->find_terms(
                    -name => $parentType
                ) ) {
                    $hDescendants->{$parentType}{$_->name}++
                        for $ONTOLOGY->get_descendant_terms($term);
                }

                # NML: hopefully temporary fix.
                # SO doesn't consider exon/CDS to be a child of mRNA
                # even though common knowledge dictates that they are
                # This cheat fixes the false positives for now
                if ($parentType eq 'mRNA') {
                    $hDescendants->{$parentType}{'exon'} = 1;
                    $hDescendants->{$parentType}{'CDS'} = 1;
                }

            }

            warn "\tCAN $childType at " . $child->start . ' to ' . $child->end .
                " be a child of $parentType?" if $DEBUG;
            if ($hDescendants->{$parentType}{$childType}) {
                warn "\tYES, $childType can be a child of $parentType" if $DEBUG;

                #NML need to deal with multiple children matched to multiple different
                #parents. This model only assumes the first parent id that matches a child will
                #be the reserved feature.

                $hReserved->{$parentID}{$parent}{'parent'} = $parent;
                push @{$hReserved->{$parentID}{$parent}{'children'}}, $child;

                #mark parent for later removal from all IDs
                #so we don't accidentally change any parents

                $parentsToRemove{$i}++;

                next CHILD;
            }
        }

        #NML shouldn't have to check this; somehow child can lose Parent
        #it's happening W3110
        #need to track this down
        if ( $child->has_tag('Parent') ) {

            warn "\tNO, @{[$child->primary_tag]} cannot be a child of $parentID"
                if $DEBUG;

            warn 'VALIDATION ERROR '.$child->primary_tag." at ".$child->start .
                ' to ' . $child->end . " cannot be a child of ID $parentID"
                if $DEBUG;

            _detach_from_parent($child, $parentID,
                "feature cannot be a child of $parentID", $hAll);
        }

    }

    # Remove from the highest index down: each splice shifts everything after
    # it, so removing in hash-key order would drop the wrong features.
    splice(@$aParents, $_, 1) for sort { $b <=> $a } keys %parentsToRemove;
}

# _detach_from_parent($child, $parentID, $message, $hAll)
#
# Removes $parentID from $child's Parent tag (other Parent values are kept),
# records $message as a validation_error tag and makes sure the child has an
# ID, registering a newly generated one in $hAll.  Does nothing when the
# child has no Parent tag.
sub _detach_from_parent {
    my ($child, $parentID, $message, $hAll) = @_;

    return unless $child->has_tag('Parent');

    my %parents = map { $_ => 1 } $child->get_tag_values('Parent');
    delete $parents{$parentID};
    my @parents = keys %parents;

    $child->add_tag_value( validation_error => $message );

    $child->remove_tag('Parent');

    unless ($child->has_tag('ID')) {
        my $id = gene_name($child);
        $child->add_tag_value('ID', $id);
        push @{$hAll->{$id}}, $child
    }

    $child->add_tag_value('Parent', @parents) if @parents;
}

1944

1945

# id_validate($hAll, $hReserved)
#
# Makes IDs unique.
#
#   $hAll      - hash ref, ID => array ref of features sharing that ID
#   $hReserved - hash ref filled by parent_validate():
#                {$parentID}{$key}{parent|children}
#
# Pass 1: for every ID, the first feature keeps it (unless the ID is
# reserved, in which case nothing is shifted off) and the remaining ones are
# renamed with id_uniquify().
# Pass 2: for every reserved ID claimed by several parents, all but the
# first key are renamed and their children's Parent tags are pointed at the
# value id_uniquify() returns.
sub id_validate {
    my ($hAll, $hReserved) = @_;

    for my $id (keys %$hAll) {

        #since 1 feature can have this id,
        #let's just shift it off and uniquify
        #the rest (unless it's reserved)
        my $sharing = $hAll->{$id};

        shift @$sharing unless $hReserved->{$id};

        for my $feat (@$sharing) {
            id_uniquify(0, $id, $feat, $hAll);
        }
    }

    for my $parentID (keys %$hReserved) {

        my $claims = $hReserved->{$parentID};

        # the first key keeps the ID, every other one is renamed
        my ($kept, @renamed) = keys %$claims;

        for my $k (@renamed) {
            my $parent    = $claims->{$k}{'parent'};
            my $aChildren = $claims->{$k}{'children'};

            my $value = id_uniquify(0, $parentID, $parent, $hAll);

            for my $child (@$aChildren) {

                my %parents;
                $parents{$_}++ for $child->get_tag_values('Parent');

                $child->remove_tag('Parent');

                # swap the old parent ID for the new one
                delete $parents{$parentID};
                $parents{$value}++;

                $child->add_tag_value('Parent', keys %parents);
            }

        }
    }
}

1986

1987

# id_uniquify($count, $value, $feat, $hAll)
#
# Derives an unused ID for $feat from $value and assigns it.
#
#   $count - 0 on the first call; incremented on each retry
#   $value - the ID to start from (first call) or the last candidate tried
#   $feat  - feature to rename
#   $hAll  - hash ref, ID => array ref; used to test whether a candidate is
#            taken and updated with the new ID
#
# Candidates are tried in this order:
#   "$value.<primary_tag>", "$value.<primary_tag>.1", "...2", ...
# The original $value is kept on the feature as an Alias tag.
#
# Returns the ID that was actually assigned.
sub id_uniquify {
    my ($count, $value, $feat, $hAll) = @_;

    warn "UNIQUIFYING $value" if $DEBUG;

    if (! $count) {
        $feat->add_tag_value(Alias => $value);
        $value .= ('.' . $feat->primary_tag)
    } elsif ($count == 1) {
        $value .= ".$count"
    } else {
        # Replace the whole previous counter.  chop() removed one character
        # only, which corrupted the suffix once the counter had two digits.
        $value =~ s/\d+\z/$count/;
    }
    $count++;

    warn "ENDED UP WITH $value" if $DEBUG;
    if ( $hAll->{$value} ) {
        warn "$value IS ALREADY TAKEN" if $DEBUG;
        # Hand the final ID back to the caller; the candidate held in $value
        # at this level is the one that turned out to be taken.
        return id_uniquify($count, $value, $feat, $hAll);
    }

    #warn "something's breaking ".$feat->primary_tag.' at '.$feat->start.' to '.$feat->end;
    $feat->remove_tag('ID');
    $feat->add_tag_value('ID', $value);

    # register the feature, like every other entry of the ID index
    push @{$hAll->{$value}}, $feat;

    return $value;
}

2016

2017

# conf_read()
#
# Loads the YAML config named by $CONF into $YAML.  When the file is not
# readable, or not writable, the problem is reported and conf_locate() is
# called to ask for another file before loading.
sub conf_read {

    unless (-r $CONF) {
        print("\nCannot read $CONF. Change file permissions and retry, " .
            "or enter another file\n")
            and conf_locate();
    }

    unless (-w $CONF) {
        print("\nCannot write $CONF. Change file permissions and retry, " .
            "or enter another file\n")
            and conf_locate();
    }

    $YAML = LoadFile($CONF);

}

2028

2029

# conf_create($path, $input)
#
# Creates an empty config file at $input and loads it with conf_read().
#
#   $path  - directory part of $input; must be writable
#   $input - full file name of the config to create; becomes $CONF
#
# When $path is not writable the user is sent back to conf_locate() and this
# call ends there: conf_locate() sets and reads $CONF itself, so carrying on
# would overwrite that choice with the rejected path.
sub conf_create {

    my ($path, $input) = @_;

    unless (-w $path) {
        print("Cannot write to $path. Change directory permissions and retry " .
            "or enter another save path\n")
            and conf_locate();
        return;
    }

    $CONF = $input;

    # Touch the file.  A failure is reported but not fatal: conf_read() will
    # notice the unreadable file and prompt again.
    if (open(my $fh, '>', $CONF)) {
        close($fh) or warn "Cannot close $CONF: $!\n";
    } else {
        warn "Cannot create $CONF: $!\n";
    }

    conf_read();

}

2044

2045

# conf_write(): passes $CONF and $YAML to DumpFile() to store the curation rules.
sub conf_write { DumpFile($CONF, $YAML) }

2046

2047

# conf_locate()
#
# Asks on STDIN for a config file until a usable answer is given:
#   - an existing non-directory is stored in $CONF and loaded with conf_read()
#   - a non-directory path with a file name is handed to conf_create()
#   - anything else is rejected with a message and the prompt is repeated
# Returns when a file was accepted or STDIN is exhausted.
sub conf_locate {

    print "\nEnter the location of a previously saved config, or a new " .
        "path and file name to create a new config (this step is " .
        "necessary to save any preferences)";

    print "\n\nenter a command:";

    while (<STDIN>) {
        chomp(my $input = $_);
        my ($fn, $path, $suffix) = fileparse($input, qr/\.[^.]*/);

        my $exists = -e $input;
        my $is_dir = -d $input;

        if ($exists && !$is_dir) {
            print "\nReading $input...\n";
            $CONF = $input;

            conf_read();
            last;
        }

        if (!$is_dir && $fn.$suffix) {
            print "Creating $input...\n";
            conf_create($path, $input);
            last;
        }

        if ($exists && $is_dir) {
            print "You only entered a directory. " .
                "Please enter BOTH a directory and filename\n";
        }
        else {
            print "$input does not appear to be a valid path. Please enter a " .
                "valid directory and filename\n";
        }

        print "\nenter a command:";
    }
}

2083

2084

# curation_save($feat, $input)
#
# Interactively records which parts of $feat justify the SO term $input and
# appends the resulting rule to the config file.
#
#   $feat  - feature being curated
#   $input - the SO term the user chose for it
#
# Steps: make sure a config is loaded (conf_locate/conf_read), show the
# numbered entry with directions, read a selection such as "1,3,5-7",
# "3b,f,4a-c" or "all", echo the resulting rule and ask for confirmation.
# 'y' stores the rule under $YAML->{$input} and writes the config; 'n'
# starts the whole dialogue again.
#
# The format is compiled with string eval and refers to the lexicals
# $msg_copy, $entry and $dir_copy by name.
sub curation_save {

    my ($feat, $input) = @_;

    #my $error = "Enter the location of a previously saved config, or a new " .
    # "path and file name to create a new config (this step is " .
    # "necessary to save any preferences)\n";

    if (!$CONF) {
        print "\n\n";
        conf_locate();
    } elsif (! -e $CONF) {
        print "\n\nThe config file you have chosen doesn't exist.\n";
        conf_locate();
    } else { conf_read() }

    my $entry = GenBank_entry($feat, "\r", 1);

    my $msg = "Term entered: $input";
    my $directions = "Please select any/all tags that provide evidence for the term you
have entered. You may enter multiple tags by separating them by commas/dashes
(e.g 1,3,5-7). For tags with more than one word value (i.e 'note'), you have
the option of either selecting the entire note as evidence, or specific
keywords. If a tag has multiple keywords, they will be tagged alphabetically
for selection. To select a specific keyword in a tag field, you must enter the
tag number followed by the keyword letter (e.g 3a). Multiple keywords may be
selected by entering each letter separated by commas/dashes (e.g 3b,f,4a-c). The more tags you select, the more specific the GenBank entry will have
to be to match your curation. To match the GenBank entry exactly as it
appears, type every number (start-end), or just type 'all'. Remember, once the converter saves your
preference, you will no longer be prompted to choose a feature type for any
matching entries until you edit the curation.ini file.";
    my $msg_copy = $msg;
    my $dir_copy = $directions;

    my $format = "format STDOUT = \n" .
        '-' x 156 . "\n" .
        '^' . '<' x 77 . '| Directions:' . "\n" .
        '$msg_copy' . "\n" .
        '-' x 156 . "\n" .
        ' ' x 78 . "|\n" .
        '^' . '<' x 77 . '| ^' . '<' x 75 . '~' . "\n" .
        '$entry' . ' ' x 74 . '$dir_copy,' . "\n" .
        (' ' x 15 . '^' . '<' x 62 . '| ^' . '<' x 75 . '~' . "\n" .
        ' ' x 15 . '$entry,' . ' ' x 58 . '$dir_copy,' . "\n") x 20 . ".\n";

    {
        # eval throws redefined warning that breaks formatting.
        # Turning off warnings just for the eval to fix this.
        no warnings 'redefine';
        eval $format;
    }

    write;
    print '-' x 156 . "\nenter a command:";

    my @tags = words_tag($feat);

    my $final = {};
    my $choices;
    while (<STDIN>) {

        chomp(my $choice = $_);

        # Once a rule has been built, the only valid answers are y and n.
        if (scalar(keys %$final)) {
            last if $choice =~ /^y/i;

            # Start over from scratch and let that run do the saving.
            # Falling through here would store the rejected rule as well.
            return curation_save($feat, $input) if $choice =~ /^n/i;

            print "\nInvalid selection. Please try again\n";
            next;
        }

        # 'all' is shorthand for every tag number
        $choice = join(',', 1 .. $#tags) if $choice eq 'all';

        #print "CHOICE [$choice]";

        my @selections;
        for my $part (split(/(?<=\w)[^[:alnum:]\-]+(?=\d)/, $choice)) {
            if ($part =~ /(\d+)(?:\D*)-(\d+)(.*)/) {

                my ($first, $last, $dangling_alphas) = ($1, $2, $3);

                push @selections, $first .. $last;

                alpha_expand($dangling_alphas, \@selections);

            } else {
                alpha_expand($part, \@selections);
            }
        }

        foreach my $numbers (@selections) {

            # Leading digits are the tag number, the remainder holds keyword
            # letters.  Splitting the whole string into single characters
            # turned tag 12 into tag 1.
            my ($num, $rest) = $numbers =~ /\A\W*(\d*)(.*)\z/s;
            $num = length($num) ? 0 + $num : 0;

            # NOTE(review): letters are still split one character at a time,
            # so the two-letter labels words_tag() generates past the 26th
            # word cannot be selected - confirm the intended syntax.
            my @c;
            for my $piece (split(/(?=[\w])/, $rest)) {
                (my $letters = $piece) =~ s/\W+//g;
                push @c, $letters if length $letters;
            }

            my $tag = $tags[$num];
            if (ref $tag && scalar(@c)) {
                my $no_value;
                foreach my $key (@c) {
                    if (defined $tag->{$key}) {
                        $choices .= $num . "[$key] ";
                        my ($t,$v) = @{$tag->{$key}};
                        push @{${$final->{$input}}[0]{$t}}, $v;

                    } else { $no_value++ }
                }

                # an unknown keyword selects the whole tag instead
                if ($no_value) {
                    _selection_add($tag,$final,$input,\$choices,$num);
                }

                # Normalise the separator.  Cutting a fixed two characters
                # chopped the closing "]" off keyword entries.
                if (defined $choices) {
                    $choices =~ s/,? \z//;
                    $choices .= ', ';
                }

            } elsif (ref $tag) {
                _selection_add($tag,$final,$input,\$choices,$num);
            }
        }
        $choices =~ s/,? \z// if $choices;

        # $final is always a reference, so its truth says nothing; what
        # matters is whether the selection produced any rule.
        if (scalar(keys %$final)) {
            print "\nYou have chosen the following tags:\n$choices\n";
            print "This will be written to the config file as:\n";
            print Dump($final);
            print "\nIs this correct? (y or n)\n";
        } else { print "\nInvalid selection. Please try again\n" }
    }
    push @{$YAML->{$input}}, $final->{$input};
    conf_write();
}

2232

2233

# words_tag() splits each tag value string into multiple words so that the

2234

# user can select the parts he/she wants to use for curation

2235

# it can tag 702 (a - zz) separate words; this should be enough

2236

2237

# words_tag($feat, $entry)
#
# Builds the selection table used by curation_save() and, through $entry,
# the numbered text shown to the user.
#
#   $feat  - feature object
#   $entry - optional scalar ref; one "[n] /tag="value"\r" line is appended
#            per tag value
#
# Returns a list indexed by selection number: 1 is the primary tag, 2 the
# location, 3 and up the feature's tags.  Each slot is a hash whose 'all'
# key holds [tag, whole value] and whose letter keys (a, b, ... built from
# ALPHABET) hold [tag, piece] for each piece of the value.
#
# NOTE(review): the slot number advances once per tag, not once per value,
# so several values of one tag share a number and a slot - confirm that is
# intended.
sub words_tag {

    my ($feat, $entry) = @_;

    my @tags;

    @tags[1,2] = ({'all' => ['primary_tag', $feat->primary_tag]}, {'all' => ['location', $feat->start.'..'.$feat->end]});

    my $i = 3;
    foreach my $tag ($feat->all_tags) {
        foreach my $value ($feat->each_tag_value($tag)) {

            my ($string, $tagged_string);

            my @words = split(/(?=\w+?)/, $value);

            for my $pos (0 .. $#words) {

                my $word = $words[$pos];

                (my $sanitized_word = $word) =~ s/\W+?//g;
                $string .= $word;

                # position -> letter label
                my $lead = int($pos/ALPHABET_DIVISOR);
                my $lag = $pos % ALPHABET_DIVISOR;

                my $label = $lead ? ${(ALPHABET)}[$lead-1] : '';
                $label .= $lag ? ${(ALPHABET)}[$lag] : 'a';

                $tagged_string .= " ($label) $word";

                $tags[$i]{$label} = [$tag, $sanitized_word];
            }

            # only multi-piece values are shown with their labels
            my $shown = scalar(@words) > 1 ? $tagged_string : $value;

            $$entry .= "[$i] /$tag=\"$shown\"\r";

            $tags[$i]{'all'} = [$tag, $string];
        }
        $i++;
    }

    return @tags;

}

2284

2285

# alpha_expand($dangling_alphas, $selections)
#
# Expands the keyword-letter part of a selection and appends the result to
# @$selections.
#
#   $dangling_alphas - text such as "4a-c", "b-d", "3b,f" or "b"; may be undef
#   $selections      - array ref of selections collected so far
#
# A letter range ("4a-c") becomes one entry per letter ("4a", "4b", "4c").
# When the text carries no tag number of its own, the last entry of
# @$selections is taken off and used as the number.  Text without a letter
# range is pushed unchanged if it starts with a digit, otherwise it is
# appended to the last entry.
sub alpha_expand {

    my ($dangling_alphas, $selections) = @_;

    # (\d*) keeps the whole tag number; (\d)* kept only its last digit, so
    # "12a-c" was expanded for tag 2.
    if (defined($dangling_alphas) && $dangling_alphas =~ /(\d*)([[:alpha:]]+)-([[:alpha:]]+)/) {

        my ($digit, $start, $stop) = ($1, $2, $3);

        push @$selections, $digit if length $digit;

        # letter label -> word position (inverse of the labelling below)
        my $to_position = sub {
            my @letters = split('', $_[0]);

            my $int = ${(ALPHABET_TO_NUMBER)}{$letters[0]};
            my $rem;

            if ($letters[1]) {
                $rem = ${(ALPHABET_TO_NUMBER)}{$letters[1]};
                $int++
            } else {
                $rem = $int;
                $int = 0;
            }

            my $position = $int * ALPHABET_DIVISOR;
            $position += $rem;
            return $position;
        };

        my $final_start = $to_position->($start);
        my $final_stop  = $to_position->($stop);

        my $last_number = pop @$selections;
        $last_number = '' unless defined $last_number;

        for my $pos ($final_start..$final_stop) {
            my $lead = int($pos/ALPHABET_DIVISOR);
            my $lag = $pos % ALPHABET_DIVISOR;
            my $alpha = $lead ? ${(ALPHABET)}[$lead-1] : '';
            $alpha .= $lag ? ${(ALPHABET)}[$lag] : 'a';
            push @$selections, $last_number.$alpha;
        }

    } elsif (defined($dangling_alphas)) {
        if ($dangling_alphas =~ /^\d/) {
            push @$selections, $dangling_alphas;
        } elsif ($dangling_alphas =~ /^\D/) {
            #print "$dangling_alphas ".Dumper @$selections;
            my $last_number = pop @$selections;
            $last_number ||= '';
            push @$selections, $last_number.$dangling_alphas;
            #$$selections[-1] .= $dangling_alphas;
        }
    }

}

2346

2347

# _selection_add($tag, $final, $input, $choices, $num)
#
# Selects a whole tag as evidence, unless something is already recorded for
# that tag.
#
#   $tag     - hash ref with an 'all' entry holding [tag name, full value]
#   $final   - rule under construction: ->{$input}[0]{tag name} = [values]
#   $input   - SO term the rule belongs to
#   $choices - scalar ref; "$num, " is appended when the tag is added
#   $num     - selection number shown to the user
sub _selection_add {

    my ($tag, $final, $input, $choices, $num) = @_;
    my ($tag_name, $whole_value) = @{ $tag->{'all'} };

    unless (defined $final->{$input}[0]{$tag_name}) {
        ${$choices} .= "$num, ";
        push @{ $final->{$input}[0]{$tag_name} }, $whole_value
    }

}

Older »