~ubuntu-branches/ubuntu/trusty/bioperl/trusty

Viewing changes to scripts/Bio-DB-GFF/genbank2gff3.PLS

Committer: Package Import Robot
Author(s): Charles Plessy
Date: 2013-09-22 13:39:48 UTC
mfrom: (3.1.11 sid)
Revision ID: package-import@ubuntu.com-20130922133948-c6z62zegjyp7ztou

Tags: 1.6.922-1

http://bugs.debian.org/722910

* New upstream release.
* Replaces and Breaks grinder (<< 0.5.3-3~) because of overlaping contents.
Closes: #722910
* Stop Replacing and Breaking bioperl ( << 1.6.9 ): not needed anymore.

files added:
.travis.yml

Bio/DB/IndexedBase.pm

Bio/DB/Taxonomy/greengenes.pm

Bio/DB/Taxonomy/silva.pm

Bio/Seq/SimulatedRead.pm

Bio/SeqFeature/Amplicon.pm

Bio/SeqFeature/SubSeq.pm

Bio/Tools/AmpliconSearch.pm

README.md

examples/align/FastAlign.pl

scripts/Bio-DB-GFF/bp_bulk_load_gff.pl

scripts/Bio-DB-GFF/bp_fast_load_gff.pl

scripts/Bio-DB-GFF/bp_genbank2gff.pl

scripts/Bio-DB-GFF/bp_genbank2gff3.pl

scripts/Bio-DB-GFF/bp_generate_histogram.pl

scripts/Bio-DB-GFF/bp_load_gff.pl

scripts/Bio-DB-GFF/bp_meta_gff.pl

scripts/Bio-DB-GFF/bp_process_gadfly.pl

scripts/Bio-DB-GFF/bp_process_sgd.pl

scripts/Bio-DB-GFF/bp_process_wormbase.pl

scripts/Bio-DB-SeqFeature-Store/bp_seqfeature_delete.pl

scripts/Bio-DB-SeqFeature-Store/bp_seqfeature_gff3.pl

scripts/Bio-DB-SeqFeature-Store/bp_seqfeature_load.pl

scripts/DB-HIV/bp_hivq.pl

scripts/DB/bp_biofetch_genbank_proxy.pl

scripts/DB/bp_bioflat_index.pl

scripts/DB/bp_biogetseq.pl

scripts/DB/bp_flanks.pl

scripts/das/bp_das_server.pl

scripts/index/bp_fetch.pl

scripts/index/bp_index.pl

scripts/index/bp_seqret.pl

scripts/popgen/bp_composite_LD.pl

scripts/popgen/bp_heterogeneity_test.pl

scripts/searchio/bp_fastam9_to_table.pl

scripts/searchio/bp_filter_search.pl

scripts/searchio/bp_hmmer_to_table.pl

scripts/searchio/bp_parse_hmmsearch.pl

scripts/searchio/bp_search2table.pl

scripts/seq/bp_extract_feature_seq.pl

scripts/seq/bp_make_mrna_protein.pl

scripts/seq/bp_seqconvert.pl

scripts/seq/bp_seqcut.pl

scripts/seq/bp_seqpart.pl

scripts/seq/bp_seqretsplit.pl

scripts/seq/bp_split_seq.pl

scripts/seq/bp_translate_seq.pl

scripts/seq/bp_unflatten_seq.pl

scripts/seqstats/bp_aacomp.pl

scripts/seqstats/bp_chaos_plot.pl

scripts/seqstats/bp_gccalc.pl

scripts/seqstats/bp_oligo_count.pl

scripts/taxa/bp_classify_hits_kingdom.pl

scripts/taxa/bp_local_taxonomydb_query.pl

scripts/taxa/bp_query_entrez_taxa.pl

scripts/taxa/bp_taxid4species.pl

scripts/taxa/bp_taxonomy2tree.pl

scripts/tree/bp_blast2tree.pl

scripts/tree/bp_nexus2nh.pl

scripts/tree/bp_tree2pag.pl

scripts/utilities/bp_dbsplit.pl

scripts/utilities/bp_download_query_genbank.pl

scripts/utilities/bp_mask_by_search.pl

scripts/utilities/bp_mrtrans.pl

scripts/utilities/bp_mutate.pl

scripts/utilities/bp_netinstall.pl

scripts/utilities/bp_nrdb.pl

scripts/utilities/bp_pairwise_kaks.pl

scripts/utilities/bp_remote_blast.pl

scripts/utilities/bp_revtrans-motif.pl

scripts/utilities/bp_search2BSML.pl

scripts/utilities/bp_search2alnblocks.pl

scripts/utilities/bp_search2gff.pl

scripts/utilities/bp_search2tribe.pl

scripts/utilities/bp_seq_length.pl

scripts/utilities/bp_sreformat.pl

t/Cluster

t/Cluster/UniGene.t

t/LocalDB/Fasta.t

t/LocalDB/Qual.t

t/LocalDB/Taxonomy

t/LocalDB/Taxonomy/greengenes.t

t/LocalDB/Taxonomy/silva.t

t/Seq/SimulatedRead.t

t/SeqFeature/Amplicon.t

t/SeqFeature/Collection.t

t/SeqFeature/Computation.t

t/SeqFeature/FeaturePair.t

t/SeqFeature/Gene.t

t/SeqFeature/Generic.t

t/SeqFeature/SubSeq.t

t/Tools/AmpliconSearch.t

t/data/NC_000007-ribosomal-slippage.gb

t/data/NC_002058_multDBLINK_bug3375.gb

t/data/bad_dbfa/shotdb.fa

t/data/badfasta.fa

t/data/bug3331.mlc

t/data/codeml45.mlc

t/data/codeml45b.mlc

t/data/dbfa/mixed_alphabet.fasta

t/data/exonerate.output.negativescore.works

t/data/fastq/zero_qual.fastq

t/data/forward_primer.fa

t/data/forward_reverse_primers.fa

t/data/hmmpfam_HSPdashline.txt

t/data/hmmpfam_multiresult.out

t/data/msout/bad_msout_infile1

t/data/msout/bad_msout_infile2

t/data/multiresult_blastn+.bls

t/data/nhmmer-3.1.out

t/data/pfamOutput-bug3376.out

t/data/rpsblast_no_hits.bls

t/data/singlescore.gbk

t/data/spaced_fasta.fa

t/data/taxonomy

t/data/taxonomy/greengenes_taxonomy_16S_candiv_gg_2011_1.txt

t/data/taxonomy/silva_SSURef_108_tax_silva_trunc.fasta

t/data/test-3.0-1.meme

t/data/test-3.0-2.meme

t/data/test-4.9.meme

t/data/tiny.stk

t/data/yn00_45.mlc

travis_scripts

travis_scripts/dependency_installs

files removed:
.pc/fix_berkeleydb3_temp_file_creation

.pc/fix_berkeleydb3_temp_file_creation/Bio

.pc/fix_berkeleydb3_temp_file_creation/Bio/DB

.pc/fix_berkeleydb3_temp_file_creation/Bio/DB/SeqFeature

.pc/fix_berkeleydb3_temp_file_creation/Bio/DB/SeqFeature/Store

.pc/fix_berkeleydb3_temp_file_creation/Bio/DB/SeqFeature/Store/berkeleydb3.pm

Bio/Biblio

Bio/Biblio.pm

Bio/Biblio/Article.pm

Bio/Biblio/BiblioBase.pm

Bio/Biblio/Book.pm

Bio/Biblio/BookArticle.pm

Bio/Biblio/IO

Bio/Biblio/IO.pm

Bio/Biblio/IO/medline2ref.pm

Bio/Biblio/IO/medlinexml.pm

Bio/Biblio/IO/pubmed2ref.pm

Bio/Biblio/IO/pubmedxml.pm

Bio/Biblio/Journal.pm

Bio/Biblio/JournalArticle.pm

Bio/Biblio/MedlineArticle.pm

Bio/Biblio/MedlineBook.pm

Bio/Biblio/MedlineBookArticle.pm

Bio/Biblio/MedlineJournal.pm

Bio/Biblio/MedlineJournalArticle.pm

Bio/Biblio/Organisation.pm

Bio/Biblio/Patent.pm

Bio/Biblio/Person.pm

Bio/Biblio/Proceeding.pm

Bio/Biblio/Provider.pm

Bio/Biblio/PubmedArticle.pm

Bio/Biblio/PubmedBookArticle.pm

Bio/Biblio/PubmedJournalArticle.pm

Bio/Biblio/Ref.pm

Bio/Biblio/Service.pm

Bio/Biblio/TechReport.pm

Bio/Biblio/Thesis.pm

Bio/Biblio/WebResource.pm

Bio/DB/Biblio

Bio/DB/Biblio/biofetch.pm

Bio/DB/Biblio/eutils.pm

Bio/DB/Biblio/soap.pm

Bio/DB/BiblioI.pm

Bio/DB/EUtilities.pm

Bio/FeatureIO

Bio/FeatureIO.pm

Bio/FeatureIO/bed.pm

Bio/FeatureIO/gff.pm

Bio/FeatureIO/gtf.pm

Bio/FeatureIO/interpro.pm

Bio/FeatureIO/ptt.pm

Bio/FeatureIO/vecscreen_simple.pm

Bio/Ontology/SimpleGOEngine/GraphAdaptor02.pm

Bio/Root/Test

Bio/Root/Test/Warn.pm

Bio/Search/HSP/hmmer3HSP.pm

Bio/SeqFeature/Annotated.pm

Bio/Tools/EUtilities

Bio/Tools/EUtilities.pm

Bio/Tools/EUtilities/EUtilDataI.pm

Bio/Tools/EUtilities/EUtilParameters.pm

Bio/Tools/EUtilities/History.pm

Bio/Tools/EUtilities/HistoryI.pm

Bio/Tools/EUtilities/Info

Bio/Tools/EUtilities/Info.pm

Bio/Tools/EUtilities/Info/FieldInfo.pm

Bio/Tools/EUtilities/Info/LinkInfo.pm

Bio/Tools/EUtilities/Link

Bio/Tools/EUtilities/Link.pm

Bio/Tools/EUtilities/Link/LinkSet.pm

Bio/Tools/EUtilities/Link/UrlLink.pm

Bio/Tools/EUtilities/Query

Bio/Tools/EUtilities/Query.pm

Bio/Tools/EUtilities/Query/GlobalQuery.pm

Bio/Tools/EUtilities/Summary

Bio/Tools/EUtilities/Summary.pm

Bio/Tools/EUtilities/Summary/DocSum.pm

Bio/Tools/EUtilities/Summary/Item.pm

Bio/Tools/EUtilities/Summary/ItemContainerI.pm

debian/patches/fix_berkeleydb3_temp_file_creation

examples/biblio

examples/biblio/biblio-eutils-example.pl

examples/biblio/biblio-soap-example.pl

examples/biblio/biblio_soap.pl

scripts/Bio-DB-EUtilities

scripts/Bio-DB-EUtilities/einfo.PLS

scripts/Bio-DB-GFF/bulk_load_gff.PLS

scripts/Bio-DB-GFF/fast_load_gff.PLS

scripts/Bio-DB-GFF/genbank2gff.PLS

scripts/Bio-DB-GFF/genbank2gff3.PLS

scripts/Bio-DB-GFF/generate_histogram.PLS

scripts/Bio-DB-GFF/load_gff.PLS

scripts/Bio-DB-GFF/meta_gff.PLS

scripts/Bio-DB-GFF/process_gadfly.PLS

scripts/Bio-DB-GFF/process_sgd.PLS

scripts/Bio-DB-GFF/process_wormbase.PLS

scripts/Bio-DB-SeqFeature-Store/bp_seqfeature_delete.PLS

scripts/Bio-DB-SeqFeature-Store/bp_seqfeature_gff3.PLS

scripts/Bio-DB-SeqFeature-Store/bp_seqfeature_load.PLS

scripts/DB-HIV/hivq.PLS

scripts/DB/biofetch_genbank_proxy.PLS

scripts/DB/bioflat_index.PLS

scripts/DB/biogetseq.PLS

scripts/DB/flanks.PLS

scripts/biblio

scripts/biblio/TAG

scripts/biblio/biblio.PLS

scripts/das/das_server.pl

scripts/index/bp_fetch.PLS

scripts/index/bp_index.PLS

scripts/index/bp_seqret.PLS

scripts/popgen/composite_LD.PLS

scripts/popgen/heterogeneity_test.PLS

scripts/searchio/fastam9_to_table.PLS

scripts/searchio/filter_search.PLS

scripts/searchio/hmmer_to_table.PLS

scripts/searchio/parse_hmmsearch.PLS

scripts/searchio/search2table.PLS

scripts/seq/extract_feature_seq.PLS

scripts/seq/make_mrna_protein.PLS

scripts/seq/seqconvert.PLS

scripts/seq/seqretsplit.PLS

scripts/seq/split_seq.PLS

scripts/seq/translate_seq.PLS

scripts/seq/unflatten_seq.PLS

scripts/seqstats/aacomp.PLS

scripts/seqstats/chaos_plot.PLS

scripts/seqstats/gccalc.PLS

scripts/seqstats/oligo_count.PLS

scripts/taxa/classify_hits_kingdom.PLS

scripts/taxa/local_taxonomydb_query.PLS

scripts/taxa/query_entrez_taxa.PLS

scripts/taxa/taxid4species.PLS

scripts/taxa/taxonomy2tree.PLS

scripts/tree/blast2tree.PLS

scripts/tree/nexus2nh.PLS

scripts/tree/tree2pag.PLS

scripts/utilities/bp_mrtrans.PLS

scripts/utilities/bp_netinstall.PLS

scripts/utilities/bp_nrdb.PLS

scripts/utilities/bp_sreformat.PLS

scripts/utilities/dbsplit.PLS

scripts/utilities/download_query_genbank.PLS

scripts/utilities/mask_by_search.PLS

scripts/utilities/mutate.PLS

scripts/utilities/pairwise_kaks.PLS

scripts/utilities/remote_blast.PLS

scripts/utilities/revtrans-motif.PLS

scripts/utilities/search2BSML.PLS

scripts/utilities/search2alnblocks.PLS

scripts/utilities/search2gff.PLS

scripts/utilities/search2tribe.PLS

scripts/utilities/seq_length.PLS

t/Biblio

t/Biblio/Biblio.t

t/Biblio/References.t

t/Biblio/biofetch.t

t/Biblio/eutils.t

t/LocalDB/DBFasta.t

t/LocalDB/DBQual.t

t/RemoteDB/EUtilities.t

t/SeqFeature/FeatureIO.t

t/SeqFeature/SeqFeatAnnotated.t

t/SeqFeature/SeqFeatCollection.t

t/SeqFeature/SeqFeature.t

t/SeqFeature/SeqFeaturePrimer.t

t/Tools/EUtilities

t/Tools/EUtilities/EUtilParameters.t

t/Tools/EUtilities/egquery.t

t/Tools/EUtilities/einfo.t

t/Tools/EUtilities/elink_acheck.t

t/Tools/EUtilities/elink_lcheck.t

t/Tools/EUtilities/elink_llinks.t

t/Tools/EUtilities/elink_ncheck.t

t/Tools/EUtilities/elink_neighbor.t

t/Tools/EUtilities/elink_neighbor_history.t

t/Tools/EUtilities/elink_scores.t

t/Tools/EUtilities/epost.t

t/Tools/EUtilities/esearch.t

t/Tools/EUtilities/espell.t

t/Tools/EUtilities/esummary.t

t/data/1.bed

t/data/directives.gff3

t/data/eutils

t/data/eutils/egquery.xml

t/data/eutils/einfo.xml

t/data/eutils/einfo_dbs.xml

t/data/eutils/elink_acheck.xml

t/data/eutils/elink_acheck_corr.xml

t/data/eutils/elink_dball.xml

t/data/eutils/elink_lcheck.xml

t/data/eutils/elink_lcheck_corr.xml

t/data/eutils/elink_llinks.xml

t/data/eutils/elink_llinks_corr.xml

t/data/eutils/elink_multidb.xml

t/data/eutils/elink_multidb_corr.xml

t/data/eutils/elink_ncheck.xml

t/data/eutils/elink_ncheck_corr.xml

t/data/eutils/elink_neighbor.xml

t/data/eutils/elink_neighbor_corr.xml

t/data/eutils/elink_nhist.xml

t/data/eutils/elink_nhist_corr.xml

t/data/eutils/elink_scores.xml

t/data/eutils/epost.xml

t/data/eutils/esearch1.xml

t/data/eutils/esearch2.xml

t/data/eutils/espell.xml

t/data/eutils/esummary1.xml

t/data/eutils/esummary2.xml

t/data/hybrid1.gff3

t/data/knownGene.gff3

t/data/stress_test_medline.xml

t/data/stress_test_pubmed.xml

t/data/test.meme

t/data/test.meme2

t/data/test.ptt

t/data/vecscreen_simple.test_output

t/lib/Array

t/lib/Array/Compare.pm

t/lib/Sub

t/lib/Sub/Uplevel.pm

t/lib/Test

t/lib/Test/Builder

t/lib/Test/Builder.pm

t/lib/Test/Builder/Module.pm

t/lib/Test/Builder/Tester

t/lib/Test/Builder/Tester.pm

t/lib/Test/Builder/Tester/Color.pm

t/lib/Test/Exception.pm

t/lib/Test/Harness

t/lib/Test/Harness.pm

t/lib/Test/Harness/Assert.pm

t/lib/Test/Harness/Iterator.pm

t/lib/Test/Harness/Point.pm

t/lib/Test/Harness/Results.pm

t/lib/Test/Harness/Straps.pm

t/lib/Test/Harness/TAP.pod

t/lib/Test/Harness/Util.pm

t/lib/Test/More.pm

t/lib/Test/Simple.pm

t/lib/Test/Tutorial.pod

t/lib/Test/Warn.pm

t/lib/Tree

t/lib/Tree/DAG_Node.pm

files modified:
.pc/applied-patches

.pc/fix_examples_perl_location/examples/db/getGenBank.pl

Bio/Align/Utilities.pm

Bio/AlignIO.pm

Bio/AlignIO/fasta.pm

Bio/AlignIO/largemultifasta.pm

Bio/AlignIO/meme.pm

Bio/AlignIO/phylip.pm

Bio/AlignIO/stockholm.pm

Bio/Assembly/IO.pm

Bio/Assembly/IO/ace.pm

Bio/Assembly/IO/bowtie.pm

Bio/Assembly/IO/sam.pm

Bio/Assembly/Tools/ContigSpectrum.pm

Bio/Cluster/SequenceFamily.pm

Bio/ClusterIO.pm

Bio/DB/Ace.pm

Bio/DB/CUTG.pm

Bio/DB/Fasta.pm

Bio/DB/Flat/BinarySearch.pm

Bio/DB/GenBank.pm

Bio/DB/HIV/HIVQueryHelper.pm

Bio/DB/NCBIHelper.pm

Bio/DB/Qual.pm

Bio/DB/Query/HIVQuery.pm

Bio/DB/Query/WebQuery.pm

Bio/DB/RandomAccessI.pm

Bio/DB/SeqFeature/Store.pm

Bio/DB/SeqFeature/Store/DBI/Pg.pm

Bio/DB/SeqFeature/Store/DBI/SQLite.pm

Bio/DB/SeqFeature/Store/DBI/mysql.pm

Bio/DB/SeqFeature/Store/LoadHelper.pm

Bio/DB/SeqFeature/Store/berkeleydb.pm

Bio/DB/SeqFeature/Store/berkeleydb3.pm

Bio/DB/SeqFeature/Store/memory.pm

Bio/DB/SeqI.pm

Bio/DB/SeqVersion/gi.pm

Bio/DB/SwissProt.pm

Bio/DB/TFBS/transfac_pro.pm

Bio/DB/Taxonomy.pm

Bio/DB/Taxonomy/entrez.pm

Bio/DB/Taxonomy/flatfile.pm

Bio/DB/Taxonomy/list.pm *

Bio/DB/WebDBSeqI.pm

Bio/FeatureHolderI.pm

Bio/Index/Blast.pm

Bio/Index/Hmmer.pm

Bio/Index/Qual.pm

Bio/Index/Stockholm.pm

Bio/LiveSeq/SeqI.pm

Bio/LocatableSeq.pm

Bio/Location/Simple.pm

Bio/Map/Mappable.pm

Bio/MapIO.pm

Bio/Matrix/IO.pm

Bio/Nexml/Factory.pm

Bio/Ontology/SimpleGOEngine/GraphAdaptor.pm

Bio/OntologyIO.pm

Bio/Perl.pm

Bio/PhyloNetwork.pm

Bio/PopGen/Genotype.pm

Bio/PopGen/IO.pm

Bio/PopGen/IO/hapmap.pm

Bio/PopGen/IO/phase.pm

Bio/PopGen/Individual.pm

Bio/PopGen/Marker.pm

Bio/PopGen/Population.pm

Bio/PopGen/Statistics.pm

Bio/PopGen/Utilities.pm

Bio/PrimarySeq.pm

Bio/PrimarySeqI.pm

Bio/Restriction/IO.pm

Bio/Root/Build.pm

Bio/Root/IO.pm

Bio/Root/Root.pm

Bio/Root/Test.pm

Bio/Root/Utilities.pm

Bio/Root/Version.pm

Bio/Search/HSP/GenericHSP.pm

Bio/Search/HSP/HMMERHSP.pm

Bio/Search/HSP/HSPI.pm

Bio/Search/Hit/ModelHit.pm

Bio/SearchIO.pm

Bio/SearchIO/blast.pm

Bio/SearchIO/blasttable.pm

Bio/SearchIO/blastxml.pm

Bio/SearchIO/cross_match.pm

Bio/SearchIO/exonerate.pm

Bio/SearchIO/hmmer.pm

Bio/SearchIO/hmmer2.pm

Bio/SearchIO/hmmer3.pm

Bio/SearchIO/psl.pm

Bio/Seq.pm

Bio/Seq/EncodedSeq.pm

Bio/Seq/Meta.pm

Bio/Seq/Meta/Array.pm

Bio/Seq/PrimaryQual.pm

Bio/Seq/PrimedSeq.pm

Bio/Seq/SeqFactory.pm

Bio/Seq/SeqFastaSpeedFactory.pm

Bio/SeqFeature/AnnotationAdaptor.pm

Bio/SeqFeature/Computation.pm

Bio/SeqFeature/FeaturePair.pm

Bio/SeqFeature/Generic.pm

Bio/SeqFeature/Lite.pm

Bio/SeqFeature/PositionProxy.pm

Bio/SeqFeature/Primer.pm

Bio/SeqFeature/SiRNA/Pair.pm

Bio/SeqFeature/Similarity.pm

Bio/SeqFeature/Tools/Unflattener.pm

Bio/SeqI.pm

Bio/SeqIO.pm

Bio/SeqIO/FTHelper.pm

Bio/SeqIO/agave.pm

Bio/SeqIO/bsml.pm

Bio/SeqIO/chadoxml.pm

Bio/SeqIO/embl.pm

Bio/SeqIO/entrezgene.pm

Bio/SeqIO/fasta.pm

Bio/SeqIO/fastq.pm

Bio/SeqIO/game/gameWriter.pm

Bio/SeqIO/gbxml.pm

Bio/SeqIO/genbank.pm

Bio/SeqIO/interpro.pm

Bio/SeqIO/msout.pm

Bio/SeqIO/phd.pm

Bio/SeqIO/qual.pm

Bio/SeqIO/raw.pm

Bio/SeqIO/scf.pm

Bio/SeqIO/strider.pm

Bio/SeqIO/tab.pm

Bio/SeqUtils.pm

Bio/SimpleAlign.pm

Bio/Structure/IO.pm

Bio/Taxon.pm

Bio/Tools/Alignment/Consed.pm

Bio/Tools/Analysis/DNA/ESEfinder.pm

Bio/Tools/Analysis/Protein/ELM.pm

Bio/Tools/Analysis/Protein/NetPhos.pm

Bio/Tools/GFF.pm

Bio/Tools/Gel.pm

Bio/Tools/Geneid.pm

Bio/Tools/GuessSeqFormat.pm

Bio/Tools/IUPAC.pm

Bio/Tools/Match.pm

Bio/Tools/Phylo/Gerp.pm

Bio/Tools/Phylo/PAML.pm

Bio/Tools/Phylo/PAML/Codeml.pm

Bio/Tools/Primer3.pm

Bio/Tools/Protparam.pm

Bio/Tools/Run/StandAloneBlast.pm

Bio/Tools/Run/WrapperBase.pm

Bio/Tools/SeqPattern.pm

Bio/Tree/Node.pm

Bio/Tree/Statistics.pm

Bio/Tree/Tree.pm

Bio/Tree/TreeFunctionsI.pm

Bio/TreeIO.pm

Bio/TreeIO/newick.pm

Bio/TreeIO/svggraph.pm

Bio/Variation/IO.pm

Bio/Variation/IO/xml.pm

Build.PL *

Changes

INSTALL

MANIFEST

META.json

META.yml

README

debian/changelog

debian/compat

debian/control

debian/copyright

debian/patches/series

examples/db/getGenBank.pl

t/Align/AlignUtil.t

t/Align/SimpleAlign.t

t/AlignIO/AlignIO.t

t/AlignIO/fasta.t

t/AlignIO/meme.t

t/AlignIO/phylip.t

t/AlignIO/stockholm.t

t/Annotation/Annotation.t

t/Annotation/AnnotationAdaptor.t

t/Assembly/ContigSpectrum.t

t/Assembly/IO/bowtie.t

t/Assembly/IO/sam.t

t/Assembly/core.t

t/ClusterIO/SequenceFamily.t

t/LocalDB/Flat.t

t/LocalDB/Index/Index.t

t/LocalDB/SeqFeature.t

t/Map/Map.t

t/Map/Physical.t

t/Ontology/IO/go.t

t/Ontology/IO/obo.t

t/PopGen/MK.t

t/PopGen/PopGen.t

t/RemoteDB/BioFetch.t

t/RemoteDB/CUTG.t

t/RemoteDB/EMBL.t

t/RemoteDB/GenBank.t

t/RemoteDB/GenPept.t

t/RemoteDB/HIV/HIVQuery.t

t/RemoteDB/SeqVersion.t

t/RemoteDB/SwissProt.t

t/RemoteDB/Taxonomy.t

t/Root/RootIO.t

t/SearchIO/Tiling.t *

t/SearchIO/blast.t

t/SearchIO/blasttable.t

t/SearchIO/exonerate.t

t/SearchIO/hmmer.t

t/Seq/EncodedSeq.t

t/Seq/LocatableSeq.t

t/Seq/MetaSeq.t

t/Seq/PrimaryQual.t

t/Seq/PrimarySeq.t

t/Seq/PrimedSeq.t

t/Seq/Quality.t

t/Seq/Seq.t

t/SeqFeature/Primer.t

t/SeqFeature/Unflattener.t

t/SeqIO/SeqBuilder.t

t/SeqIO/SeqIO.t

t/SeqIO/Splicedseq.t

t/SeqIO/embl.t

t/SeqIO/fasta.t

t/SeqIO/fastq.t

t/SeqIO/genbank.t

t/SeqIO/msout.t

t/SeqTools/CodonTable.t

t/SeqTools/SeqUtils.t

t/Tools/Analysis/Protein/Domcut.t

t/Tools/Analysis/Protein/HNN.t

t/Tools/GFF.t

t/Tools/Genewise.t

t/Tools/Genomewise.t

t/Tools/IUPAC.t

t/Tools/Match.t

t/Tools/Phylo/Gerp.t

t/Tools/Phylo/PAML.t

t/Tools/Primer3.t

t/Tools/Run/RemoteBlast.t

t/Tools/Run/WBCommandExts.t

t/Tools/Run/WrapperBase.t

t/Tree/Tree.t

t/Tree/TreeStatistics.t

t/data/bootstrap.tre

t/data/msout/msout_infile1

t/data/msout/msout_infile2

t/data/msout/msout_infile3

t/data/msout/msout_infile4

t/data/primedseq.fa

t/data/taxdump/names.dmp

t/data/taxdump/nodes.dmp

t/data/testaln.stockholm

t/data/traits.tab

Show diffs side-by-side

added added

removed removed

scripts/Bio-DB-GFF/genbank2gff3.PLS

#!/usr/bin/perl

#$Id$;

=pod

=head1 NAME

genbank2gff3.pl -- Genbank-E<gt>gbrowse-friendly GFF3

=head1 SYNOPSIS

genbank2gff3.pl [options] filename(s)

# process a directory containing GenBank flatfiles

perl genbank2gff3.pl --dir path_to_files --zip

# process a single file, ignore explicit exons and introns

perl genbank2gff3.pl --filter exon --filter intron file.gbk.gz

# process a list of files

perl genbank2gff3.pl *gbk.gz

# process data from URL, with Chado GFF model (-noCDS), and pipe to database loader

curl ftp://ftp.ncbi.nih.gov/genomes/Saccharomyces_cerevisiae/CHR_X/NC_001142.gbk \

| perl genbank2gff3.pl -noCDS -in stdin -out stdout \

| perl gmod_bulk_load_gff3.pl -dbname mychado -organism fromdata

Options:

--noinfer -r don't infer exon/mRNA subfeatures

--conf -i path to the curation configuration file that contains user preferences

for Genbank entries (must be YAML format)

(if --manual is passed without --ini, user will be prompted to

create the file if any manual input is saved)

--sofile -l path to to the so.obo file to use for feature type mapping

(--sofile live will download the latest online revision)

--manual -m when trying to guess the proper SO term, if more than

one option matches the primary tag, the converter will

wait for user input to choose the correct one

(only works with --sofile)

--dir -d path to a list of genbank flatfiles

--outdir -o location to write GFF files (can be 'stdout' or '-' for pipe)

--zip -z compress GFF3 output files with gzip

--summary -s print a summary of the features in each contig

--filter -x genbank feature type(s) to ignore

--split -y split output to separate GFF and fasta files for

each genbank record

--nolump -n separate file for each reference sequence

(default is to lump all records together into one

output file for each input file)

--ethresh -e error threshold for unflattener

set this high (>2) to ignore all unflattener errors

--[no]CDS -c Keep CDS-exons, or convert to alternate gene-RNA-protein-exon

model. --CDS is default. Use --CDS to keep default GFF gene model,

use --noCDS to convert to g-r-p-e.

--format -f Input format (SeqIO types): GenBank, Swiss or Uniprot, EMBL work

(GenBank is default)

--GFF_VERSION 3 is default, 2 and 2.5 and other Bio::Tools::GFF versions available

--quiet don't talk about what is being processed

--typesource SO sequence type for source (e.g. chromosome; region; contig)

--help -h display this message

=head1 DESCRIPTION

This script uses Bio::SeqFeature::Tools::Unflattener and

Bio::Tools::GFF to convert GenBank flatfiles to GFF3 with gene

containment hierarchies mapped for optimal display in gbrowse.

The input files are assumed to be gzipped GenBank flatfiles for refseq

contigs. The files may contain multiple GenBank records. Either a

single file or an entire directory can be processed. By default, the

DNA sequence is embedded in the GFF but it can be saved into separate

fasta file with the --split(-y) option.

If an input file contains multiple records, the default behaviour is

to dump all GFF and sequence to a file of the same name (with .gff

appended). Using the 'nolump' option will create a separate file for

each genbank record. Using the 'split' option will create separate

GFF and Fasta files for each genbank record.

=head2 Notes

=head3 'split' and 'nolump' produce many files

In cases where the input files contain many GenBank records (for

example, the chromosome files for the mouse genome build), a very

large number of output files will be produced if the 'split' or

'nolump' options are selected. If you do have lists of files E<gt> 6000,

use the --long_list option in bp_bulk_load_gff.pl or

bp_fast_load_gff.pl to load the gff and/ or fasta files.

=head3 Designed for RefSeq

This script is designed for RefSeq genomic sequence entries. It may

work for third party annotations but this has not been tested.

But see below, Uniprot/Swissprot works, EMBL and possibly EMBL/Ensembl

if you don't mind some gene model unflattener errors (dgg).

100

101

=head3 G-R-P-E Gene Model

102

103

Don Gilbert worked this over with needs to produce GFF3 suited to

104

loading to GMOD Chado databases. Most of the changes I believe are

105

suited for general use. One main chado-specific addition is the

106

--[no]cds2protein flag

107

108

My favorite GFF is to set the above as ON by default (disable with --nocds2prot)

109

For general use it probably should be OFF, enabled with --cds2prot.

110

111

This writes GFF with an alternate, but useful Gene model,

112

instead of the consensus model for GFF3

113

114

[ gene > mRNA> (exon,CDS,UTR) ]

115

116

This alternate is

117

118

gene > mRNA > polypeptide > exon

119

120

means the only feature with dna bases is the exon. The others

121

specify only location ranges on a genome. Exon of course is a child

122

of mRNA and protein/peptide.

123

124

The protein/polypeptide feature is an important one, having all the

125

annotations of the GenBank CDS feature, protein ID, translation, GO

126

terms, Dbxrefs to other proteins.

127

128

UTRs, introns, CDS-exons are all inferred from the primary exon bases

129

inside/outside appropriate higher feature ranges. Other special gene

130

model features remain the same.

131

132

Several other improvements and bugfixes, minor but useful are included

133

134

* IO pipes now work:

135

curl ftp://ncbigenomes/... | genbank2gff3 --in stdin --out stdout | gff2chado ...

136

137

* GenBank main record fields are added to source feature, e.g. organism, date,

138

and the sourcetype, commonly chromosome for genomes, is used.

139

140

* Gene Model handling for ncRNA, pseudogenes are added.

141

142

* GFF header is cleaner, more informative.

143

--GFF_VERSION flag allows choice of v2 as well as default v3

144

145

* GFF ##FASTA inclusion is improved, and

146

CDS translation sequence is moved to FASTA records.

147

148

* FT -> GFF attribute mapping is improved.

149

150

* --format choice of SeqIO input formats (GenBank default).

151

Uniprot/Swissprot and EMBL work and produce useful GFF.

152

153

* SeqFeature::Tools::TypeMapper has a few FT -> SOFA additions

154

and more flexible usage.

155

156

=head1 TODO

157

158

=head2 Are these additions desired?

159

160

* filter input records by taxon (e.g. keep only organism=xxx or taxa level = classYYY

161

* handle Entrezgene, other non-sequence SeqIO structures (really should change

162

those parsers to produce consistent annotation tags).

163

164

=head2 Related bugfixes/tests

165

166

These items from Bioperl mail were tested (sample data generating

167

errors), and found corrected:

168

169

From: Ed Green <green <at> eva.mpg.de>

170

Subject: genbank2gff3.pl on new human RefSeq

171

Date: 2006-03-13 21:22:26 GMT

172

-- unspecified errors (sample data works now).

173

174

From: Eric Just <e-just <at> northwestern.edu>

175

Subject: bp_genbank2gff3.pl

176

Date: 2007-01-26 17:08:49 GMT

177

-- bug fixed in genbank2gff3 for multi-record handling

178

179

This error is for a /trans_splice gene that is hard to handle, and

180

unflattner/genbank2 doesn't

181

182

From: Chad Matsalla <chad <at> dieselwurks.com>

183

Subject: genbank2gff3.PLS and the unflatenner - Inconsistent order?

184

Date: 2005-07-15 19:51:48 GMT

185

186

=head1 AUTHOR

187

188

Sheldon McKay (mckays@cshl.edu)

189

190

191

192

=head2 AUTHOR of hacks for GFF2Chado loading

193

194

Don Gilbert (gilbertd@indiana.edu)

195

196

197

=cut

198

199

use strict;

200

201

use lib "$ENV{HOME}/bioperl-live";

202

# chad put this here to enable situations when this script is tested

203

# against bioperl compiled into blib along with other programs using blib

204

BEGIN {

205

unshift(@INC,'blib/lib');

206

};

207

use Pod::Usage;

208

use Bio::Root::RootI;

209

use Bio::SeqIO;

210

use File::Spec;

211

use Bio::SeqFeature::Tools::Unflattener;

212

use Bio::SeqFeature::Tools::TypeMapper;

213

use Bio::SeqFeature::Tools::IDHandler;

214

use Bio::Location::SplitLocationI;

215

use Bio::Location::Simple;

216

use Bio::Tools::GFF;

217

use Getopt::Long;

218

use List::Util qw(first);

219

use Bio::OntologyIO;

220

use YAML qw(Dump LoadFile DumpFile);

221

use File::Basename;

222

223

use vars qw/$split @filter $zip $outdir $help $ethresh

224

$ONTOLOGY %FEATURES %DESCENDANTS @RETURN $MANUAL @GFF_LINE_FEAT

225

$CONF $YAML $TYPE_MAP $SYN_MAP $noinfer $SO_FILE

226

$file @files $dir $summary $nolump

227

$source_type %proteinfa %exonpar $didheader $verbose $DEBUG $GFF_VERSION

228

$gene_id $rna_id $tnum $ncrna_id $rnum %method %id %seen/;

229

230

use constant SO_URL =>

231

'http://song.cvs.sourceforge.net/viewvc/*checkout*/song/ontology/so.obo';

232

use constant ALPHABET => [qw(a b c d e f g h i j k l m n o p q r s t u v w x y z)];

233

use constant ALPHABET_TO_NUMBER => {

234

a => 0, b => 1, c => 2, d => 3, e => 4, f => 5, g => 6, h => 7, i => 8,

235

j => 9, k => 10, l => 11, m => 12, n => 13, o => 14, p => 15, q => 16,

236

r => 17, s => 18, t => 19, u => 20, v => 21, w => 22, x => 23, y => 24,

237

z => 25,

238

};

239

use constant ALPHABET_DIVISOR => 26;

240

use constant GM_NEW_TOPLEVEL => 2;

241

use constant GM_NEW_PART => 1;

242

use constant GM_DUP_PART => 0;

243

use constant GM_NOT_PART => -1;

244

245

# Options cycle in multiples of 2 because of left side/right side pairing.

246

# You can make this number odd, but displayed matches will still round up

247

use constant OPTION_CYCLE => 6;

248

249

$GFF_VERSION = 3; # allow v2 ...

250

$verbose = 1; # right default? -nov to turn off

251

252

# dgg: change the gene model to Gene/mRNA/Polypeptide/exons...

253

my $CDSkeep= 1; # default should be ON (prior behavior), see gene_features()

254

my $PROTEIN_TYPE = 'polypeptide'; # for noCDSkeep;

255

# protein = flybase chado usage; GMOD Perls use 'polypeptide' with software support

256

257

my $FORMAT="GenBank"; # swiss ; embl; genbank ; ** guess from SOURCEID **

258

my $SOURCEID= $FORMAT; # "UniProt" "GenBank" "EMBL" should work

259

# other Bio::SeqIO formats may work. TEST: EntrezGene < problematic tags; InterPro KEGG

260

261

262

my %TAG_MAP = (

263

db_xref => 'Dbxref',

264

name => 'Name',

265

note => 'Note', # also pull GO: ids into Ontology_term

266

synonym => 'Alias',

267

symbol => 'Alias', # is symbol still used?

268

# protein_id => 'Dbxref', also seen Dbxref tags: EC_number

269

# translation: handled in gene_features

270

);

271

272

273

$| = 1;

274

my $quiet= !$verbose;

275

my $ok= GetOptions( 'd|dir|input:s' => \$dir,

276

'z|zip' => \$zip,

277

'h|help' => \$help,

278

's|summary' => \$summary,

279

'r|noinfer' => \$noinfer,

280

'i|conf=s' => \$CONF,

281

'sofile=s' => \$SO_FILE,

282

'm|manual' => \$MANUAL,

283

'o|outdir|output:s'=> \$outdir,

284

'x|filter:s'=> \@filter,

285

'y|split' => \$split,

286

"ethresh|e=s"=>\$ethresh,

287

'c|CDS!' => \$CDSkeep,

288

'f|format=s' => \$FORMAT,

289

'typesource=s' => \$source_type,

290

'GFF_VERSION=s' => \$GFF_VERSION,

291

'quiet!' => \$quiet, # swap quiet to verbose

292

'DEBUG!' => \$DEBUG,

293

'n|nolump' => \$nolump);

294

295

my $lump = 1 unless $nolump || $split;

296

$verbose= !$quiet;

297

298

# look for help request

299

pod2usage(2) if $help || !$ok;

300

301

# keep SOURCEID as-is and change FORMAT for SeqIO types;

302

# note SeqIO uses file.suffix to guess type; not useful here

303

$SOURCEID= $FORMAT;

304

$FORMAT = "swiss" if $FORMAT =~/UniProt|trembl/;

305

$verbose =1 if($DEBUG);

306

307

# initialize handlers

308

my $unflattener = Bio::SeqFeature::Tools::Unflattener->new; # for ensembl genomes (-trust_grouptag=>1);

309

$unflattener->error_threshold($ethresh) if $ethresh;

310

$unflattener->verbose(1) if($DEBUG);

311

# $unflattener->group_tag('gene') if($FORMAT =~ /embl/i) ; #? ensembl only?

312

# ensembl parsing is still problematic, forget this

313

314

my $tm = Bio::SeqFeature::Tools::TypeMapper->new;

315

my $idh = Bio::SeqFeature::Tools::IDHandler->new;

316

317

# dgg

318

$source_type ||= "region"; # should really parse from FT.source contents below

319

320

#my $FTSOmap = $tm->FT_SO_map();

321

my $FTSOmap;

322

my $FTSOsynonyms;

323

324

if (defined($SO_FILE) && $SO_FILE eq 'live') {

325

print "\nDownloading the latest SO file from ".SO_URL."\n\n";

326

use LWP::UserAgent;

327

my $ua = LWP::UserAgent->new(timeout => 30);

328

my $request = HTTP::Request->new(GET => SO_URL);

329

my $response = $ua->request($request);

330

331

332

if ($response->status_line =~ /200/) {

333

use File::Temp qw/ tempfile /;

334

my ($fh, $fn) = tempfile();

335

print $fh $response->content;

336

$SO_FILE = $fn;

337

} else {

338

print "Couldn't download SO file online...skipping validation.\n"

339

. "HTTP Status was " . $response->status_line . "\n"

340

and undef $SO_FILE

341

}

342

}

343

344

if ($SO_FILE) {

345

346

347

my (%terms, %syn);

348

349

my $parser = Bio::OntologyIO->new( -format => "obo", -file => $SO_FILE );

350

$ONTOLOGY = $parser->next_ontology();

351

352

for ($ONTOLOGY->get_all_terms) {

353

my $feat = $_;

354

355

$terms{$feat->name} = $feat->name;

356

#$terms{$feat->name} = $feat;

357

358

my @syn = $_->each_synonym;

359

360

push @{$syn{$_}}, $feat->name for @syn;

361

#push @{$syn{$_}}, $feat for @syn;

362

}

363

364

$FTSOmap = \%terms;

365

$FTSOsynonyms = \%syn;

366

367

my %hardTerms = %{ $tm->FT_SO_map() };

368

map { $FTSOmap->{$_} ||= $hardTerms{$_} } keys %hardTerms;

369

370

} else {

371

my %terms = %{ $tm->FT_SO_map() };

372

while (my ($k,$v) = each %terms) {

373

$FTSOmap->{$k} = ref($v) ? shift @$v : $v;

374

}

375

}

376

377

$TYPE_MAP = $FTSOmap;

378

$SYN_MAP = $FTSOsynonyms;

379

380

381

# #convert $FTSOmap undefined to valid SO : moved to TypeMapper->map_types( -undefined => "region")

382

383

# stringify filter list if applicable

384

my $filter = join ' ', @filter if @filter;

385

386

# determine input files

387

my $stdin=0; # dgg: let dir == stdin == '-' for pipe use

388

if ($dir && ($dir eq '-' || $dir eq 'stdin')) {

389

$stdin=1; $dir=''; @files=('stdin');

390

391

} elsif ( $dir ) {

392

if ( -d $dir ) {

393

opendir DIR, $dir or die "could not open $dir for reading: $!";

394

@files = map { "$dir/$_";} grep { /\.gb.*/ } readdir DIR;

395

closedir DIR;

396

}

397

else {

398

die "$dir is not a directory\n";

399

}

400

}

401

else {

402

@files = @ARGV;

403

$dir = '';

404

}

405

406

# we should have some files by now

407

pod2usage(2) unless @files;

408

409

410

my $stdout=0; # dgg: let outdir == stdout == '-' for pipe use

411

if($outdir && ($outdir eq '-' || $outdir eq 'stdout')) {

412

warn("std. output chosen: cannot split\n") if($split);

413

warn("std. output chosen: cannot zip\n") if($zip);

414

warn("std. output chosen: cannot nolump\n") if($nolump);

415

$stdout=1; $lump=1; $split= 0; $zip= 0; # unless we pipe stdout thru gzip

416

417

} elsif ( $outdir && !-e $outdir ) {

418

mkdir($outdir) or die "could not create directory $outdir: $!\n";

419

}

420

elsif ( !$outdir ) {

421

$outdir = $dir || '.';

422

}

423

424

$outdir .= '/' unless $outdir =~ m|/$|;

425

426

for my $file ( @files ) {

427

# dgg ; allow 'stdin' / '-' input ?

428

chomp $file;

429

die "$! $file" unless($stdin || -e $file);

430

print "# Input: $file\n" if($verbose);

431

432

my ($lump_fh, $lumpfa_fh, $outfile, $outfa);

433

if ($stdout) {

434

$lump_fh= *STDOUT; $lump="stdout$$";

435

$outfa= "stdout$$.fa"; # this is a temp file ... see below

436

open $lumpfa_fh, ">$outfa" or die "Could not create a lump outfile called ($outfa) because ($!)\n";

437

438

} elsif ( $lump ) {

439

# this is better, but still should use catfile

440

my ($vol,$dirs,$fileonly) = File::Spec->splitpath($file);

441

$lump = $outdir . $fileonly . '.gff';

442

($outfa= $lump) =~ s/\.gff/\.fa/;

443

open $lump_fh, ">$lump" or die "Could not create a lump outfile called ($lump) because ($!)\n";

444

open $lumpfa_fh, ">$outfa" or die "Could not create a lump outfile called ($outfa) because ($!)\n";

445

446

}

447

448

# open input file, unzip if req'd

449

if ($stdin) {

450

*FH= *STDIN;

451

} elsif ( $file =~ /\.gz/ ) {

452

open FH, "gunzip -c $file |";

453

}

454

else {

455

open FH, "<$file";

456

}

457

458

my $in = Bio::SeqIO->new(-fh => \*FH, -format => $FORMAT, -debug=>$DEBUG);

459

my $gffio = Bio::Tools::GFF->new( -noparse => 1, -gff_version => $GFF_VERSION );

460

461

while ( my $seq = $in->next_seq() ) {

462

my $seq_name = $seq->accession_number;

463

my $end = $seq->length;

464

my @to_print;

465

466

# arrange disposition of GFF output

467

$outfile = $lump || $outdir . $seq_name . ".gff";

468

my $out;

469

470

if ( $lump ) {

471

$outfile = $lump;

472

$out = $lump_fh;

473

}

474

else {

475

$outfile = $outdir . $seq_name . ".gff";

476

open $out, ">$outfile";

477

}

478

479

# filter out unwanted features

480

my $source_feat= undef;

481

my @source= filter($seq); $source_feat= $source[0];

482

483

($source_type,$source_feat)=

484

getSourceInfo( $seq, $source_type, $source_feat ) ;

485

# always; here we build main prot $source_feat; # if @source;

486

487

# abort if there are no features

488

warn "$seq_name has no features, skipping\n" and next

489

if !$seq->all_SeqFeatures;

490

491

492

$FTSOmap->{'source'}= $source_type;

493

## $FTSOmap->{'CDS'}= $PROTEIN_TYPE; # handle this in gene_features

494

495

# construct a GFF header

496

# add: get source_type from attributes of source feature? chromosome=X tag

497

# also combine 1st ft line here with source ft from $seq ..

498

my($header,$info)= gff_header($seq_name, $end, $source_type, $source_feat);

499

print $out $header;

500

print "# working on $info\n" if($verbose);

501

502

# unflatten gene graphs, apply SO types, etc; this also does TypeMapper ..

503

unflatten_seq($seq);

504

505

506

# Note that we use our own get_all_SeqFeatures function

507

# to rescue cloned exons

508

for my $feature ( get_all_SeqFeatures($seq) ) {

509

510

my $method = $feature->primary_tag;

511

next if($SOURCEID =~/UniProt|swiss|trembl/i && $method ne $source_type);

512

513

$feature->seq_id($seq->id) unless($feature->seq_id);

514

$feature->source_tag($SOURCEID);

515

516

# dgg; need to convert some Genbank to GFF tags: note->Note; db_xref->Dbxref;

517

## also, pull any GO:000 ids from /note tag and put into Ontology_term

518

maptags2gff($feature);

519

520

# current gene name. The unflattened gene features should be in order so any

521

# exons, CDSs, etc that follow will belong to this gene

522

my $gene_name;

523

if ( $method eq 'gene' || $method eq 'pseudogene' ) {

524

@to_print= print_held($out, $gffio, \@to_print);

525

$gene_id = $gene_name= gene_name($feature);

526

} else {

527

$gene_name= gene_name($feature);

528

}

529

530

#?? should gene_name from /locus_tag,/gene,/product,/transposon=xxx

531

# be converted to or added as Name=xxx (if not ID= or as well)

532

## problematic: convert_to_name ($feature); # drops /locus_tag,/gene, tags

533

convert_to_name($feature);

534

535

## dgg: extended to protein|polypeptide

536

## this test ($feature->has_tag('gene') ||) is not good: repeat_regions over genes

537

## in yeast have that genbank tag; why?

538

## these include pseudogene ...

539

540

## Note we also have mapped types to SO, so these RNA's are now transcripts:

541

# pseudomRNA => "pseudogenic_transcript",

542

# pseudotranscript" => "pseudogenic_transcript",

543

# misc_RNA=>'processed_transcript',

544

545

warn "#at: $method $gene_id/$gene_name\n" if $DEBUG;

546

547

548

|| ( $gene_id && $gene_name eq $gene_id ) ) {

549

550

my $action = gene_features($feature, $gene_id, $gene_name); # -1, 0, 1, 2 result

551

if ($action == GM_DUP_PART) {

552

# ignore, this is dupl. exon with new parent ...

553

554

} elsif ($action == GM_NOT_PART) {

555

add_generic_id( $feature, $gene_name, "nocount");

556

my $gff = $gffio->gff_string($feature);

557

push @GFF_LINE_FEAT, $feature;

558

#print $out "$gff\n" if $gff;

559

560

} elsif ($action > 0) {

561

# hold off print because exon etc. may get 2nd, 3rd parents

562

@to_print= print_held($out, $gffio, \@to_print) if ($action == GM_NEW_TOPLEVEL);

563

push(@to_print, $feature);

564

}

565

566

}

567

568

# otherwise handle as generic feats with IDHandler labels

569

else {

570

add_generic_id( $feature, $gene_name, "");

571

my $gff= $gffio->gff_string($feature);

572

push @GFF_LINE_FEAT, $feature;

573

#print $out "$gff\n" if $gff;

574

}

575

}

576

577

# don't like doing this after others; do after each new gene id?

578

@to_print= print_held($out, $gffio, \@to_print);

579

580

gff_validate(@GFF_LINE_FEAT);

581

582

for my $feature (@GFF_LINE_FEAT) {

583

my $gff= $gffio->gff_string($feature);

584

print $out "$gff\n" if $gff;

585

}

586

587

# deal with the corresponding DNA

588

my ($fa_out,$fa_outfile);

589

my $dna = $seq->seq;

590

if($dna || %proteinfa) {

591

$method{'RESIDUES'} += length($dna);

592

$dna =~ s/(\S{60})/$1\n/g;

593

$dna .= "\n";

594

595

if ($split) {

596

$fa_outfile = $outfile;

597

$fa_outfile =~ s/gff$/fa/;

598

open $fa_out, ">$fa_outfile" or die $!;

599

print $fa_out ">$seq_name\n$dna" if $dna;

600

foreach my $aid (sort keys %proteinfa) {

601

my $aa= delete $proteinfa{$aid};

602

$method{'RESIDUES(tr)'} += length($aa);

603

$aa =~ s/(\S{60})/$1\n/g;

604

print $fa_out ">$aid\n$aa\n";

605

}

606

607

}

608

else {

609

## problem here when multiple GB Seqs in one file; all FASTA needs to go at end of $out

610

## see e.g. Mouse: mm_ref_chr19.gbk has NT_082868 and NT_039687 parts in one .gbk

611

## maybe write this to temp .fa then cat to end of lumped gff $out

612

print $lumpfa_fh ">$seq_name\n$dna" if $dna;

613

foreach my $aid (sort keys %proteinfa) {

614

my $aa= delete $proteinfa{$aid};

615

$method{'RESIDUES(tr)'} += length($aa);

616

$aa =~ s/(\S{60})/$1\n/g;

617

print $lumpfa_fh ">$aid\n$aa\n";

618

}

619

}

620

621

%proteinfa=();

622

}

623

624

if ( $zip && !$lump ) {

625

system "gzip -f $outfile";

626

system "gzip -f $fa_outfile" if($fa_outfile);

627

$outfile .= '.gz';

628

$fa_outfile .= '.gz' if $split;

629

}

630

631

# print "\n>EOF\n" if($stdout); #?? need this if summary goes to stdout after FASTA

632

print "# GFF3 saved to $outfile" unless( !$verbose || $stdout || $lump);

633

print ($split ? "; DNA saved to $fa_outfile\n" : "\n") unless($stdout|| $lump);

634

635

# dgg: moved to after all inputs; here it prints cumulative sum for each record

636

# if ( $summary ) {

637

# print "# Summary:\n# Feature\tCount\n# -------\t-----\n";

638

639

# for ( keys %method ) {

640

# print "# $_ $method{$_}\n";

641

# }

642

# print "# \n";

643

# }

644

645

}

646

647

print "# GFF3 saved to $outfile\n" if( $verbose && $lump);

648

if ( $summary ) {

649

print "# Summary:\n# Feature\tCount\n# -------\t-----\n";

650

651

for ( keys %method ) {

652

print "# $_ $method{$_}\n";

653

}

654

print "# \n";

655

}

656

657

## FIXME for piped output w/ split FA files ...

658

close($lumpfa_fh) if $lumpfa_fh;

659

if (!$split && $outfa && $lump_fh) {

660

print $lump_fh "##FASTA\n"; # GFF3 spec

661

open $lumpfa_fh, $outfa or warn "reading FA $outfa: $!";

662

while( <$lumpfa_fh>) { print $lump_fh $_; } # is $lump_fh still open?

663

close($lumpfa_fh); unlink($outfa);

664

}

665

666

667

if ( $zip && $lump ) {

668

system "gzip -f $lump";

669

}

670

671

close FH;

672

}

673

674

675

676

677

678

sub typeorder {

679

return 1 if ($_[0] =~ /gene/);

680

return 2 if ($_[0] =~ /RNA|transcript/);

681

return 3 if ($_[0] =~ /protein|peptide/);

682

return 4 if ($_[0] =~ /exon|CDS/);

683

return 3; # default before exon (smallest part)

684

}

685

686

sub sort_by_feattype {

687

my($at,$bt)= ($a->primary_tag, $b->primary_tag);

688

return (typeorder($at) <=> typeorder($bt))

689

or ($at cmp $bt);

690

## or ($a->name() cmp $b->name());

691

}

692

693

sub print_held {

694

my($out,$gffio,$to_print)= @_;

695

return unless(@$to_print);

696

@$to_print = sort sort_by_feattype @$to_print; # put exons after mRNA, otherwise chado loader chokes

697

while ( my $feature = shift @$to_print) {

698

my $gff= $gffio->gff_string($feature); # $gff =~ s/\'/./g; # dang bug in encode

699

push @GFF_LINE_FEAT, $feature;

700

#print $out "$gff\n";

701

}

702

return (); # @to_print

703

}

704

705

sub maptags2gff {

706

my $f = shift;

707

## should copy/move locus_tag to Alias, if not ID/Name/Alias already

708

# but see below /gene /locus_tag usage

709

foreach my $tag (keys %TAG_MAP) {

710

if ($f->has_tag($tag)) {

711

my $newtag= $TAG_MAP{$tag};

712

my @v= $f->get_tag_values($tag);

713

$f->remove_tag($tag);

714

$f->add_tag_value($newtag,@v);

715

716

## also, pull any GO:000 ids from /note tag and put into Ontology_term

717

## ncbi syntax in CDS /note is now '[goid GO:0005886]' OR '[goid 0005624]'

718

if ($tag eq 'note') {

719

map { s/\[goid (\d+)/\[goid GO:$1/g; } @v;

720

my @go= map { m/(GO:\d+)/g } @v;

721

$f->add_tag_value('Ontology_term',@go) if(@go);

722

}

723

724

}

725

}

726

}

727

728

729

sub getSourceInfo {

730

my ($seq, $source_type, $sf) = @_;

731

732

my $is_swiss= ($SOURCEID =~/UniProt|swiss|trembl/i);

733

my $is_gene = ($SOURCEID =~/entrezgene/i);

734

my $is_rich = (ref($seq) =~ /RichSeq/);

735

my $seq_name= $seq->accession_number();

736

737

unless($sf) { # make one

738

$source_type= $is_swiss ? $PROTEIN_TYPE

739

: $is_gene ? "eneg" # "gene" # "region" #

740

: $is_rich ? $seq->molecule : $source_type;

741

$sf = Bio::SeqFeature::Generic->direct_new();

742

743

my $len = $seq->length(); $len=1 if($len<1); my $start = 1; ##$start= $len if ($len<1);

744

my $loc= $seq->can('location') ? $seq->location()

745

: new Bio::Location::Simple( -start => $start, -end => $len);

746

$sf->location( $loc );

747

$sf->primary_tag($source_type);

748

$sf->source_tag($SOURCEID);

749

$sf->seq_id( $seq_name);

750

#? $sf->display_name($seq->id()); ## Name or Alias ?

751

$sf->add_tag_value( Alias => $seq->id()); # unless id == accession

752

$seq->add_SeqFeature($sf);

753

## $source_feat= $sf;

754

}

755

756

if ($sf->has_tag("chromosome")) {

757

$source_type= "chromosome";

758

my ($chrname) = $sf->get_tag_values("chromosome");

759

## PROBLEM with Name <> ID, RefName for Gbrowse; use Alias instead

760

## e.g. Mouse chr 19 has two IDs in NCBI genbank now

761

$sf->add_tag_value( Alias => $chrname );

762

}

763

764

# pull GB Comment, Description for source ft ...

765

# add reference - can be long, not plain string...

766

warn "# $SOURCEID:$seq_name fields = ", join(",", $seq->annotation->get_all_annotation_keys()),"\n" if $DEBUG;

767

# GenBank fields: keyword,comment,reference,date_changed

768

# Entrezgene fields 850293 =ALIAS_SYMBOL,RefSeq status,chromosome,SGD,dblink,Entrez Gene Status,OntologyTerm,LOCUS_SYNONYM

769

770

# is this just for main $seq object or for all seqfeatures ?

771

my %AnnotTagMap= (

772

'gene_name' => 'Alias',

773

'ALIAS_SYMBOL' => 'Alias', # Entrezgene

774

'LOCUS_SYNONYM' => 'Alias', #?

775

'symbol' => 'Alias',

776

'synonym' => 'Alias',

777

'dblink' => 'Dbxref',

778

'product' => 'product',

779

'Reference' => 'reference',

780

'OntologyTerm' => 'Ontology_term',

781

'comment' => 'Note',

782

'comment1' => 'Note',

783

# various map-type locations

784

# gene accession tag is named per source db !??

785

# 'Index terms' => keywords ??

786

);

787

788

789

my ($desc)= $seq->annotation->get_Annotations("desc") || ( $seq->desc() );

790

my ($date)= $seq->annotation->get_Annotations("dates")

791

|| $seq->annotation->get_Annotations("update-date")

792

|| $is_rich ? $seq->get_dates() : ();

793

my ($comment)= $seq->annotation->get_Annotations("comment");

794

my ($species)= $seq->annotation->get_Annotations("species");

795

if (!$species

796

&& $seq->can('species')

797

&& defined $seq->species()

798

&& $seq->species()->can('binomial') ) {

799

$species= $seq->species()->binomial();

800

}

801

802

# update source feature with main GB fields

803

$sf->add_tag_value( ID => $seq_name ) unless $sf->has_tag('ID');

804

$sf->add_tag_value( Note => $desc ) if($desc && ! $sf->has_tag('Note'));

805

$sf->add_tag_value( organism => $species ) if($species && ! $sf->has_tag('organism'));

806

$sf->add_tag_value( comment1 => $comment ) if(!$is_swiss && $comment && ! $sf->has_tag('comment1'));

807

$sf->add_tag_value( date => $date ) if($date && ! $sf->has_tag('date'));

808

809

$sf->add_tag_value( Dbxref => $SOURCEID.':'.$seq_name ) if $is_swiss || $is_gene;

810

811

foreach my $atag (sort keys %AnnotTagMap) {

812

my $gtag= $AnnotTagMap{$atag}; next unless($gtag);

813

my @anno = map{

814

if (ref $_ && $_->can('get_all_values')) {

815

split( /[,;] */, join ";", $_->get_all_values)

816

}

817

elsif (ref $_ && $_->can('display_text')) {

818

split( /[,;] */, $_->display_text)

819

}

820

elsif (ref $_ && $_->can('value')) {

821

split( /[,;] */, $_->value)

822

}

823

else {

824

();

825

}

826

} $seq->annotation->get_Annotations($atag);

827

foreach(@anno) { $sf->add_tag_value( $gtag => $_ ); }

828

}

829

830

# my @genes = map{ split( /[,;] */, "$_"); } $seq->annotation->get_Annotations('gene_name');

831

# $sf->add_tag_value( Alias => $_ ) foreach(@genes);

832

833

# my @dblink= map { "$_"; } $seq->annotation->get_Annotations("dblink"); # add @all

834

# $sf->add_tag_value( Dbxref => $_ ) foreach(@dblink);

835

836

837

return (wantarray)? ($source_type,$sf) : $source_type; #?

838

}

839

840

841

sub gene_features {

842

my ($f, $gene_id, $genelinkID) = @_;

843

local $_ = $f->primary_tag;

844

$method{$_}++;

845

846

if ( /gene/ ) {

847

$f->add_tag_value( ID => $gene_id ) unless($f->has_tag('ID')); # check is same value!?

848

$tnum = $rnum= 0; $ncrna_id= $rna_id = '';

849

return GM_NEW_TOPLEVEL;

850

851

} elsif ( /mRNA/ ) {

852

return GM_NOT_PART unless $gene_id;

853

return GM_NOT_PART if($genelinkID && $genelinkID ne $gene_id);

854

($rna_id = $gene_id ) =~ s/gene/mRNA/;

855

$rna_id .= '.t0' . ++$tnum;

856

$f->add_tag_value( ID => $rna_id );

857

$f->add_tag_value( Parent => $gene_id );

858

859

} elsif ( /RNA|transcript/) {

860

## misc_RNA here; missing exons ... flattener problem?

861

# all of {t,nc,sn}RNA can have gene models now

862

## but problem in Worm chr: mRNA > misc_RNA > CDS with same locus tag

863

## CDS needs to use mRNA, not misc_RNA, rna_id ...

864

## also need to fix cases where tRNA,... lack a 'gene' parent: make this one top-level

865

866

if($gene_id) {

867

return GM_NOT_PART if($genelinkID && $genelinkID ne $gene_id);

868

($ncrna_id = $gene_id) =~ s/gene/ncRNA/;

869

$ncrna_id .= '.r0' . ++$rnum;

870

$f->add_tag_value( Parent => $gene_id );

871

$f->add_tag_value( ID => $ncrna_id );

872

} else {

873

unless ($f->has_tag('ID')) {

874

if($genelinkID) {

875

$f->add_tag_value( ID => $genelinkID ) ;

876

} else {

877

$idh->generate_unique_persistent_id($f);

878

}

879

}

880

($ncrna_id)= $f->get_tag_values('ID');

881

return GM_NEW_TOPLEVEL;

882

# this feat now acts as gene-top-level; need to print @to_print to flush prior exons?

883

}

884

885

} elsif ( /exon/ ) { # can belong to any kind of RNA

886

return GM_NOT_PART unless ($rna_id||$ncrna_id);

887

return GM_NOT_PART if($genelinkID && $genelinkID ne $gene_id);

888

## we are getting duplicate Parents here, which chokes chado loader, with reason...

889

## problem is when mRNA and ncRNA have same exons, both ids are active, called twice

890

## check all Parents

891

for my $expar ($rna_id, $ncrna_id) {

892

next unless($expar);

893

if ( $exonpar{$expar} and $f->has_tag('Parent') ) {

894

my @vals = $f->get_tag_values('Parent');

895

next if (grep {$expar eq $_} @vals);

896

}

897

$exonpar{$expar}++;

898

$f->add_tag_value( Parent => $expar);

899

# last; #? could be both

900

}

901

# now we can skip cloned exons

902

# dgg note: multiple parents get added and printed for each unique exon

903

return GM_DUP_PART if ++$seen{$f} > 1;

904

905

} elsif ( /CDS|protein|polypeptide/ ) {

906

return GM_NOT_PART unless $rna_id; ## ignore $ncrna_id ??

907

return GM_NOT_PART if($genelinkID && $genelinkID ne $gene_id); #??

908

(my $pro_id = $rna_id) =~ s/\.t/\.p/;

909

910

if( ! $CDSkeep && /CDS/) {

911

$f->primary_tag($PROTEIN_TYPE);

912

913

## duplicate problem is Location ..

914

if ($f->location->isa("Bio::Location::SplitLocationI")) {

915

# my($b,$e)=($f->start, $f->end); # is this all we need?

916

my($b,$e)=(-1,0);

917

foreach my $l ($f->location->each_Location) {

918

$b = $l->start if($b<0 || $b > $l->start);

919

$e = $l->end if($e < $l->end);

920

}

921

$f->location( Bio::Location::Simple->new(

922

-start => $b, -end => $e, -strand => $f->strand) );

923

}

924

925

$f->add_tag_value( Derives_from => $rna_id );

926

}

927

else {

928

$f->add_tag_value( Parent => $rna_id );

929

}

930

931

$f->add_tag_value( ID => $pro_id );

932

933

move_translation_fasta($f, $pro_id);

934

# if( $f->has_tag('translation')) {

935

# my ($aa) = $f->get_tag_values("translation");

936

# $proteinfa{$pro_id}= $aa;

937

# $f->remove_tag("translation");

938

# $f->add_tag_value("translation","length.".length($aa)); # hack for odd chado gbl problem

939

# }

940

} elsif ( /region/ ) {

941

$f->primary_tag('gene_component_region');

942

$f->add_tag_value( Parent => $gene_id );

943

} else {

944

return GM_NOT_PART unless $gene_id;

945

$f->add_tag_value( Parent => $gene_id );

946

}

947

948

## return GM_DUP_PART if /exon/ && ++$seen{$f} > 1;

949

950

return GM_NEW_PART;

951

}

952

953

## was generic_features > add_generic_id

954

sub add_generic_id {

955

my ($f, $ft_name, $flags) = @_;

956

my $method = $f->primary_tag;

957

$method{$method}++ unless($flags =~ /nocount/); ## double counts GM_NOT_PART from above

958

959

if ($f->has_tag('ID')) {

960

961

}

962

elsif ( $f->has_tag($method) ) {

963

my ($name) = $f->get_tag_values($method);

964

$f->add_tag_value( ID => "$method:$name" );

965

}

966

elsif($ft_name) { # is this unique ?

967

$f->add_tag_value( ID => $ft_name );

968

}

969

else {

970

$idh->generate_unique_persistent_id($f);

971

}

972

973

move_translation_fasta( $f, ($f->get_tag_values("ID"))[0] )

974

if($method =~ /CDS/);

975

976

# return $io->gff_string($f);

977

}

978

979

sub move_translation_fasta {

980

my ($f, $ft_id) = @_;

981

if( $ft_id && $f->has_tag('translation') ) {

982

my ($aa) = $f->get_tag_values("translation");

983

if($aa && $aa !~ /^length/) {

984

$proteinfa{$ft_id}= $aa;

985

$f->remove_tag("translation");

986

$f->add_tag_value("translation","length.".length($aa)); # hack for odd chado gbl problem

987

}

988

}

989

}

990

991

sub gff_header {

992

my ($name, $end, $source_type, $source_feat) = @_;

993

$source_type ||= "region";

994

995

my $info= "$source_type:$name";

996

my $head= "##gff-version $GFF_VERSION\n# sequence-region $name 1 $end\n";

997

$head .= "# conversion-by bp_genbank2gff3.pl\n";

998

if($source_feat) {

999

## dgg: these header comment fields are not useful when have multi-records, diff organisms

1000

for my $key (qw(organism date Note)) {

1001

my($value) = $source_feat->get_tag_values($key);

1002

$head .= "# $key $value\n" if($value);

1003

$info .= ", $value" if($value);

1004

}

1005

$head="" if($didheader);

1006

} else {

1007

$head .= "$name\t$SOURCEID\t$source_type\t1\t$end\t.\t.\t.\tID=$name\n";

1008

}

1009

$didheader++;

1010

return (wantarray) ? ($head,$info) : $head;

1011

}

1012

1013

sub unflatten_seq {

1014

my $seq = shift;

1015

1016

## print "# working on $source_type:", $seq->accession, "\n";

1017

my $uh_oh = "Possible gene unflattening error with" . $seq->accession_number .

1018

": consult STDERR\n";

1019

1020

eval {

1021

$unflattener->unflatten_seq( -seq => $seq,

1022

-noinfer => $noinfer,

1023

-use_magic => 1 );

1024

};

1025

1026

# deal with unflattening errors

1027

if ( $@ ) {

1028

warn $seq->accession_number . " Unflattening error:\n";

1029

warn "Details: $@\n";

1030

print "# ".$uh_oh;

1031

}

1032

1033

return 0 if !$seq || !$seq->all_SeqFeatures;

1034

1035

# map feature types to the sequence ontology

1036

## $tm->map_types_to_SO( -seq => $seq );

1037

#$tm->map_types( -seq => $seq, -type_map => $FTSOmap, -undefined => "region" ); #dgg

1038

1039

map_types(

1040

$tm,

1041

-seq => $seq,

1042

-type_map => $FTSOmap,

1043

-syn_map => $FTSOsynonyms,

1044

-undefined => "region"

1045

); #nml

1046

1047

}

1048

1049

sub filter {

1050

my $seq = shift;

1051

## return unless $filter;

1052

my @feats;

1053

my @sources; # dgg; pick source features here; only 1 always?

1054

if ($filter) {

1055

for my $f ( $seq->remove_SeqFeatures ) {

1056

my $m = $f->primary_tag;

1057

push @sources, $f if ($m eq 'source'); # dgg? but leave in @feats ?

1058

push @feats, $f unless $filter =~ /$m/i;

1059

}

1060

$seq->add_SeqFeature(@feats) if @feats;

1061

} else {

1062

for my $f ( $seq->get_SeqFeatures ){

1063

my $m = $f->primary_tag;

1064

push @sources, $f if ($m eq 'source'); # dgg? but leave in @feats ?

1065

}

1066

}

1067

1068

return @sources;

1069

}

1070

1071

1072

# The default behaviour of Bio::FeatureHolderI:get_all_SeqFeatures

1073

# changed to filter out cloned features. We have to implement the old

1074

# method. These two subroutines were adapted from the v1.4 Bio::FeatureHolderI

1075

sub get_all_SeqFeatures {

1076

my $seq = shift;

1077

my @flatarr;

1078

1079

foreach my $feat ( $seq->get_SeqFeatures ){

1080

push(@flatarr,$feat);

1081

_add_flattened_SeqFeatures(\@flatarr,$feat);

1082

}

1083

return @flatarr;

1084

}

1085

1086

sub gene_name {

1087

my $g = shift;

1088

my $gene_id = ''; # zero it;

1089

1090

if ($g->has_tag('locus_tag')) {

1091

($gene_id) = $g->get_tag_values('locus_tag');

1092

}

1093

1094

elsif ($g->has_tag('gene')) {

1095

($gene_id) = $g->get_tag_values('gene');

1096

}

1097

elsif ($g->has_tag('ID')) { # for non-Genbank > Entrezgene

1098

($gene_id) = $g->get_tag_values('ID');

1099

}

1100

1101

## See Unflattener comment:

1102

# on rare occasions, records will have no /gene or /locus_tag

1103

# but it WILL have /product tags. These serve the same purpose

1104

# for grouping. For an example, see AY763288 (also in t/data)

1105

# eg. product=tRNA-Asp ; product=similar to crooked neck protein

1106

elsif ($g->has_tag('product')) {

1107

my ($name)= $g->get_tag_values('product');

1108

($gene_id) = $name unless($name =~ / /); # a description not name

1109

}

1110

1111

## dgg; also handle transposon=xxxx ID/name

1112

# ID=GenBank:repeat_region:NC_004353:1278337:1281302;transposon=HeT-A{}1685;Dbxref=FLYBASE:FBti0059746

1113

elsif ($g->has_tag('transposon')) {

1114

my ($name)= $g->get_tag_values('transposon');

1115

($gene_id) = $name unless($name =~ / /); # a description not name

1116

}

1117

1118

return $gene_id;

1119

}

1120

1121

# same list as gene_name .. change tag to generic Name

1122

sub convert_to_name {

1123

my $g = shift;

1124

my $gene_id = ''; # zero it;

1125

1126

if ($g->has_tag('gene')) {

1127

($gene_id) = $g->get_tag_values('gene');

1128

$g->remove_tag('gene');

1129

$g->add_tag_value('Name', $gene_id);

1130

}

1131

elsif ($g->has_tag('locus_tag')) {

1132

($gene_id) = $g->get_tag_values('locus_tag');

1133

$g->remove_tag('locus_tag');

1134

$g->add_tag_value('Name', $gene_id);

1135

}

1136

1137

elsif ($g->has_tag('product')) {

1138

my ($name)= $g->get_tag_values('product');

1139

($gene_id) = $name unless($name =~ / /); # a description not name

1140

## $g->remove_tag('product');

1141

$g->add_tag_value('Name', $gene_id);

1142

}

1143

1144

elsif ($g->has_tag('transposon')) {

1145

my ($name)= $g->get_tag_values('transposon');

1146

($gene_id) = $name unless($name =~ / /); # a description not name

1147

## $g->remove_tag('transposon');

1148

$g->add_tag_value('Name', $gene_id);

1149

}

1150

elsif ($g->has_tag('ID')) {

1151

my ($name)= $g->get_tag_values('ID');

1152

$g->add_tag_value('Name', $name);

1153

}

1154

return $gene_id;

1155

}

1156

1157

1158

sub _add_flattened_SeqFeatures {

1159

my ($arrayref,$feat) = @_;

1160

my @subs = ();

1161

1162

if ($feat->isa("Bio::FeatureHolderI")) {

1163

@subs = $feat->get_SeqFeatures;

1164

}

1165

elsif ($feat->isa("Bio::SeqFeatureI")) {

1166

@subs = $feat->sub_SeqFeature;

1167

}

1168

else {

1169

warn ref($feat)." is neither a FeatureHolderI nor a SeqFeatureI. ".

1170

"Don't know how to flatten.";

1171

}

1172

1173

for my $sub (@subs) {

1174

push(@$arrayref,$sub);

1175

_add_flattened_SeqFeatures($arrayref,$sub);

1176

}

1177

1178

}

1179

1180

sub map_types {

1181

1182

my ($self, @args) = @_;

1183

1184

my($sf, $seq, $type_map, $syn_map, $undefmap) =

1185

$self->_rearrange([qw(FEATURE

1186

SEQ

1187

TYPE_MAP

1188

SYN_MAP

1189

UNDEFINED

1190

)],

1191

@args);

1192

1193

if (!$sf && !$seq) {

1194

$self->throw("you need to pass in either -feature or -seq");

1195

}

1196

1197

my @sfs = ($sf);

1198

if ($seq) {

1199

$seq->isa("Bio::SeqI") || $self->throw("$seq NOT A SeqI");

1200

@sfs = $seq->get_all_SeqFeatures;

1201

}

1202

$type_map = $type_map || $self->typemap; # dgg: was type_map;

1203

foreach my $feat (@sfs) {

1204

1205

$feat->isa("Bio::SeqFeatureI") || $self->throw("$feat NOT A SeqFeatureI");

1206

$feat->isa("Bio::FeatureHolderI") || $self->throw("$feat NOT A FeatureHolderI");

1207

1208

my $primary_tag = $feat->primary_tag;

1209

1210

#if ($primary_tag =~ /^pseudo(.*)$/) {

1211

#$primary_tag = $1;

1212

#$feat->primary_tag($primary_tag);

1213

1214

1215

my $mtype = $type_map->{$primary_tag};

1216

if ($mtype) {

1217

if (ref($mtype)) {

1218

if (ref($mtype) eq 'ARRAY') {

1219

my $soID;

1220

($mtype, $soID) = @$mtype;

1221

1222

if ($soID && ref($ONTOLOGY)) {

1223

my ($term) = $ONTOLOGY->find_terms(-identifier => $soID);

1224

$mtype = $term->name if $term;

1225

}

1226

# if SO ID is undefined AND we have an ontology to search, we want to delete

1227

# the feature type hash entry in order to force a fuzzy search

1228

elsif (! defined $soID && ref($ONTOLOGY)) {

1229

undef $mtype;

1230

delete $type_map->{$primary_tag};

1231

}

1232

elsif ($undefmap && $mtype eq 'undefined') { # dgg

1233

$mtype= $undefmap;

1234

}

1235

1236

$type_map->{$primary_tag} = $mtype if $mtype;

1237

}

1238

elsif (ref($mtype) eq 'CODE') {

1239

$mtype = $mtype->($feat);

1240

}

1241

else {

1242

$self->throw('must be scalar or CODE ref');

1243

}

1244

}

1245

elsif ($undefmap && $mtype eq 'undefined') { # dgg

1246

$mtype= $undefmap;

1247

}

1248

$feat->primary_tag($mtype);

1249

}

1250

1251

if ($CONF) {

1252

conf_read();

1253

my %perfect_matches;

1254

while (my ($p_tag,$rules) = each %$YAML) {

1255

RULE:

1256

for my $rule (@$rules) {

1257

for my $tags (@$rule) {

1258

while (my ($tag,$values) = each %$tags) {

1259

for my $value (@$values) {

1260

if ($feat->has_tag($tag)) {

1261

for ($feat->get_tag_values($tag)) {

1262

next RULE unless $_ =~ /\Q$value\E/;

1263

}

1264

} elsif ($tag eq 'primary_tag') {

1265

next RULE unless $value eq

1266

$feat->primary_tag;

1267

} elsif ($tag eq 'location') {

1268

next RULE unless $value eq

1269

$feat->start.'..'.$feat->end;

1270

} else { next RULE }

1271

}

1272

}

1273

}

1274

$perfect_matches{$p_tag}++;

1275

}

1276

}

1277

if (scalar(keys %perfect_matches) == 1) {

1278

$mtype = $_ for keys %perfect_matches;

1279

} elsif (scalar(keys %perfect_matches) > 1) {

1280

warn "There are conflicting rules in the config file for the" .

1281

" following types: ";

1282

warn "\t$_\n" for keys %perfect_matches;

1283

warn "Until conflict resolution is built into the converter," .

1284

" you will have to manually edit the config file to remove the" .

1285

" conflict. Sorry :(. Skipping user preference for this entry";

1286

sleep(2);

1287

}

1288

}

1289

1290

if ( ! $mtype && $syn_map) {

1291

if ($feat->has_tag('note')) {

1292

1293

my @all_matches;

1294

1295

my @note = $feat->each_tag_value('note');

1296

1297

for my $k (keys %$syn_map) {

1298

1299

if ($k =~ /"(.+)"/) {

1300

1301

my $syn = $1;

1302

1303

for my $note (@note) {

1304

1305

# look through the notes to see if the description

1306

# is an exact match for synonyms

1307

if ( $syn eq $note ) {

1308

1309

my @map = @{$syn_map->{$k}};

1310

1311

1312

my $best_guess = $map[0];

1313

1314

unshift @{$all_matches[-1]}, [$best_guess];

1315

1316

$mtype = $MANUAL

1317

? manual_curation($feat, $best_guess, \@all_matches)

1318

: $best_guess;

1319

1320

print '#' x 78 . "\nGuessing the proper SO term for GenBank"

1321

. " entry:\n\n" . GenBank_entry($feat) . "\nis:\t$mtype\n"

1322

. '#' x 78 . "\n\n";

1323

1324

} else {

1325

# check both primary tag and and note against

1326

# SO synonyms for best matching description

1327

1328

SO_fuzzy_match( $k, $primary_tag, $note, $syn, \@all_matches);

1329

}

1330

1331

}

1332

}

1333

}

1334

1335

#unless ($mtype) {

1336

for my $note (@note) {

1337

for my $name (values %$type_map) {

1338

# check primary tag against SO names for best matching

1339

# descriptions //NML also need to check against

1340

# definition && camel case split terms

1341

1342

SO_fuzzy_match($name, $primary_tag, $note, $name, \@all_matches);

1343

}

1344

}

1345

1346

1347

if (scalar(@all_matches) && !$mtype) {

1348

1349

my $top_matches = first { defined $_ } @{$all_matches[-1]};

1350

1351

my $best_guess = $top_matches->[0];

1352

1353

1354

1355

# if guess has quotes, it is a synonym term. we need to

1356

# look up the corresponding name term

1357

# otherwise, guess is a name, so we can use it directly

1358

if ($best_guess =~ /"(.+)"/) {

1359

1360

$best_guess = $syn_map->{$best_guess}->[0];

1361

1362

}

1363

1364

@RETURN = @all_matches;

1365

$mtype = $MANUAL

1366

? manual_curation($feat, $best_guess, \@all_matches)

1367

: $best_guess;

1368

1369

print '#' x 78 . "\nGuessing the proper SO term for GenBank"

1370

. " entry:\n\n" . GenBank_entry($feat) . "\nis:\t$mtype\n"

1371

. '#' x 78 . "\n\n";

1372

1373

}

1374

}

1375

$mtype ||= $undefmap;

1376

$feat->primary_tag($mtype);

1377

}

1378

}

1379

1380

1381

}

1382

1383

sub SO_fuzzy_match {

1384

1385

my $candidate = shift;

1386

my $primary_tag = shift;

1387

my $note = shift;

1388

my $SO_terms = shift;

1389

my $best_matches_ref = shift;

1390

my $modifier = shift;

1391

1392

$modifier ||= '';

1393

1394

my @feat_terms;

1395

1396

for ( split(" |_", $primary_tag) ) {

1397

#my @camelCase = /(?:[A-Z]|[a-z])(?:[A-Z]+|[a-z]*)(?=$|[A-Z])/g;

1398

my @camelCase = /(?:[A-Z]|[a-z])(?:[A-Z]+|[a-z]*)(?=$|[A-Z]|[;:.,])/g;

1399

push @feat_terms, @camelCase;

1400

}

1401

1402

for ( split(" |_", $note) ) {

1403

#my @camelCase = /(?:[A-Z]|[a-z])(?:[A-Z]+|[a-z]*)(?=$|[A-Z])/g;

1404

#my @camelCase = /(?:[A-Z]|[a-z])(?:[A-Z]+|[a-z]*)(?=$|[A-Z]|[;:.,])/g;

1405

(my $word = $_) =~ s/[;:.,]//g;

1406

push @feat_terms, $word;

1407

}

1408

1409

1410

my @SO_terms = split(" |_", $SO_terms);

1411

1412

# fuzzy match works on a simple point system. When 2 words match,

1413

# the $plus counter adds one. When they don't, the $minus counter adds

1414

# one. This is used to sort similar matches together. Better matches

1415

# are found at the end of the array, near the top.

1416

1417

# NML: can we improve best match by using synonym tags

1418

# EXACT,RELATED,NARROW,BROAD?

1419

1420

my ($plus, $minus) = (0, 0);

1421

my %feat_terms;

1422

my %SO_terms;

1423

1424

#unique terms

1425

map {$feat_terms{$_} = 1} @feat_terms;

1426

map {$SO_terms{$_} = 1} @SO_terms;

1427

1428

for my $st (keys %SO_terms) {

1429

for my $ft (keys %feat_terms) {

1430

1431

($st =~ m/$modifier\Q$ft\E/) ? $plus++ : $minus++;

1432

1433

}

1434

}

1435

1436

push @{$$best_matches_ref[$plus][$minus]}, $candidate if $plus;

1437

1438

}

1439

1440

sub manual_curation {

1441

1442

my ($feat, $default_opt, $all_matches) = @_;

1443

1444

my @all_matches = @$all_matches;

1445

1446

# convert all SO synonyms into names and filter

1447

# all matches into unique term list because

1448

# synonyms can map to multiple duplicate names

1449

1450

my (@unique_SO_terms, %seen);

1451

for (reverse @all_matches) {

1452

for (@$_) {

1453

for (@$_) {

1454

#my @names;

1455

if ($_ =~ /"(.+)"/) {

1456

for (@{$SYN_MAP->{$_}}) {

1457

push @unique_SO_terms, $_ unless $seen{$_};

1458

$seen{$_}++;

1459

}

1460

} else {

1461

push @unique_SO_terms, $_ unless $seen{$_};

1462

$seen{$_}++;

1463

}

1464

}

1465

}

1466

}

1467

1468

my $s = scalar(@unique_SO_terms);

1469

1470

my $choice = 0;

1471

1472

my $more =

1473

"[a]uto : automatic input (selects best guess for remaining entries)\r" .

1474

"[f]ind : search for other SO terms matching your query (e.g. f gene)\r" .

1475

"[i]nput : add a specific term\r" .

1476

"[r]eset : reset to the beginning of matches\r" .

1477

"[s]kip : skip this entry (selects best guess for this entry)\r"

1478

;

1479

1480

$more .=

1481

"[n]ext : view the next ".OPTION_CYCLE." terms\r" .

1482

"[p]rev : view the previous ".OPTION_CYCLE." terms" if ($s > OPTION_CYCLE);

1483

1484

my $msg = #"\n\n" . '-' x 156 . "\n"

1485

"The converter found $s possible matches for the following GenBank entry: ";

1486

1487

my $directions =

1488

"Type a number to select the SO term that best matches"

1489

. " the genbank entry, or use any of the following options:\r" . '_' x 76 . "\r$more";

1490

1491

1492

# lookup filtered list to pull out definitions

1493

my @options = map {

1494

my $term = $_;

1495

my %term;

1496

for (['name', 'name'], ['def', 'definition'], ['synonym',

1497

'each_synonym']) {

1498

my ($label, $method) = @$_;

1499

$term{$label} = \@{[$term->$method]};

1500

}

1501

[++$choice, $_->name, ($_->definition || 'none'), \%term, $_->each_synonym ];

1502

} map { $ONTOLOGY->find_terms(-name => $_) } @unique_SO_terms;

1503

1504

1505

my $option = options_cycle(0, OPTION_CYCLE, $msg, $feat, $directions,

1506

$default_opt, @options);

1507

1508

if ($option eq 'skip') { return $default_opt

1509

} elsif ($option eq 'auto') {

1510

$MANUAL = 0;

1511

return $default_opt;

1512

} else { return $option }

1513

1514

}

1515

1516

sub options_cycle {

1517

1518

my ($start, $stop, $msg, $feat, $directions, $best_guess, @opt) = @_;

1519

1520

#NML: really should only call GenBank_entry once. Will need to change

1521

#method to return array & shift off header

1522

my $entry = GenBank_entry($feat, "\r");

1523

1524

my $total = scalar(@opt);

1525

1526

($start,$stop) = (0, OPTION_CYCLE)

1527

if ( ($start < 0) && ($stop > 0) );

1528

1529

($start,$stop) = (0, OPTION_CYCLE)

1530

if ( ( ($stop - $start) < OPTION_CYCLE ) && $stop < $total);

1531

1532

($start,$stop) = ($total - OPTION_CYCLE, $total) if $start < 0;

1533

($start,$stop) = (0, OPTION_CYCLE) if $start >= $total;

1534

1535

$stop = $total if $stop > $total;

1536

1537

my $dir_copy = $directions;

1538

my $msg_copy = $msg;

1539

my $format = "format STDOUT = \n" .

1540

'-' x 156 . "\n" .

1541

'^' . '<' x 77 . '| Available Commands:' . "\n" .

1542

'$msg_copy' . "\n" .

1543

'-' x 156 . "\n" .

1544

' ' x 78 . "|\n" .

1545

'^' . '<' x 77 . '| ^' . '<' x 75 . '~' . "\n" .

1546

'$entry' . ' ' x 74 . '$dir_copy,' . "\n" .

1547

(' ' x 20 . '^' . '<' x 57 . '| ^' . '<' x 75 . '~' . "\n" .

1548

' ' x 20 . '$entry,' . ' ' x 53 . '$dir_copy,' . "\n") x 1000 . ".\n";

1549

1550

{

1551

# eval throws redefined warning that breaks formatting.

1552

# Turning off warnings just for the eval to fix this.

1553

local $^W = 0;

1554

eval $format;

1555

}

1556

1557

write;

1558

1559

print '-' x 156 . "\n" .

1560

'Showing results ' . ( $stop ? ( $start + 1 ) : $start ) .

1561

" - $stop of possible SO term matches: (best guess is \"$best_guess\")" .

1562

"\n" . '-' x 156 . "\n";

1563

1564

for (my $i = $start; $i < $stop; $i+=2) {

1565

1566

my ($left, $right) = @opt[$i,$i+1];

1567

1568

my ($nL, $nmL, $descL, $termL, @synL) = @$left;

1569

1570

#odd numbered lists can cause fatal undefined errors, so check

1571

#to make sure we have data

1572

1573

my ($nR, $nmR, $descR, $termR, @synR) = ref($right) ? @$right : (undef, undef, undef);

1574

1575

1576

my $format = "format STDOUT = \n";

1577

1578

$format .=

1579

' ' x 78 . "|\n" .

1580

1581

'@>>: name: ^' . '<' x 64 . '~' . ' |' .

1582

( ref($right) ? ('@>>: name: ^' . '<' x 64 . '~' ) : '' ) . "\n" .

1583

'$nL,' . ' ' x 7 . '$nmL,' .

1584

( ref($right) ? (' ' x 63 . '$nR,' . ' ' x 7 . "\$nmR,") : '' ) . "\n" .

1585

1586

' ' x 11 . '^' . '<' x 61 . '...~' . ' |' .

1587

(ref($right) ? (' ^' . '<' x 61 . '...~') : '') . "\n" .

1588

' ' x 11 . '$nmL,' .

1589

(ref($right) ? (' ' x 74 . '$nmR,') : '') . "\n" .

1590

#' ' x 78 . '|' . "\n" .

1591

1592

1593

' def: ^' . '<' x 65 . ' |' .

1594

(ref($right) ? (' def: ^' . '<' x 64 . '~') : '') . "\n" .

1595

' ' x 11 . '$descL,' .

1596

(ref($right) ? (' ' x 72 . '$descR,') : '') . "\n" .

1597

1598

1599

(' ^' . '<' x 65 . ' |' .

1600

(ref($right) ? (' ^' . '<' x 64 . '~') : '') . "\n" .

1601

' ' x 11 . '$descL,' .

1602

(ref($right) ? (' ' x 72 . '$descR,') : '') . "\n") x 5 .

1603

1604

1605

' ^' . '<' x 61 . '...~ |' .

1606

(ref($right) ? (' ^' . '<' x 61 . '...~') : '') . "\n" .

1607

' ' x 11 . '$descL,' .

1608

(ref($right) ? (' ' x 72 . '$descR,') : '') . "\n" .

1609

1610

".\n";

1611

1612

{

1613

# eval throws redefined warning that breaks formatting.

1614

# Turning off warnings just for the eval to fix this.

1615

local $^W = 0;

1616

eval $format;

1617

}

1618

write;

1619

1620

}

1621

print '-' x 156 . "\nenter a command:";

1622

1623

while (<STDIN>) {

1624

1625

(my $input = $_) =~ s/\s+$//;

1626

1627

if ($input =~ /^\d+$/) {

1628

if ( $input && defined $opt[$input-1] ) {

1629

return $opt[$input-1]->[1]

1630

} else {

1631

print "\nThat number is not an option. Please enter a valid number.\n:";

1632

}

1633

} elsif ($input =~ /^n/i | $input =~ /next/i ) {

1634

return options_cycle($start + OPTION_CYCLE, $stop + OPTION_CYCLE, $msg,

1635

$feat, $directions, $best_guess, @opt)

1636

} elsif ($input =~ /^p/i | $input =~ /prev/i ) {

1637

return options_cycle($start - OPTION_CYCLE, $stop - OPTION_CYCLE, $msg,

1638

$feat, $directions, $best_guess, @opt)

1639

} elsif ( $input =~ /^s/i || $input =~ /skip/i ) { return 'skip'

1640

} elsif ( $input =~ /^a/i || $input =~ /auto/i ) { return 'auto'

1641

} elsif ( $input =~ /^r/i || $input =~ /reset/i ) {

1642

return manual_curation($feat, $best_guess, \@RETURN );

1643

} elsif ( $input =~ /^f/i || $input =~ /find/i ) {

1644

1645

my ($query, @query_results);

1646

1647

if ($input =~ /(?:^f|find)\s+?(.*?)$/) { $query = $1;

1648

} else {

1649

1650

#do a SO search

1651

print "Type your search query\n:";

1652

while (<STDIN>) { chomp($query = $_); last }

1653

}

1654

1655

for (keys(%$TYPE_MAP), keys(%$SYN_MAP)) {

1656

SO_fuzzy_match($_, $query, '', $_, \@query_results, '(?i)');

1657

}

1658

1659

return manual_curation($feat, $best_guess, \@query_results);

1660

1661

} elsif ( $input =~ /^i/i || $input =~ /input/i ) {

1662

1663

#NML fast input for later

1664

#my $query;

1665

#if ($input =~ /(?:^i|input)\s+?(.*?)$/) { $query = $1 };

1666

1667

#manual input

1668

print "Type the term you want to use\n:";

1669

while (<STDIN>) {

1670

chomp(my $input = $_);

1671

1672

if (! $TYPE_MAP->{$input}) {

1673

1674

print "\"$input\" doesn't appear to be a valid SO term. Are ".

1675

"you sure you want to use it? (y or n)\n:";

1676

1677

while (<STDIN>) {

1678

1679

chomp(my $choice = $_);

1680

1681

if ($choice eq 'y') {

1682

1683

"\nWould you like to save your preference for " .

1684

"future use (so you don't have to redo manual " .

1685

"curation for this feature everytime you run " .

1686

"the converter)? (y or n)\n";

1687

1688

#NML: all these while loops are a mess. Really should condense it.

1689

while (<STDIN>) {

1690

1691

chomp(my $choice = $_);

1692

1693

if ($choice eq 'y') {

1694

curation_save($feat, $input);

1695

return $input;

1696

} elsif ($choice eq 'n') {

1697

return $input

1698

} else {

1699

print "\nDidn't recognize that command. Please " .

1700

"type y or n.\n:"

1701

}

1702

}

1703

1704

1705

} elsif ($choice eq 'n') {

1706

return options_cycle($start, $stop, $msg, $feat,

1707

$directions, $best_guess, @opt)

1708

} else {

1709

print "\nDidn't recognize that command. Please " .

1710

"type y or n.\n:"

1711

}

1712

}

1713

1714

} else {

1715

1716

"\nWould you like to save your preference for " .

1717

"future use (so you don't have to redo manual " .

1718

"curation for this feature everytime you run " .

1719

"the converter)? (y or n)\n";

1720

1721

#NML: all these while loops are a mess. Really should condense it.

1722

while (<STDIN>) {

1723

1724

chomp(my $choice = $_);

1725

1726

if ($choice eq 'y') {

1727

curation_save($feat, $input);

1728

return $input;

1729

} elsif ($choice eq 'n') {

1730

return $input

1731

} else {

1732

print "\nDidn't recognize that command. Please " .

1733

"type y or n.\n:"

1734

}

1735

}

1736

1737

}

1738

1739

}

1740

} else {

1741

print "\nDidn't recognize that command. Please re-enter your choice.\n:"

1742

}

1743

}

1744

1745

}

1746

1747

sub GenBank_entry {

1748

my ($f, $delimiter, $num) = @_;

1749

1750

$delimiter ||= "\n";

1751

1752

1753

my $entry =

1754

1755

($num ? ' [1] ' : ' ' x 5) . $f->primary_tag

1756

. ($num

1757

? ' ' x (12 - length $f->primary_tag ) . ' [2] '

1758

: ' ' x (15 - length $f->primary_tag)

1759

)

1760

. $f->start.'..'.$f->end

1761

1762

. "$delimiter";

1763

1764

if ($num) {

1765

words_tag($f, \$entry);

1766

} else {

1767

for my $tag ($f->all_tags) {

1768

for my $val ( $f->each_tag_value($tag) ) {

1769

$entry .= ' ' x 20;

1770

#$entry .= "/$tag=\"$val\"$delimiter";

1771

$entry .= $val eq '_no_value'

1772

? "/$tag$delimiter"

1773

: "/$tag=\"$val\"$delimiter";

1774

}

1775

}

1776

1777

}

1778

1779

return $entry;

1780

}

1781

1782

1783

sub gff_validate {

1784

warn "Validating GFF file\n" if $DEBUG;

1785

my @feat = @_;

1786

1787

my (%parent2child, %all_ids, %descendants, %reserved);

1788

1789

for my $f (@feat) {

1790

for my $aTags (['Parent', \%parent2child], ['ID', \%all_ids]) {

1791

map { push @{$$aTags[1]->{$_}}, $f } $f->get_tag_values($$aTags[0])

1792

if $f->has_tag($$aTags[0]);

1793

}

1794

}

1795

1796

if ($SO_FILE) {

1797

while (my ($parentID, $aChildren) = each %parent2child) {

1798

parent_validate($parentID, $aChildren, \%all_ids, \%descendants, \%reserved);

1799

}

1800

}

1801

1802

id_validate(\%all_ids, \%reserved);

1803

}

1804

1805

sub parent_validate {

1806

my ($parentID, $aChildren, $hAll, $hDescendants, $hReserved) = @_;

1807

1808

my $aParents = $hAll->{$parentID};

1809

1810

map {

1811

my $child = $_;

1812

$child->add_tag_value( validation_error =>

1813

"feature tried to add Parent tag, but no Parent found with ID $parentID"

1814

);

1815

my %parents;

1816

map { $parents{$_} = 1 } $child->get_tag_values('Parent');

1817

delete $parents{$parentID};

1818

my @parents = keys %parents;

1819

1820

$child->remove_tag('Parent');

1821

1822

unless ($child->has_tag('ID')) {

1823

my $id = gene_name($child);

1824

$child->add_tag_value('ID', $id);

1825

push @{$hAll->{$id}}, $child

1826

}

1827

1828

$child->add_tag_value('Parent', @parents) if @parents;

1829

1830

} @$aChildren and return unless scalar(@$aParents);

1831

1832

my $par = join(',', map { $_->primary_tag } @$aParents);

1833

warn scalar(@$aParents)." POSSIBLE PARENT(S): $par" if $DEBUG;

1834

1835

#NML manual curation needs to happen here

1836

1837

1838

my %parentsToRemove;

1839

1840

CHILD:

1841

for my $child (@$aChildren) {

1842

my $childType = $child->primary_tag;

1843

1844

warn "WORKING ON $childType at ".$child->start.' to '.$child->end

1845

if $DEBUG;

1846

1847

for (my $i = 0; $i < scalar(@$aParents); $i++) {

1848

my $parent = $aParents->[$i];

1849

my $parentType = $parent->primary_tag;

1850

1851

warn "CHECKING $childType against $parentType" if $DEBUG;

1852

1853

#cache descendants so we don't have to do repeat searches

1854

unless ($hDescendants->{$parentType}) {

1855

1856

for my $term ($ONTOLOGY->find_terms(

1857

-name => $parentType

1858

) ) {

1859

1860

map {

1861

$hDescendants->{$parentType}{$_->name}++

1862

} $ONTOLOGY->get_descendant_terms($term);

1863

1864

}

1865

1866

# NML: hopefully temporary fix.

1867

# SO doesn't consider exon/CDS to be a child of mRNA

1868

# even though common knowledge dictates that they are

1869

# This cheat fixes the false positives for now

1870

if ($parentType eq 'mRNA') {

1871

$hDescendants->{$parentType}{'exon'} = 1;

1872

$hDescendants->{$parentType}{'CDS'} = 1;

1873

}

1874

1875

}

1876

1877

warn "\tCAN $childType at " . $child->start . ' to ' . $child->end .

1878

" be a child of $parentType?" if $DEBUG;

1879

if ($hDescendants->{$parentType}{$childType}) {

1880

warn "\tYES, $childType can be a child of $parentType" if $DEBUG;

1881

1882

#NML need to deal with multiple children matched to multiple different

1883

#parents. This model only assumes the first parent id that matches a child will

1884

#be the reserved feature.

1885

1886

$hReserved->{$parentID}{$parent}{'parent'} = $parent;

1887

push @{$hReserved->{$parentID}{$parent}{'children'}}, $child;

1888

1889

#mark parent for later removal from all IDs

1890

#so we don't accidentally change any parents

1891

1892

$parentsToRemove{$i}++;

1893

1894

next CHILD;

1895

}

1896

}

1897

1898

1899

1900

#NML shouldn't have to check this; somehow child can lose Parent

1901

#it's happening W3110

1902

#need to track this down

1903

if ( $child->has_tag('Parent') ) {

1904

1905

warn "\tNO, @{[$child->primary_tag]} cannot be a child of $parentID"

1906

if $DEBUG;

1907

1908

my %parents;

1909

1910

map { $parents{$_} = 1 } $child->get_tag_values('Parent');

1911

1912

delete $parents{$parentID};

1913

my @parents = keys %parents;

1914

1915

warn 'VALIDATION ERROR '.$child->primary_tag." at ".$child->start .

1916

' to ' . $child->end . " cannot be a child of ID $parentID"

1917

if $DEBUG;

1918

1919

$child->add_tag_value( validation_error =>

1920

"feature cannot be a child of $parentID");

1921

1922

$child->remove_tag('Parent');

1923

1924

unless ($child->has_tag('ID')) {

1925

my $id = gene_name($child);

1926

$child->add_tag_value('ID', $id);

1927

push @{$hAll->{$id}}, $child

1928

}

1929

1930

$child->add_tag_value('Parent', @parents) if @parents;

1931

}

1932

1933

}

1934

1935

#delete $aParents->[$_] for keys %parentsToRemove;

1936

splice(@$aParents, $_, 1) for keys %parentsToRemove;

1937

}

1938

1939

sub id_validate {

1940

my ($hAll, $hReserved) = @_;

1941

1942

1943

for my $id (keys %$hAll) {

1944

1945

#since 1 feature can have this id,

1946

#let's just shift it off and uniquify

1947

#the rest (unless it's reserved)

1948

1949

shift @{$hAll->{$id}} unless $hReserved->{$id};

1950

for my $feat (@{$hAll->{$id}}) {

1951

id_uniquify(0, $id, $feat, $hAll);

1952

}

1953

}

1954

1955

for my $parentID (keys %$hReserved) {

1956

1957

my @keys = keys %{$hReserved->{$parentID}};

1958

1959

shift @keys;

1960

1961

for my $k (@keys) {

1962

my $parent = $hReserved->{$parentID}{$k}{'parent'};

1963

my $aChildren= $hReserved->{$parentID}{$k}{'children'};

1964

1965

my $value = id_uniquify(0, $parentID, $parent, $hAll);

1966

for my $child (@$aChildren) {

1967

1968

my %parents;

1969

map { $parents{$_}++ } $child->get_tag_values('Parent');

1970

$child->remove_tag('Parent');

1971

delete $parents{$parentID};

1972

$parents{$value}++;

1973

my @parents = keys %parents;

1974

$child->add_tag_value('Parent', @parents);

1975

}

1976

1977

}

1978

}

1979

}

1980

1981

sub id_uniquify {

1982

my ($count, $value, $feat, $hAll) = @_;

1983

1984

warn "UNIQUIFYING $value" if $DEBUG;

1985

1986

if (! $count) {

1987

$feat->add_tag_value(Alias => $value);

1988

$value .= ('.' . $feat->primary_tag)

1989

} elsif ($count == 1) {

1990

$value .= ".$count"

1991

} else {

1992

chop $value;

1993

$value .= $count

1994

}

1995

$count++;

1996

1997

warn "ENDED UP WITH $value" if $DEBUG;

1998

if ( $hAll->{$value} ) {

1999

warn "$value IS ALREADY TAKEN" if $DEBUG;

2000

id_uniquify($count, $value, $feat, $hAll);

2001

} else {

2002

#warn "something's breaking ".$feat->primary_tag.' at '.$feat->start.' to '.$feat->end;

2003

$feat->remove_tag('ID');

2004

$feat->add_tag_value('ID', $value);

2005

push @{$hAll->{$value}}, $value;

2006

}

2007

2008

$value;

2009

}

2010

2011

sub conf_read {

2012

2013

print "\nCannot read $CONF. Change file permissions and retry, " .

2014

"or enter another file\n" and conf_locate() unless -r $CONF;

2015

2016

print "\nCannot write $CONF. Change file permissions and retry, " .

2017

"or enter another file\n" and conf_locate() unless -w $CONF;

2018

2019

$YAML = LoadFile($CONF);

2020

2021

}

2022

2023

sub conf_create {

2024

2025

my ($path, $input) = @_;

2026

2027

print "Cannot write to $path. Change directory permissions and retry " .

2028

"or enter another save path\n" and conf_locate() unless -w $path;

2029

2030

$CONF = $input;

2031

2032

open(FH, '>', $CONF);

2033

close(FH);

2034

conf_read();

2035

2036

2037

}

2038

2039

sub conf_write { DumpFile($CONF, $YAML) }

2040

2041

sub conf_locate {

2042

2043

print "\nEnter the location of a previously saved config, or a new " .

2044

"path and file name to create a new config (this step is " .

2045

"necessary to save any preferences)";

2046

2047

print "\n\nenter a command:";

2048

2049

while (<STDIN>) {

2050

chomp(my $input = $_);

2051

my ($fn, $path, $suffix) = fileparse($input, qr/\.[^.]*/);

2052

2053

if (-e $input && (! -d $input)) {

2054

2055

print "\nReading $input...\n";

2056

$CONF = $input;

2057

2058

conf_read();

2059

last;

2060

2061

} elsif (! -d $input && $fn.$suffix) {

2062

2063

print "Creating $input...\n";

2064

conf_create($path, $input);

2065

last;

2066

2067

} elsif (-e $input && -d $input) {

2068

print "You only entered a directory. " .

2069

"Please enter BOTH a directory and filename\n";

2070

} else {

2071

print "$input does not appear to be a valid path. Please enter a " .

2072

"valid directory and filename\n";

2073

}

2074

print "\nenter a command:";

2075

}

2076

}

2077

2078

sub curation_save {

2079

2080

my ($feat, $input) = @_;

2081

2082

#my $error = "Enter the location of a previously saved config, or a new " .

2083

#"path and file name to create a new config (this step is " .

2084

#"necessary to save any preferences)\n";

2085

2086

if (!$CONF) {

2087

print "\n\n";

2088

conf_locate();

2089

} elsif (! -e $CONF) {

2090

print "\n\nThe config file you have chosen doesn't exist.\n";

2091

conf_locate();

2092

} else { conf_read() }

2093

2094

my $entry = GenBank_entry($feat, "\r", 1);

2095

2096

my $msg = "Term entered: $input";

2097

my $directions = "Please select any/all tags that provide evidence for the term you

2098

have entered. You may enter multiple tags by separating them by commas/dashes

2099

(e.g 1,3,5-7). For tags with more than one word value (i.e 'note'), you have

2100

the option of either selecting the entire note as evidence, or specific

2101

keywords. If a tag has multiple keywords, they will be tagged alphabetically

2102

for selection. To select a specific keyword in a tag field, you must enter the

2103

tag number followed by the keyword letter (e.g 3a). Multiple keywords may be

2104

selected by entering each letter separated by commas/dashes (e.g 3b,f,4a-c). The more tags you select, the more specific the GenBank entry will have

2105

to be to match your curation. To match the GenBank entry exactly as it

2106

appears, type every number (start-end), or just type 'all'. Remember, once the converter saves your

2107

preference, you will no longer be prompted to choose a feature type for any

2108

matching entries until you edit the curation.ini file.";

2109

my $msg_copy = $msg;

2110

my $dir_copy = $directions;

2111

2112

my $format = "format STDOUT = \n" .

2113

'-' x 156 . "\n" .

2114

'^' . '<' x 77 . '| Directions:' . "\n" .

2115

'$msg_copy' . "\n" .

2116

'-' x 156 . "\n" .

2117

' ' x 78 . "|\n" .

2118

'^' . '<' x 77 . '| ^' . '<' x 75 . '~' . "\n" .

2119

'$entry' . ' ' x 74 . '$dir_copy,' . "\n" .

2120

(' ' x 15 . '^' . '<' x 62 . '| ^' . '<' x 75 . '~' . "\n" .

2121

' ' x 15 . '$entry,' . ' ' x 58 . '$dir_copy,' . "\n") x 20 . ".\n";

2122

2123

{

2124

# eval throws redefined warning that breaks formatting.

2125

# Turning off warnings just for the eval to fix this.

2126

local $^W = 0;

2127

eval $format;

2128

}

2129

2130

write;

2131

print '-' x 156 . "\nenter a command:";

2132

2133

my @tags = words_tag($feat);

2134

2135

my $final = {};

2136

my $choices;

2137

while (<STDIN>) {

2138

2139

chomp(my $choice = $_);

2140

2141

if (scalar(keys %$final) && $choice =~ /^y/i) { last

2142

2143

} elsif (scalar(keys %$final) && $choice =~ /^n/i) { curation_save($feat, $input)

2144

2145

} elsif (scalar(keys %$final)) { print "\nInvalid selection. Please try again\n";

2146

2147

} elsif ($choice eq 'all') {

2148

2149

$choice = '';

2150

for (my $i=1; $i < scalar(@tags); $i++) {

2151

$choice .= "$i,";

2152

}

2153

chop $choice;

2154

}

2155

# print "CHOICE [$choice]";

2156

2157

my @selections;

2158

for (split(/(?<=\w)[^[:alnum:]\-]+(?=\d)/, $choice)) {

2159

if ($_ =~ /(\d+)(?:\D*)-(\d+)(.*)/) {

2160

2161

for ($1..$2) { push @selections, $_ }

2162

2163

my $dangling_alphas = $3;

2164

alpha_expand($dangling_alphas, \@selections);

2165

2166

} else {

2167

alpha_expand($_, \@selections);

2168

}

2169

}

2170

2171

foreach my $numbers (@selections) {

2172

2173

my @c = split(/(?=[\w])/, $numbers);

2174

s/\W+//g foreach @c;

2175

my $num;

2176

2177

{

2178

$^W = 0;

2179

$num = 0 + shift @c;

2180

}

2181

2182

my $tag = $tags[$num];

2183

if (ref $tag && scalar(@c)) {

2184

my $no_value;

2185

foreach (@c) {

2186

if (defined $tag->{$_}) {

2187

$choices .= "${num}[$_] ";

2188

my ($t,$v) = @{$tag->{$_}};

2189

push @{${$final->{$input}}[0]{$t}}, $v;

2190

2191

} else { $no_value++ }

2192

}

2193

2194

if ($no_value) {

2195

_selection_add($tag,$final,$input,\$choices,$num);

2196

#my ($t,$v) = @{$tag->{'all'}};

2197

#unless (defined ${$final->{$input}}[0]{$t}) {

2198

#$choices .= "$num, ";

2199

#push @{${$final->{$input}}[0]{$t}}, $v

2200

2201

}

2202

2203

$choices = substr($choices, 0, -2);

2204

$choices .= ', ';

2205

2206

} elsif (ref $tag) {

2207

_selection_add($tag,$final,$input,\$choices,$num);

2208

#my ($t,$v) = @{$tag->{'all'}};

2209

#unless (defined ${$final->{$input}}[0]{$t}) {

2210

#$choices .= "$num, ";

2211

#push @{${$final->{$input}}[0]{$t}}, $v

2212

2213

}

2214

}

2215

$choices = substr($choices, 0, -2) if $choices;

2216

if ($final) {

2217

print "\nYou have chosen the following tags:\n$choices\n";

2218

print "This will be written to the config file as:\n";

2219

print Dump $final;

2220

print "\nIs this correct? (y or n)\n";

2221

} else { print "\nInvalid selection. Please try again\n" }

2222

}

2223

push @{$YAML->{$input}}, $final->{$input};

2224

conf_write();

2225

}

2226

2227

# words_tag() splits each tag value string into multiple words so that the

2228

# user can select the parts he/she wants to use for curation

2229

# it can tag 702 (a - zz) separate words; this should be enough

2230

2231

sub words_tag {

2232

2233

my ($feat, $entry) = @_;

2234

2235

my @tags;

2236

2237

@tags[1,2] = ({'all' => ['primary_tag', $feat->primary_tag]}, {'all' => ['location', $feat->start.'..'.$feat->end]});

2238

my $i = 3;

2239

foreach my $tag ($feat->all_tags) {

2240

foreach my $value ($feat->each_tag_value($tag)) {

2241

2242

my ($string, $tagged_string);

2243

2244

my @words = split(/(?=\w+?)/, $value);

2245

2246

my $pos = 0;

2247

2248

2249

foreach my $word (@words) {

2250

2251

(my $sanitized_word = $word) =~ s/\W+?//g;

2252

$string .= $word;

2253

2254

my $lead = int($pos/ALPHABET_DIVISOR);

2255

my $lag = $pos % ALPHABET_DIVISOR;

2256

2257

my $a = $lead ? ${(ALPHABET)}[$lead-1] : '';

2258

$a .= $lag ? ${(ALPHABET)}[$lag] : 'a';

2259

2260

$tagged_string .= " ($a) $word";

2261

2262

$tags[$i]{$a} = [$tag, $sanitized_word];

2263

$pos++;

2264

}

2265

2266

$value = $tagged_string if scalar(@words) > 1;

2267

2268

$$entry .= "[$i] /$tag=\"$value\"\r";

2269

2270

$tags[$i]{'all'} = [$tag, $string];

2271

}

2272

$i++;

2273

}

2274

2275

return @tags;

2276

2277

}

2278

2279

sub alpha_expand {

2280

2281

my ($dangling_alphas, $selections) = @_;

2282

2283

if (defined($dangling_alphas) && $dangling_alphas =~ /(\d)*([[:alpha:]]+)-([[:alpha:]]+)/) {

2284

2285

my $digit = $1;

2286

push @$selections, $digit if $digit;

2287

2288

my $start = $2;

2289

my $stop = $3;

2290

2291

my @starts = split('', $start);

2292

my @stops = split('', $stop);

2293

2294

my ($final_start, $final_stop);

2295

2296

for ([\$final_start, \@starts], [\$final_stop, \@stops]) {

2297

2298

my ($final, $splits) = @$_;

2299

2300

my $int = ${(ALPHABET_TO_NUMBER)}{$$splits[0]};

2301

my $rem;

2302

2303

2304

if ($$splits[1]) {

2305

$rem = ${(ALPHABET_TO_NUMBER)}{$$splits[1]};

2306

$int++

2307

} else {

2308

$rem = $int;

2309

$int = 0;

2310

}

2311

2312

2313

$$final = $int * ALPHABET_DIVISOR;

2314

$$final += $rem;

2315

2316

}

2317

2318

my $last_number = pop @$selections;

2319

for my $pos ($final_start..$final_stop) {

2320

my $lead = int($pos/ALPHABET_DIVISOR);

2321

my $lag = $pos % ALPHABET_DIVISOR;

2322

my $alpha = $lead ? ${(ALPHABET)}[$lead-1] : '';

2323

$alpha .= $lag ? ${(ALPHABET)}[$lag] : 'a';

2324

push @$selections, $last_number.$alpha;

2325

}

2326

2327

} elsif (defined($dangling_alphas)) {

2328

if ($dangling_alphas =~ /^\d/) {

2329

push @$selections, $dangling_alphas;

2330

} elsif ($dangling_alphas =~ /^\D/) {

2331

#print "$dangling_alphas ".Dumper @$selections;

2332

my $last_number = pop @$selections;

2333

$last_number ||= '';

2334

push @$selections, $last_number.$dangling_alphas;

2335

#$$selections[-1] .= $dangling_alphas;

2336

}

2337

}

2338

2339

}

2340

2341

sub _selection_add {

2342

2343

my ($tag, $final, $input, $choices, $num) = @_;

2344

my ($t,$v) = @{$tag->{'all'}};

2345

unless (defined ${$final->{$input}}[0]{$t}) {

2346

$$choices .= "$num, ";

2347

push @{${$final->{$input}}[0]{$t}}, $v

2348

}

2349

2350

}

Older »