1182
my ($self, @args) = @_;
1184
my($sf, $seq, $type_map, $syn_map, $undefmap) =
1185
$self->_rearrange([qw(FEATURE
1193
if (!$sf && !$seq) {
1194
$self->throw("you need to pass in either -feature or -seq");
1199
$seq->isa("Bio::SeqI") || $self->throw("$seq NOT A SeqI");
1200
@sfs = $seq->get_all_SeqFeatures;
1202
$type_map = $type_map || $self->typemap; # dgg: was type_map;
1203
foreach my $feat (@sfs) {
1205
$feat->isa("Bio::SeqFeatureI") || $self->throw("$feat NOT A SeqFeatureI");
1206
$feat->isa("Bio::FeatureHolderI") || $self->throw("$feat NOT A FeatureHolderI");
1208
my $primary_tag = $feat->primary_tag;
1210
#if ($primary_tag =~ /^pseudo(.*)$/) {
1212
#$feat->primary_tag($primary_tag);
1215
my $mtype = $type_map->{$primary_tag};
1218
if (ref($mtype) eq 'ARRAY') {
1220
($mtype, $soID) = @$mtype;
1222
if ($soID && ref($ONTOLOGY)) {
1223
my ($term) = $ONTOLOGY->find_terms(-identifier => $soID);
1224
$mtype = $term->name if $term;
1226
# if SO ID is undefined AND we have an ontology to search, we want to delete
1227
# the feature type hash entry in order to force a fuzzy search
1228
elsif (! defined $soID && ref($ONTOLOGY)) {
1230
delete $type_map->{$primary_tag};
1232
elsif ($undefmap && $mtype eq 'undefined') { # dgg
1236
$type_map->{$primary_tag} = $mtype if $mtype;
1238
elsif (ref($mtype) eq 'CODE') {
1239
$mtype = $mtype->($feat);
1242
$self->throw('must be scalar or CODE ref');
1245
elsif ($undefmap && $mtype eq 'undefined') { # dgg
1248
$feat->primary_tag($mtype);
1253
my %perfect_matches;
1254
while (my ($p_tag,$rules) = each %$YAML) {
1256
for my $rule (@$rules) {
1257
for my $tags (@$rule) {
1258
while (my ($tag,$values) = each %$tags) {
1259
for my $value (@$values) {
1260
if ($feat->has_tag($tag)) {
1261
for ($feat->get_tag_values($tag)) {
1262
next RULE unless $_ =~ /\Q$value\E/;
1264
} elsif ($tag eq 'primary_tag') {
1265
next RULE unless $value eq
1267
} elsif ($tag eq 'location') {
1268
next RULE unless $value eq
1269
$feat->start.'..'.$feat->end;
1270
} else { next RULE }
1274
$perfect_matches{$p_tag}++;
1277
if (scalar(keys %perfect_matches) == 1) {
1278
$mtype = $_ for keys %perfect_matches;
1279
} elsif (scalar(keys %perfect_matches) > 1) {
1280
warn "There are conflicting rules in the config file for the" .
1281
" following types: ";
1282
warn "\t$_\n" for keys %perfect_matches;
1283
warn "Until conflict resolution is built into the converter," .
1284
" you will have to manually edit the config file to remove the" .
1285
" conflict. Sorry :(. Skipping user preference for this entry";
1290
if ( ! $mtype && $syn_map) {
1291
if ($feat->has_tag('note')) {
1295
my @note = $feat->each_tag_value('note');
1297
for my $k (keys %$syn_map) {
1299
if ($k =~ /"(.+)"/) {
1303
for my $note (@note) {
1305
# look through the notes to see if the description
1306
# is an exact match for synonyms
1307
if ( $syn eq $note ) {
1309
my @map = @{$syn_map->{$k}};
1312
my $best_guess = $map[0];
1314
unshift @{$all_matches[-1]}, [$best_guess];
1317
? manual_curation($feat, $best_guess, \@all_matches)
1320
print '#' x 78 . "\nGuessing the proper SO term for GenBank"
1321
. " entry:\n\n" . GenBank_entry($feat) . "\nis:\t$mtype\n"
1322
. '#' x 78 . "\n\n";
1325
# check both primary tag and and note against
1326
# SO synonyms for best matching description
1328
SO_fuzzy_match( $k, $primary_tag, $note, $syn, \@all_matches);
1336
for my $note (@note) {
1337
for my $name (values %$type_map) {
1338
# check primary tag against SO names for best matching
1339
# descriptions //NML also need to check against
1340
# definition && camel case split terms
1342
SO_fuzzy_match($name, $primary_tag, $note, $name, \@all_matches);
1347
if (scalar(@all_matches) && !$mtype) {
1349
my $top_matches = first { defined $_ } @{$all_matches[-1]};
1351
my $best_guess = $top_matches->[0];
1355
# if guess has quotes, it is a synonym term. we need to
1356
# look up the corresponding name term
1357
# otherwise, guess is a name, so we can use it directly
1358
if ($best_guess =~ /"(.+)"/) {
1360
$best_guess = $syn_map->{$best_guess}->[0];
1364
@RETURN = @all_matches;
1366
? manual_curation($feat, $best_guess, \@all_matches)
1369
print '#' x 78 . "\nGuessing the proper SO term for GenBank"
1370
. " entry:\n\n" . GenBank_entry($feat) . "\nis:\t$mtype\n"
1371
. '#' x 78 . "\n\n";
1375
$mtype ||= $undefmap;
1376
$feat->primary_tag($mtype);
1383
sub SO_fuzzy_match {
1385
my $candidate = shift;
1386
my $primary_tag = shift;
1388
my $SO_terms = shift;
1389
my $best_matches_ref = shift;
1390
my $modifier = shift;
1396
for ( split(" |_", $primary_tag) ) {
1397
#my @camelCase = /(?:[A-Z]|[a-z])(?:[A-Z]+|[a-z]*)(?=$|[A-Z])/g;
1398
my @camelCase = /(?:[A-Z]|[a-z])(?:[A-Z]+|[a-z]*)(?=$|[A-Z]|[;:.,])/g;
1399
push @feat_terms, @camelCase;
1402
for ( split(" |_", $note) ) {
1403
#my @camelCase = /(?:[A-Z]|[a-z])(?:[A-Z]+|[a-z]*)(?=$|[A-Z])/g;
1404
#my @camelCase = /(?:[A-Z]|[a-z])(?:[A-Z]+|[a-z]*)(?=$|[A-Z]|[;:.,])/g;
1405
(my $word = $_) =~ s/[;:.,]//g;
1406
push @feat_terms, $word;
1410
my @SO_terms = split(" |_", $SO_terms);
1412
# fuzzy match works on a simple point system. When 2 words match,
1413
# the $plus counter adds one. When they don't, the $minus counter adds
1414
# one. This is used to sort similar matches together. Better matches
1415
# are found at the end of the array, near the top.
1417
# NML: can we improve best match by using synonym tags
1418
# EXACT,RELATED,NARROW,BROAD?
1420
my ($plus, $minus) = (0, 0);
1425
map {$feat_terms{$_} = 1} @feat_terms;
1426
map {$SO_terms{$_} = 1} @SO_terms;
1428
for my $st (keys %SO_terms) {
1429
for my $ft (keys %feat_terms) {
1431
($st =~ m/$modifier\Q$ft\E/) ? $plus++ : $minus++;
1436
push @{$$best_matches_ref[$plus][$minus]}, $candidate if $plus;
1440
sub manual_curation {
1442
my ($feat, $default_opt, $all_matches) = @_;
1444
my @all_matches = @$all_matches;
1446
# convert all SO synonyms into names and filter
1447
# all matches into unique term list because
1448
# synonyms can map to multiple duplicate names
1450
my (@unique_SO_terms, %seen);
1451
for (reverse @all_matches) {
1455
if ($_ =~ /"(.+)"/) {
1456
for (@{$SYN_MAP->{$_}}) {
1457
push @unique_SO_terms, $_ unless $seen{$_};
1461
push @unique_SO_terms, $_ unless $seen{$_};
1468
my $s = scalar(@unique_SO_terms);
1473
"[a]uto : automatic input (selects best guess for remaining entries)\r" .
1474
"[f]ind : search for other SO terms matching your query (e.g. f gene)\r" .
1475
"[i]nput : add a specific term\r" .
1476
"[r]eset : reset to the beginning of matches\r" .
1477
"[s]kip : skip this entry (selects best guess for this entry)\r"
1481
"[n]ext : view the next ".OPTION_CYCLE." terms\r" .
1482
"[p]rev : view the previous ".OPTION_CYCLE." terms" if ($s > OPTION_CYCLE);
1484
my $msg = #"\n\n" . '-' x 156 . "\n"
1485
"The converter found $s possible matches for the following GenBank entry: ";
1488
"Type a number to select the SO term that best matches"
1489
. " the genbank entry, or use any of the following options:\r" . '_' x 76 . "\r$more";
1492
# lookup filtered list to pull out definitions
1496
for (['name', 'name'], ['def', 'definition'], ['synonym',
1498
my ($label, $method) = @$_;
1499
$term{$label} = \@{[$term->$method]};
1501
[++$choice, $_->name, ($_->definition || 'none'), \%term, $_->each_synonym ];
1502
} map { $ONTOLOGY->find_terms(-name => $_) } @unique_SO_terms;
1505
my $option = options_cycle(0, OPTION_CYCLE, $msg, $feat, $directions,
1506
$default_opt, @options);
1508
if ($option eq 'skip') { return $default_opt
1509
} elsif ($option eq 'auto') {
1511
return $default_opt;
1512
} else { return $option }
1518
my ($start, $stop, $msg, $feat, $directions, $best_guess, @opt) = @_;
1520
#NML: really should only call GenBank_entry once. Will need to change
1521
#method to return array & shift off header
1522
my $entry = GenBank_entry($feat, "\r");
1524
my $total = scalar(@opt);
1526
($start,$stop) = (0, OPTION_CYCLE)
1527
if ( ($start < 0) && ($stop > 0) );
1529
($start,$stop) = (0, OPTION_CYCLE)
1530
if ( ( ($stop - $start) < OPTION_CYCLE ) && $stop < $total);
1532
($start,$stop) = ($total - OPTION_CYCLE, $total) if $start < 0;
1533
($start,$stop) = (0, OPTION_CYCLE) if $start >= $total;
1535
$stop = $total if $stop > $total;
1537
my $dir_copy = $directions;
1538
my $msg_copy = $msg;
1539
my $format = "format STDOUT = \n" .
1541
'^' . '<' x 77 . '| Available Commands:' . "\n" .
1542
'$msg_copy' . "\n" .
1545
'^' . '<' x 77 . '| ^' . '<' x 75 . '~' . "\n" .
1546
'$entry' . ' ' x 74 . '$dir_copy,' . "\n" .
1547
(' ' x 20 . '^' . '<' x 57 . '| ^' . '<' x 75 . '~' . "\n" .
1548
' ' x 20 . '$entry,' . ' ' x 53 . '$dir_copy,' . "\n") x 1000 . ".\n";
1551
# eval throws redefined warning that breaks formatting.
1552
# Turning off warnings just for the eval to fix this.
1559
print '-' x 156 . "\n" .
1560
'Showing results ' . ( $stop ? ( $start + 1 ) : $start ) .
1561
" - $stop of possible SO term matches: (best guess is \"$best_guess\")" .
1562
"\n" . '-' x 156 . "\n";
1564
for (my $i = $start; $i < $stop; $i+=2) {
1566
my ($left, $right) = @opt[$i,$i+1];
1568
my ($nL, $nmL, $descL, $termL, @synL) = @$left;
1570
#odd numbered lists can cause fatal undefined errors, so check
1571
#to make sure we have data
1573
my ($nR, $nmR, $descR, $termR, @synR) = ref($right) ? @$right : (undef, undef, undef);
1576
my $format = "format STDOUT = \n";
1581
'@>>: name: ^' . '<' x 64 . '~' . ' |' .
1582
( ref($right) ? ('@>>: name: ^' . '<' x 64 . '~' ) : '' ) . "\n" .
1583
'$nL,' . ' ' x 7 . '$nmL,' .
1584
( ref($right) ? (' ' x 63 . '$nR,' . ' ' x 7 . "\$nmR,") : '' ) . "\n" .
1586
' ' x 11 . '^' . '<' x 61 . '...~' . ' |' .
1587
(ref($right) ? (' ^' . '<' x 61 . '...~') : '') . "\n" .
1588
' ' x 11 . '$nmL,' .
1589
(ref($right) ? (' ' x 74 . '$nmR,') : '') . "\n" .
1590
#' ' x 78 . '|' . "\n" .
1593
' def: ^' . '<' x 65 . ' |' .
1594
(ref($right) ? (' def: ^' . '<' x 64 . '~') : '') . "\n" .
1595
' ' x 11 . '$descL,' .
1596
(ref($right) ? (' ' x 72 . '$descR,') : '') . "\n" .
1599
(' ^' . '<' x 65 . ' |' .
1600
(ref($right) ? (' ^' . '<' x 64 . '~') : '') . "\n" .
1601
' ' x 11 . '$descL,' .
1602
(ref($right) ? (' ' x 72 . '$descR,') : '') . "\n") x 5 .
1605
' ^' . '<' x 61 . '...~ |' .
1606
(ref($right) ? (' ^' . '<' x 61 . '...~') : '') . "\n" .
1607
' ' x 11 . '$descL,' .
1608
(ref($right) ? (' ' x 72 . '$descR,') : '') . "\n" .
1613
# eval throws redefined warning that breaks formatting.
1614
# Turning off warnings just for the eval to fix this.
1621
print '-' x 156 . "\nenter a command:";
1625
(my $input = $_) =~ s/\s+$//;
1627
if ($input =~ /^\d+$/) {
1628
if ( $input && defined $opt[$input-1] ) {
1629
return $opt[$input-1]->[1]
1631
print "\nThat number is not an option. Please enter a valid number.\n:";
1633
} elsif ($input =~ /^n/i | $input =~ /next/i ) {
1634
return options_cycle($start + OPTION_CYCLE, $stop + OPTION_CYCLE, $msg,
1635
$feat, $directions, $best_guess, @opt)
1636
} elsif ($input =~ /^p/i | $input =~ /prev/i ) {
1637
return options_cycle($start - OPTION_CYCLE, $stop - OPTION_CYCLE, $msg,
1638
$feat, $directions, $best_guess, @opt)
1639
} elsif ( $input =~ /^s/i || $input =~ /skip/i ) { return 'skip'
1640
} elsif ( $input =~ /^a/i || $input =~ /auto/i ) { return 'auto'
1641
} elsif ( $input =~ /^r/i || $input =~ /reset/i ) {
1642
return manual_curation($feat, $best_guess, \@RETURN );
1643
} elsif ( $input =~ /^f/i || $input =~ /find/i ) {
1645
my ($query, @query_results);
1647
if ($input =~ /(?:^f|find)\s+?(.*?)$/) { $query = $1;
1651
print "Type your search query\n:";
1652
while (<STDIN>) { chomp($query = $_); last }
1655
for (keys(%$TYPE_MAP), keys(%$SYN_MAP)) {
1656
SO_fuzzy_match($_, $query, '', $_, \@query_results, '(?i)');
1659
return manual_curation($feat, $best_guess, \@query_results);
1661
} elsif ( $input =~ /^i/i || $input =~ /input/i ) {
1663
#NML fast input for later
1665
#if ($input =~ /(?:^i|input)\s+?(.*?)$/) { $query = $1 };
1668
print "Type the term you want to use\n:";
1670
chomp(my $input = $_);
1672
if (! $TYPE_MAP->{$input}) {
1674
print "\"$input\" doesn't appear to be a valid SO term. Are ".
1675
"you sure you want to use it? (y or n)\n:";
1679
chomp(my $choice = $_);
1681
if ($choice eq 'y') {
1683
"\nWould you like to save your preference for " .
1684
"future use (so you don't have to redo manual " .
1685
"curation for this feature everytime you run " .
1686
"the converter)? (y or n)\n";
1688
#NML: all these while loops are a mess. Really should condense it.
1691
chomp(my $choice = $_);
1693
if ($choice eq 'y') {
1694
curation_save($feat, $input);
1696
} elsif ($choice eq 'n') {
1699
print "\nDidn't recognize that command. Please " .
1705
} elsif ($choice eq 'n') {
1706
return options_cycle($start, $stop, $msg, $feat,
1707
$directions, $best_guess, @opt)
1709
print "\nDidn't recognize that command. Please " .
1716
"\nWould you like to save your preference for " .
1717
"future use (so you don't have to redo manual " .
1718
"curation for this feature everytime you run " .
1719
"the converter)? (y or n)\n";
1721
#NML: all these while loops are a mess. Really should condense it.
1724
chomp(my $choice = $_);
1726
if ($choice eq 'y') {
1727
curation_save($feat, $input);
1729
} elsif ($choice eq 'n') {
1732
print "\nDidn't recognize that command. Please " .
1741
print "\nDidn't recognize that command. Please re-enter your choice.\n:"
1748
my ($f, $delimiter, $num) = @_;
1750
$delimiter ||= "\n";
1755
($num ? ' [1] ' : ' ' x 5) . $f->primary_tag
1757
? ' ' x (12 - length $f->primary_tag ) . ' [2] '
1758
: ' ' x (15 - length $f->primary_tag)
1760
. $f->start.'..'.$f->end
1765
words_tag($f, \$entry);
1767
for my $tag ($f->all_tags) {
1768
for my $val ( $f->each_tag_value($tag) ) {
1770
#$entry .= "/$tag=\"$val\"$delimiter";
1771
$entry .= $val eq '_no_value'
1773
: "/$tag=\"$val\"$delimiter";
1784
warn "Validating GFF file\n" if $DEBUG;
1787
my (%parent2child, %all_ids, %descendants, %reserved);
1790
for my $aTags (['Parent', \%parent2child], ['ID', \%all_ids]) {
1791
map { push @{$$aTags[1]->{$_}}, $f } $f->get_tag_values($$aTags[0])
1792
if $f->has_tag($$aTags[0]);
1797
while (my ($parentID, $aChildren) = each %parent2child) {
1798
parent_validate($parentID, $aChildren, \%all_ids, \%descendants, \%reserved);
1802
id_validate(\%all_ids, \%reserved);
1805
sub parent_validate {
1806
my ($parentID, $aChildren, $hAll, $hDescendants, $hReserved) = @_;
1808
my $aParents = $hAll->{$parentID};
1812
$child->add_tag_value( validation_error =>
1813
"feature tried to add Parent tag, but no Parent found with ID $parentID"
1816
map { $parents{$_} = 1 } $child->get_tag_values('Parent');
1817
delete $parents{$parentID};
1818
my @parents = keys %parents;
1820
$child->remove_tag('Parent');
1822
unless ($child->has_tag('ID')) {
1823
my $id = gene_name($child);
1824
$child->add_tag_value('ID', $id);
1825
push @{$hAll->{$id}}, $child
1828
$child->add_tag_value('Parent', @parents) if @parents;
1830
} @$aChildren and return unless scalar(@$aParents);
1832
my $par = join(',', map { $_->primary_tag } @$aParents);
1833
warn scalar(@$aParents)." POSSIBLE PARENT(S): $par" if $DEBUG;
1835
#NML manual curation needs to happen here
1838
my %parentsToRemove;
1841
for my $child (@$aChildren) {
1842
my $childType = $child->primary_tag;
1844
warn "WORKING ON $childType at ".$child->start.' to '.$child->end
1847
for (my $i = 0; $i < scalar(@$aParents); $i++) {
1848
my $parent = $aParents->[$i];
1849
my $parentType = $parent->primary_tag;
1851
warn "CHECKING $childType against $parentType" if $DEBUG;
1853
#cache descendants so we don't have to do repeat searches
1854
unless ($hDescendants->{$parentType}) {
1856
for my $term ($ONTOLOGY->find_terms(
1857
-name => $parentType
1861
$hDescendants->{$parentType}{$_->name}++
1862
} $ONTOLOGY->get_descendant_terms($term);
1866
# NML: hopefully temporary fix.
1867
# SO doesn't consider exon/CDS to be a child of mRNA
1868
# even though common knowledge dictates that they are
1869
# This cheat fixes the false positives for now
1870
if ($parentType eq 'mRNA') {
1871
$hDescendants->{$parentType}{'exon'} = 1;
1872
$hDescendants->{$parentType}{'CDS'} = 1;
1877
warn "\tCAN $childType at " . $child->start . ' to ' . $child->end .
1878
" be a child of $parentType?" if $DEBUG;
1879
if ($hDescendants->{$parentType}{$childType}) {
1880
warn "\tYES, $childType can be a child of $parentType" if $DEBUG;
1882
#NML need to deal with multiple children matched to multiple different
1883
#parents. This model only assumes the first parent id that matches a child will
1884
#be the reserved feature.
1886
$hReserved->{$parentID}{$parent}{'parent'} = $parent;
1887
push @{$hReserved->{$parentID}{$parent}{'children'}}, $child;
1889
#mark parent for later removal from all IDs
1890
#so we don't accidentally change any parents
1892
$parentsToRemove{$i}++;
1900
#NML shouldn't have to check this; somehow child can lose Parent
1901
#it's happening W3110
1902
#need to track this down
1903
if ( $child->has_tag('Parent') ) {
1905
warn "\tNO, @{[$child->primary_tag]} cannot be a child of $parentID"
1910
map { $parents{$_} = 1 } $child->get_tag_values('Parent');
1912
delete $parents{$parentID};
1913
my @parents = keys %parents;
1915
warn 'VALIDATION ERROR '.$child->primary_tag." at ".$child->start .
1916
' to ' . $child->end . " cannot be a child of ID $parentID"
1919
$child->add_tag_value( validation_error =>
1920
"feature cannot be a child of $parentID");
1922
$child->remove_tag('Parent');
1924
unless ($child->has_tag('ID')) {
1925
my $id = gene_name($child);
1926
$child->add_tag_value('ID', $id);
1927
push @{$hAll->{$id}}, $child
1930
$child->add_tag_value('Parent', @parents) if @parents;
1935
#delete $aParents->[$_] for keys %parentsToRemove;
1936
splice(@$aParents, $_, 1) for keys %parentsToRemove;
1940
my ($hAll, $hReserved) = @_;
1943
for my $id (keys %$hAll) {
1945
#since 1 feature can have this id,
1946
#let's just shift it off and uniquify
1947
#the rest (unless it's reserved)
1949
shift @{$hAll->{$id}} unless $hReserved->{$id};
1950
for my $feat (@{$hAll->{$id}}) {
1951
id_uniquify(0, $id, $feat, $hAll);
1955
for my $parentID (keys %$hReserved) {
1957
my @keys = keys %{$hReserved->{$parentID}};
1962
my $parent = $hReserved->{$parentID}{$k}{'parent'};
1963
my $aChildren= $hReserved->{$parentID}{$k}{'children'};
1965
my $value = id_uniquify(0, $parentID, $parent, $hAll);
1966
for my $child (@$aChildren) {
1969
map { $parents{$_}++ } $child->get_tag_values('Parent');
1970
$child->remove_tag('Parent');
1971
delete $parents{$parentID};
1973
my @parents = keys %parents;
1974
$child->add_tag_value('Parent', @parents);
1982
my ($count, $value, $feat, $hAll) = @_;
1984
warn "UNIQUIFYING $value" if $DEBUG;
1987
$feat->add_tag_value(Alias => $value);
1988
$value .= ('.' . $feat->primary_tag)
1989
} elsif ($count == 1) {
1997
warn "ENDED UP WITH $value" if $DEBUG;
1998
if ( $hAll->{$value} ) {
1999
warn "$value IS ALREADY TAKEN" if $DEBUG;
2000
id_uniquify($count, $value, $feat, $hAll);
2002
#warn "something's breaking ".$feat->primary_tag.' at '.$feat->start.' to '.$feat->end;
2003
$feat->remove_tag('ID');
2004
$feat->add_tag_value('ID', $value);
2005
push @{$hAll->{$value}}, $value;
2013
print "\nCannot read $CONF. Change file permissions and retry, " .
2014
"or enter another file\n" and conf_locate() unless -r $CONF;
2016
print "\nCannot write $CONF. Change file permissions and retry, " .
2017
"or enter another file\n" and conf_locate() unless -w $CONF;
2019
$YAML = LoadFile($CONF);
2025
my ($path, $input) = @_;
2027
print "Cannot write to $path. Change directory permissions and retry " .
2028
"or enter another save path\n" and conf_locate() unless -w $path;
2032
open(FH, '>', $CONF);
2039
sub conf_write { DumpFile($CONF, $YAML) }
2043
print "\nEnter the location of a previously saved config, or a new " .
2044
"path and file name to create a new config (this step is " .
2045
"necessary to save any preferences)";
2047
print "\n\nenter a command:";
2050
chomp(my $input = $_);
2051
my ($fn, $path, $suffix) = fileparse($input, qr/\.[^.]*/);
2053
if (-e $input && (! -d $input)) {
2055
print "\nReading $input...\n";
2061
} elsif (! -d $input && $fn.$suffix) {
2063
print "Creating $input...\n";
2064
conf_create($path, $input);
2067
} elsif (-e $input && -d $input) {
2068
print "You only entered a directory. " .
2069
"Please enter BOTH a directory and filename\n";
2071
print "$input does not appear to be a valid path. Please enter a " .
2072
"valid directory and filename\n";
2074
print "\nenter a command:";
2080
my ($feat, $input) = @_;
2082
#my $error = "Enter the location of a previously saved config, or a new " .
2083
#"path and file name to create a new config (this step is " .
2084
#"necessary to save any preferences)\n";
2089
} elsif (! -e $CONF) {
2090
print "\n\nThe config file you have chosen doesn't exist.\n";
2092
} else { conf_read() }
2094
my $entry = GenBank_entry($feat, "\r", 1);
2096
my $msg = "Term entered: $input";
2097
my $directions = "Please select any/all tags that provide evidence for the term you
2098
have entered. You may enter multiple tags by separating them by commas/dashes
2099
(e.g 1,3,5-7). For tags with more than one word value (i.e 'note'), you have
2100
the option of either selecting the entire note as evidence, or specific
2101
keywords. If a tag has multiple keywords, they will be tagged alphabetically
2102
for selection. To select a specific keyword in a tag field, you must enter the
2103
tag number followed by the keyword letter (e.g 3a). Multiple keywords may be
2104
selected by entering each letter separated by commas/dashes (e.g 3b,f,4a-c). The more tags you select, the more specific the GenBank entry will have
2105
to be to match your curation. To match the GenBank entry exactly as it
2106
appears, type every number (start-end), or just type 'all'. Remember, once the converter saves your
2107
preference, you will no longer be prompted to choose a feature type for any
2108
matching entries until you edit the curation.ini file.";
2109
my $msg_copy = $msg;
2110
my $dir_copy = $directions;
2112
my $format = "format STDOUT = \n" .
2114
'^' . '<' x 77 . '| Directions:' . "\n" .
2115
'$msg_copy' . "\n" .
2118
'^' . '<' x 77 . '| ^' . '<' x 75 . '~' . "\n" .
2119
'$entry' . ' ' x 74 . '$dir_copy,' . "\n" .
2120
(' ' x 15 . '^' . '<' x 62 . '| ^' . '<' x 75 . '~' . "\n" .
2121
' ' x 15 . '$entry,' . ' ' x 58 . '$dir_copy,' . "\n") x 20 . ".\n";
2124
# eval throws redefined warning that breaks formatting.
2125
# Turning off warnings just for the eval to fix this.
2131
print '-' x 156 . "\nenter a command:";
2133
my @tags = words_tag($feat);
2139
chomp(my $choice = $_);
2141
if (scalar(keys %$final) && $choice =~ /^y/i) { last
2143
} elsif (scalar(keys %$final) && $choice =~ /^n/i) { curation_save($feat, $input)
2145
} elsif (scalar(keys %$final)) { print "\nInvalid selection. Please try again\n";
2147
} elsif ($choice eq 'all') {
2150
for (my $i=1; $i < scalar(@tags); $i++) {
2155
# print "CHOICE [$choice]";
2158
for (split(/(?<=\w)[^[:alnum:]\-]+(?=\d)/, $choice)) {
2159
if ($_ =~ /(\d+)(?:\D*)-(\d+)(.*)/) {
2161
for ($1..$2) { push @selections, $_ }
2163
my $dangling_alphas = $3;
2164
alpha_expand($dangling_alphas, \@selections);
2167
alpha_expand($_, \@selections);
2171
foreach my $numbers (@selections) {
2173
my @c = split(/(?=[\w])/, $numbers);
2174
s/\W+//g foreach @c;
2179
$num = 0 + shift @c;
2182
my $tag = $tags[$num];
2183
if (ref $tag && scalar(@c)) {
2186
if (defined $tag->{$_}) {
2187
$choices .= "${num}[$_] ";
2188
my ($t,$v) = @{$tag->{$_}};
2189
push @{${$final->{$input}}[0]{$t}}, $v;
2191
} else { $no_value++ }
2195
_selection_add($tag,$final,$input,\$choices,$num);
2196
#my ($t,$v) = @{$tag->{'all'}};
2197
#unless (defined ${$final->{$input}}[0]{$t}) {
2198
#$choices .= "$num, ";
2199
#push @{${$final->{$input}}[0]{$t}}, $v
2203
$choices = substr($choices, 0, -2);
2206
} elsif (ref $tag) {
2207
_selection_add($tag,$final,$input,\$choices,$num);
2208
#my ($t,$v) = @{$tag->{'all'}};
2209
#unless (defined ${$final->{$input}}[0]{$t}) {
2210
#$choices .= "$num, ";
2211
#push @{${$final->{$input}}[0]{$t}}, $v
2215
$choices = substr($choices, 0, -2) if $choices;
2217
print "\nYou have chosen the following tags:\n$choices\n";
2218
print "This will be written to the config file as:\n";
2220
print "\nIs this correct? (y or n)\n";
2221
} else { print "\nInvalid selection. Please try again\n" }
2223
push @{$YAML->{$input}}, $final->{$input};
2227
# words_tag() splits each tag value string into multiple words so that the
2228
# user can select the parts he/she wants to use for curation
2229
# it can tag 702 (a - zz) separate words; this should be enough
2233
my ($feat, $entry) = @_;
2237
@tags[1,2] = ({'all' => ['primary_tag', $feat->primary_tag]}, {'all' => ['location', $feat->start.'..'.$feat->end]});
2239
foreach my $tag ($feat->all_tags) {
2240
foreach my $value ($feat->each_tag_value($tag)) {
2242
my ($string, $tagged_string);
2244
my @words = split(/(?=\w+?)/, $value);
2249
foreach my $word (@words) {
2251
(my $sanitized_word = $word) =~ s/\W+?//g;
2254
my $lead = int($pos/ALPHABET_DIVISOR);
2255
my $lag = $pos % ALPHABET_DIVISOR;
2257
my $a = $lead ? ${(ALPHABET)}[$lead-1] : '';
2258
$a .= $lag ? ${(ALPHABET)}[$lag] : 'a';
2260
$tagged_string .= " ($a) $word";
2262
$tags[$i]{$a} = [$tag, $sanitized_word];
2266
$value = $tagged_string if scalar(@words) > 1;
2268
$$entry .= "[$i] /$tag=\"$value\"\r";
2270
$tags[$i]{'all'} = [$tag, $string];
2281
my ($dangling_alphas, $selections) = @_;
2283
if (defined($dangling_alphas) && $dangling_alphas =~ /(\d)*([[:alpha:]]+)-([[:alpha:]]+)/) {
2286
push @$selections, $digit if $digit;
2291
my @starts = split('', $start);
2292
my @stops = split('', $stop);
2294
my ($final_start, $final_stop);
2296
for ([\$final_start, \@starts], [\$final_stop, \@stops]) {
2298
my ($final, $splits) = @$_;
2300
my $int = ${(ALPHABET_TO_NUMBER)}{$$splits[0]};
2305
$rem = ${(ALPHABET_TO_NUMBER)}{$$splits[1]};
2313
$$final = $int * ALPHABET_DIVISOR;
2318
my $last_number = pop @$selections;
2319
for my $pos ($final_start..$final_stop) {
2320
my $lead = int($pos/ALPHABET_DIVISOR);
2321
my $lag = $pos % ALPHABET_DIVISOR;
2322
my $alpha = $lead ? ${(ALPHABET)}[$lead-1] : '';
2323
$alpha .= $lag ? ${(ALPHABET)}[$lag] : 'a';
2324
push @$selections, $last_number.$alpha;
2327
} elsif (defined($dangling_alphas)) {
2328
if ($dangling_alphas =~ /^\d/) {
2329
push @$selections, $dangling_alphas;
2330
} elsif ($dangling_alphas =~ /^\D/) {
2331
#print "$dangling_alphas ".Dumper @$selections;
2332
my $last_number = pop @$selections;
2333
$last_number ||= '';
2334
push @$selections, $last_number.$dangling_alphas;
2335
#$$selections[-1] .= $dangling_alphas;
2341
sub _selection_add {
2343
my ($tag, $final, $input, $choices, $num) = @_;
2344
my ($t,$v) = @{$tag->{'all'}};
2345
unless (defined ${$final->{$input}}[0]{$t}) {
2346
$$choices .= "$num, ";
2347
push @{${$final->{$input}}[0]{$t}}, $v