1
# $Id: elink.pm,v 1.24.4.3 2006/11/10 16:48:09 cjfields Exp $
3
# BioPerl module for Bio::DB::EUtilities::elink
5
# Cared for by Chris Fields <cjfields at uiuc dot edu>
7
# Copyright Chris Fields
9
# You may distribute this module under the same terms as perl itself
11
# POD documentation - main docs before the code
13
# Part of the EUtilities BioPerl package
17
Bio::DB::EUtilities::elink - check for and retrieve external or related ID's
18
from a list of one or more primary ID's, including relevancy scores.
22
B<Do not use this module directly.> Use it via the
23
L<Bio::DB::EUtilities|Bio::DB::EUtilities> class.
25
# chain EUtilities for complex queries
27
use Bio::DB::EUtilities;
29
my $esearch = Bio::DB::EUtilities->new(-eutil => 'esearch',
34
$esearch->get_response; # parse the response, fetch a cookie
36
my $elink = Bio::DB::EUtilities->new(-eutil => 'elink',
37
-db => 'protein,taxonomy',
39
-cookie => $esearch->next_cookie,
42
# this retrieves the Bio::DB::EUtilities::ElinkData object
44
my ($linkset) = $elink->next_linkset;
47
# step through IDs for each linked database in the ElinkData object
49
for my $db ($linkset->get_databases) {
50
@ids = $linkset->get_LinkIds_by_db($db); #returns primary ID's
54
# multiple ID groups (for one-to-one-correspondence of IDs)
56
my $elink = Bio::DB::EUtilities->new(-eutil => 'elink',
59
-id => [\@id1, @ids2],
63
while (my $linkset = $elink->next_linkset) {
64
for my $db ($linkset->get_databases) {
65
my @ids = $linkset->get_LinkIds_by_db($db); #returns primary ID's
70
# to retrieve scores for a linkset
72
while (my $linkset = $elink->next_linkset) {
73
my @score_dbs = $linkset->has_scores; # retrieve databases with score values
74
for my $db (@score_dbs) {
75
my @ids = $linkset->get_LinkIds_by_db($db); #returns primary ID's
76
$linkset->set_score_db($db); # to current database containing scores
78
my $score = $linkset->get_score($id);
79
# do something here, like screen for IDs based on score
84
# or just receive a hash containing ID-score key-value pairs
86
while (my $linkset = $elink->next_linkset) {
87
my @score_dbs = $linkset->has_scores;
88
for my $db (@score_dbs) {
89
$linkset->set_score_db($db);
90
%scores = $linkset->get_score_hash;
96
B<WARNING>: Please do B<NOT> spam the Entrez web server with multiple requests.
98
The EUtility Elink is used to check for and retrieve external or related ID's
99
from a list of one or more primary ID's. Using the C<cmd> parameter, one can
100
vary the returned data. See the below command options for explanations on
101
returned XML output. For certain command options one can retrieve one or more
102
L<Bio::DB::EUtilities::Cookie|Bio::DB::EUtilities::Cookie> objects to be used in
103
other EUtility searches or efetch primary IDs. Others will return the ID
104
information and relevancy scores in one or more
105
L<Bio::DB::EUtilities::ElinkData|Bio::DB::EUtilities::ElinkData> objects.
107
=head2 NCBI ELink Parameters
109
The following is a general list of parameters that can be used to take
110
advantage of ELink. Up-to-date help for ELink is available at this URL
111
(the information below is a summary of the options found there):
113
http://eutils.ncbi.nlm.nih.gov/entrez/query/static/elink_help.html
119
One or more database available through EUtilities. If set to 'all', will
120
retrieve all relevant information from each database based on the C<cmd>
121
parameter (the default setting is to retrieve related primary ID's). One
122
interesting behaviour is when C<db> and C<dbfrom> are set to the same database;
123
related IDs from database are retrieved along with a relevancy score. This
124
score differs from database to database; if protein-protein elinks are sought,
125
the scores are generated from BLASTP
129
originating database; useful only if using directly when querying with ID's
133
a list of primary ID's
135
Below is a list of IDs which can be used with ELink:
137
B<PMID> (pubmed), B<MIM number> (omim), B<GI number> (nucleotide, protein),
138
B<Genome ID> (genome), B<Popset ID> (popset), B<SNP cluster ID> (snp),
139
B<UniSTS ID> (unists), B<UniGene cluster ID> (unigene), B<MMDB-ID> (structure),
140
B<PSSM-ID> (cdd), B<3D SDI> (domains), B<TAXID> (taxonomy), B<GEO ID> (geo)
144
limits results to the number of days preceding today's date
146
=item C<mindate>, C<maxdate>
148
limits results by dates (C<yyyy/mm/dd> format, or by year)
152
limits results by Entrez query (only valid when C<cmd=neighbor> within a single
157
set to XML, but can be changed to ref when needed
161
command values (see below)
165
list LinkOut URLs for specified holding provider; used with C<cmd=llinks>
166
or C<cmd=llinkslib> (rarely used)
170
=head2 Additional (Bioperl-related) Parameters
172
The following are a general list of parameters that can be used to take
173
advantage of ELink. Up-to-date help for ELink is available at this URL
174
(the information below is a summary of the options found there):
176
http://eutils.ncbi.nlm.nih.gov/entrez/query/static/elink_help.html
182
The relevant EUtility to be used (elink).
186
Uses a L<Cookie|Bio::DB::EUtilities::Cookie>-based search (see below)
190
Sets a flag to treat the ID data (C<id> parameter) as multiple ID groups (see
193
=item C<keep_cookies>
195
Sets a flag to retain the cookie queue (this is normally cleared
200
=head2 Command Values
202
Command values are set using the C<cmd> parameter.
208
List the hyperlink to the primary LinkOut provider for multiple IDs and
209
database. Each ID is processed separately.
211
=item C<prlinks&retmode=ref>
213
Create a hyperlink to the primary LinkOut provider for a single ID and database.
217
List LinkOut URLs and Attributes, except PubMed libraries, for multiple IDs
218
and database. Each ID is processed separately.
222
List LinkOut URLs and Attributes for multiple IDs and database. Each ID is
223
processed separately.
227
Check for the existence (Y or N) of an external link for multiple IDs and
232
Check for the existence of a neighbor link for each ID within a database,
233
e.g., Related Articles in PubMed.
237
The default setting. Display neighbors and their scores within a database.
238
This module will parse XML output from an ELink query and will return a
239
L<Bio::DB::EUtilities::ElinkData> object, which contains IDs for every database
240
linked to using C<db> (see C<id> and C<db> for more details).
242
=item C<neighbor_history>
244
Create history (WebEnv & query_key) for use in other EUtilities.
248
Lists Entrez databases links for multiple IDs from a single database.
254
Some EUtilities (C<epost>, C<esearch>, or C<elink>) are able to retain information on
255
the NCBI server under certain settings. This information can be retrieved by
256
using a B<cookie>. Here, the idea of the 'cookie' is similar to the 'cookie' set
257
on a user's computer when browsing the Web. XML data returned by these
258
EUtilities, when applicable, is parsed for the cookie information (the 'WebEnv'
259
and 'query_key' tags to be specific). The information along with other identifying
260
data (such as the calling eutility, description of query, etc.) is stored as a
261
L<Bio::DB::EUtilities::Cookie|Bio::DB::EUtilities::Cookie> object in an internal
262
queue. These can be retrieved one at a time by using the next_cookie method or
263
all at once in an array using get_all_cookies. Each cookie can then be 'fed',
264
one at a time, to another EUtility object, thus enabling chained queries as
265
demonstrated in the synopsis.
267
For more information, see the POD documentation for
268
L<Bio::DB::EUtilities::Cookie|Bio::DB::EUtilities::Cookie>.
270
=head2 ElinkData Objects
272
Due to the diversity of information that can be returned via elink, a special
273
object (ElinkData) has been created to hold data parsed from the XML output. This
274
object holds returned IDs, scores, and potentially additional data as the need
275
arises. ElinkData objects are stored in an internal queue much like for Cookie
276
objects; similarly, they can be accessed using L</next_linkset> and
277
L</get_all_linksets>. If a simple search is initiated, where one database is
278
queried using one set of IDs, the default EUtilities method C<get_ids> can be
279
used to retrieve the IDs. If more than one database is specified for a single
280
set of IDs, (such as when C<db> is set to 'all' or a comma-separated list, like
281
'protein,taxonomy'), the database must be passed explicitly to C<get_ids> as an
282
argument to retrieve the relevant IDs.
284
The most complicated situation comes when using multiple ID groups (see below).
285
This requires that each ID group have a separate set of data (a linkset), each
286
with potential multiple databases, multiple IDs, and so on. Linkset data is
287
stored in a special object
288
(L<Bio::DB::EUtilities::ElinkData|Bio::DB::EUtilities::ElinkData>).
290
For more information, see the POD documentation for
291
L<Bio::DB::EUtilities::ElinkData|Bio::DB::EUtilities::ElinkData>.
295
=head2 Complex queries
297
Chaining queries for retrieving related data using elink and other EUtilities is
298
now possible (see the L</"SYNOPSIS"> for an example). For instance, one can
299
grab a large number of taxon IDs using protein/nucleotide IDs; these can be
300
retrieved directly or saved on the server (setting C<cmd> to 'neighbor_history'),
301
and the cookie passed on to efetch.
303
=head2 Retrieving relevancy scores
305
When the C<db> and C<dbfrom> parameters are set to the same database, one can
306
retrieve relevancy scores for a single ID. These are based on several different
307
factors. For proteins, they are precomputed BLASTP scores, so this is actually
308
a quick way to get the best hits without having to run BLASTP directly!
309
Similarly, scores returned for nucleotide-nucleotide are based on BLASTN scores.
311
=head2 Multiple ID groups
313
When the C<multi_id> flag is set to a TRUE value, the id list is built based on
314
a different set of factors. The default method for submitting an ID list for
315
a query request for any EUtility is by having the C<id> parameter set to
316
an array reference (multiple IDs) or pass a single ID as a scalar, like this:
321
L<Bio::DB::EUtilities::elink|Bio::DB::EUtilities::elink> has the additional
322
capability to submit ID groups where searches are performed on each ID group
323
independently. This is accomplished by setting the C<multi_id> flag to true,
324
which indicates that the ID list will be evaluated as an array reference, with
325
each ID group represented by another array reference or a single ID. So, with
326
C<multi_id> set to TRUE:
328
-id => \@ids, # evaluates each ID in the array independently
330
-id => [@ids], # same as above
332
-id => [\@ids, $id], # IDs in @ids are grouped together for one search
333
# while single ID in scalar is searched independently
337
-id => [\@ids, $id1, @ids2], # @ids IDs grouped together; IDs in $id1 and @ids2
338
# are flattened and evaluated independently
340
This enables one-to-one correspondence with the returned data, so that one
341
can determine, per ID, what the matching ELink ID is. The default is to
342
return them all as a group (no one-to-one correspondence). Using a small ID
343
array, C<multi_id> set to TRUE, '-id =E<gt> \@ids', and this loop:
345
while (my $linkset = $elink->next_linkset) {
346
print "Query ID : ",join q(,), $linkset->query_id,"\n";
347
print "\tTax ID : ",join q(,), $linkset->get_LinkIds_by_db('taxonomy'),"\n";
359
Setting C<multi_id> to FALSE or not setting, using all other conditions above,
362
Query ID : 31792573,31618162,1621261,
363
Tax ID : 233413,83332,
369
User feedback is an integral part of the
370
evolution of this and other Bioperl modules. Send
371
your comments and suggestions preferably to one
372
of the Bioperl mailing lists. Your participation
375
bioperl-l@lists.open-bio.org - General discussion
376
http://www.bioperl.org/wiki/Mailing_lists - About the mailing lists
378
=head2 Reporting Bugs
380
Report bugs to the Bioperl bug tracking system to
381
help us keep track of the bugs and their resolution.
382
Bug reports can be submitted via the web.
384
http://bugzilla.open-bio.org/
388
Email cjfields at uiuc dot edu
392
The rest of the documentation details each of the
393
object methods. Internal methods are usually
398
# Let the code begin...
400
package Bio::DB::EUtilities::elink;
405
use Bio::DB::EUtilities::Cookie;
406
use Bio::DB::EUtilities::ElinkData;
410
use base qw(Bio::DB::EUtilities);
412
our $EUTIL = 'elink';
413
our $DTDVERSION = '1';
414
# cmd parameter options; these haven't been mapped yet
416
our %CMD = ('prlinks' => 1,
422
'neighbor_history' => 1,
427
my ($self, @args ) = @_;
428
$self->SUPER::_initialize(@args);
429
# Unpack the named constructor arguments. The key list handed to _rearrange
# must stay in the same order as the variables on the left-hand side:
# RETMODE was missing from it, which shifted the -linkname value into
# $retmode and left $linkname permanently undefined.
my ($term, $field, $reldate, $mindate, $maxdate, $datetype, $multi_id,
    $retstart, $retmax, $report, $dbfrom, $cmd, $holding, $version,
    $retmode, $linkname) =
    $self->_rearrange([qw(TERM FIELD RELDATE MINDATE MAXDATE DATETYPE MULTI_ID
                          RETSTART RETMAX REPORT DBFROM CMD HOLDING VERSION
                          RETMODE LINKNAME)], @args);
434
$self->_eutil($EUTIL);
435
# defaults which can be overridden
436
# Note : retmode should be 'xml' for all elink queries except when cmd=prlinks
437
$datetype ||= 'mdat';
438
$self->datetype($datetype);
439
$version ||= $DTDVERSION; # DTD to use, should leave alone
440
$self->version($version);
442
$term && $self->term($term);
443
$field && $self->field($field);
444
$reldate && $self->reldate($reldate);
445
$mindate && $self->mindate($mindate);
446
$maxdate && $self->maxdate($maxdate);
447
$retstart && $self->retstart($retstart);
448
$retmax && $self->retmax($retmax);
449
$report && $self->report($report);
450
$dbfrom && $self->dbfrom($dbfrom);
451
# validate cmd, otherwise don't set
452
$cmd && exists $CMD{$cmd} && $self->cmd($cmd);
453
$holding && $self->holding($holding);
454
$linkname && $self->linkname($linkname);
455
$multi_id && $self->multi_id($multi_id);
456
$self->{'_linksetindex'} = 0;
457
$self->{'_linksets'} = [];
458
$self->{'_ls_ct'} = 0;
461
=head2 parse_response
463
Title : parse_response
464
Usage : $elink->parse_response($content)
465
Function: parse out response for cookie and/or id's
467
Args : HTTP::Response object
468
Throws : 'NCBI elink nonrecoverable error'
474
# to add: parsing for dbfrom/dbto ids, tagging cookies with databases
476
# Take the response argument unconditionally: "my $x = ... if COND" is
# documented in perlsyn as having undefined behaviour, and the check that
# follows already rejects a missing argument.
my $response = shift;
477
if (!$response || !$response->isa("HTTP::Response")) {
478
$self->throw("Need HTTP::Response object");
480
my $xs = XML::Simple->new();
481
my $simple = $xs->XMLin($response->content,
482
forcearray => [qw(LinkSet LinkSetDb LinkSetDbHistory Link)]);
484
if (exists $simple->{ERROR}) {
485
$self->throw("NCBI elink nonrecoverable error: ".$simple->{ERROR});
487
#$self->debug("Response dumper:\n".Dumper($simple));
488
my $cmd = $self->cmd ? $self->cmd : 'neighbor'; # set default cmd
489
# process possible cookies first
490
if (defined($cmd) && $cmd eq 'neighbor_history') {
491
# process each LinkSet hash, one at a time;
492
# No scores when using history (only ids)
493
if (!exists $simple->{LinkSet} ) {
494
$self->warn('No link history');
496
for my $linkset (@{ $simple->{LinkSet} }) {
497
my $webenv = $linkset->{WebEnv};
498
my $dbfrom = $linkset->{DbFrom};
499
my $from_ids = $linkset->{IdList}->{Id};
500
if (!ref($from_ids)) {
504
for my $history (@{ $linkset->{LinkSetDbHistory} }) {
505
my $query_key = $history->{QueryKey};
506
# Skip histories that have no query key or are flagged as empty. "exists"
# binds tighter than "eq", so the previous test compared the boolean result
# of exists with the string and could never match 'Empty result'.
next if ( !$query_key
    || ( defined $history->{Info} && $history->{Info} eq 'Empty result' ) );
507
my $lname = $history->{LinkName};
508
my $db = $history->{DbTo};
509
my $cookie = Bio::DB::EUtilities::Cookie->new(
510
-verbose => $self->verbose,
512
-querykey => $query_key,
516
-query_id => $from_ids,
519
$self->add_cookie($cookie);
524
elsif ($cmd eq 'neighbor' || !$cmd) {
525
if (!exists $simple->{LinkSet}) {
526
$self->warn('No returned links.');
529
for my $linkset (@{ $simple->{LinkSet} }) {
530
my $linkobj = Bio::DB::EUtilities::ElinkData->new
531
(-verbose => $self->verbose,
533
my $status = $linkobj->_add_set($linkset);
534
$self->_add_linkset($linkobj) if $status;
537
$self->debug("$cmd not yet supported; no parsing occurred");
539
# need to add a few things for cmd=llinks
546
Usage : $elink->multi_id(1);
547
Function: gets/sets value (switch for using multiple ids)
548
Returns : Boolean (value evaluating to true or false)
549
Args : Boolean (value evaluating to true or false)
555
return $self->{'_multi_id'} = shift if @_;
556
return $self->{'_multi_id'};
562
Usage : $ls = $elink->next_linkset;
563
Function: returns next linkset in internal cache of
564
: Bio::DB::EUtilities::ElinkData objects
565
Returns : next Bio::DB::EUtilities::ElinkData object in the internal cache
566
Args : none
572
# Fetch the position to read; _next_linkset_index post-increments the
# stored index, so each call moves on to the following linkset.
my $index = $self->_next_linkset_index;
# Compare against the number of cached linksets. The array reference has to
# be dereferenced first: scalar() applied to the reference itself yields its
# address in numeric context, so the old bounds check never triggered.
return if $index >= scalar @{ $self->{'_linksets'} };
return $self->{'_linksets'}->[$index];
577
=head2 get_all_linksets
579
Title : get_all_linksets
580
Usage : @ls = $elink->get_all_linksets;
581
Function: returns array of Bio::DB::EUtilities::ElinkData objects
582
Returns : array or array ref of Bio::DB::EUtilities::ElinkData objects
588
sub get_all_linksets {
590
return @{ $self->{'_linksets'} } if wantarray;
591
return $self->{'_linksets'};
594
=head2 reset_linksets
596
Title : reset_linksets
597
Usage : $elink->reset_linksets;
598
Function: resets (empties) internal cache of Linkset objects
606
$self->{'_linksets'} = [];
607
$self->rewind_linksets;
608
$self->{'_ls_ct'} = 0;
611
=head2 rewind_linksets
613
Title : rewind_linksets
614
Usage : $elink->rewind_linksets;
615
Function: resets linkset index to 0 (starts over)
623
$self->{'_linksetindex'} = 0;
626
=head2 get_linkset_count
628
Title : get_linkset_count
629
Usage : $ct = $elink->get_linkset_count;
630
Function: returns total # of linksets in Elink object
631
Returns : Integer (# linksets)
636
sub get_linkset_count {
638
return $self->{'_ls_ct'};
641
# holds and changes linkset index for next_linkset
643
sub _next_linkset_index {
645
return $self->{'_linksetindex'}++;
648
# private method : parse linkset data and add ElinkData objects to linkset cache
653
my $data_links = shift;
654
$self->throw("Expecting a Bio::DB::EUtilities::ElinkData, got $data_links.")
655
unless $data_links->isa("Bio::DB::EUtilities::ElinkData");
656
push @{ $self->{'_linksets'} }, $data_links;