98
97
use base qw(Bio::DB::WebDBSeqI Bio::Root::Root);
101
$MAX_ENTRIES = 19000;
102
$HOSTBASE = 'http://eutils.ncbi.nlm.nih.gov';
104
'batch' => ['post' => '/entrez/eutils/epost.fcgi'],
105
'query' => ['get' => '/entrez/eutils/efetch.fcgi'],
106
'single' => ['get' => '/entrez/eutils/efetch.fcgi'],
107
'version'=> ['get' => '/entrez/eutils/efetch.fcgi'],
108
'gi' => ['get' => '/entrez/eutils/efetch.fcgi'],
109
'webenv' => ['get' => '/entrez/eutils/efetch.fcgi']
112
%FORMATMAP = ( 'gb' => 'genbank',
115
'asn.1' => 'entrezgene',
116
'gbwithparts' => 'genbank',
118
$DEFAULTFORMAT = 'gb';
121
# the new way to make modules a little more lightweight
99
our $HOSTBASE = 'http://eutils.ncbi.nlm.nih.gov';
100
our $MAX_ENTRIES = 19000;
101
our $REQUEST_DELAY = 3;
103
'batch' => [ 'post' => '/entrez/eutils/epost.fcgi' ],
104
'query' => [ 'get' => '/entrez/eutils/efetch.fcgi' ],
105
'single' => [ 'get' => '/entrez/eutils/efetch.fcgi' ],
106
'version' => [ 'get' => '/entrez/eutils/efetch.fcgi' ],
107
'gi' => [ 'get' => '/entrez/eutils/efetch.fcgi' ],
108
'webenv' => [ 'get' => '/entrez/eutils/efetch.fcgi' ]
114
'asn.1' => 'entrezgene',
115
'gbwithparts' => 'genbank',
117
our $DEFAULTFORMAT = 'gb';
123
Function: the new way to make modules a little more lightweight
124
my ($class, @args ) = @_;
130
my ( $class, @args ) = @_;
125
131
my $self = $class->SUPER::new(@args);
126
my ($seq_start,$seq_stop,$no_redirect, $redirect, $complexity,$strand) =
127
$self->_rearrange([qw(SEQ_START SEQ_STOP NO_REDIRECT REDIRECT_REFSEQ COMPLEXITY STRAND)],
129
$seq_start && $self->seq_start($seq_start);
130
$seq_stop && $self->seq_stop($seq_stop);
131
$no_redirect && $self->no_redirect($no_redirect);
132
$redirect && $self->redirect_refseq($redirect);
133
$strand && $self->strand($strand);
134
# adjust statement to accept zero value
135
defined $complexity && ($complexity >=0 && $complexity <=4)
136
&& $self->complexity($complexity);
132
my ($seq_start, $seq_stop, $no_redirect,
133
$redirect, $complexity, $strand
136
[ qw(SEQ_START SEQ_STOP NO_REDIRECT REDIRECT_REFSEQ COMPLEXITY STRAND) ],
139
$seq_start && $self->seq_start($seq_start);
140
$seq_stop && $self->seq_stop($seq_stop);
141
$no_redirect && $self->no_redirect($no_redirect);
142
$redirect && $self->redirect_refseq($redirect);
143
$strand && $self->strand($strand);
145
# adjust statement to accept zero value
147
&& ( $complexity >= 0 && $complexity <= 4 )
148
&& $self->complexity($complexity);
181
193
sub get_request {
182
my ($self, @qualifiers) = @_;
183
my ($mode, $uids, $format, $query, $seq_start, $seq_stop, $strand, $complexity) =
184
$self->_rearrange([qw(MODE UIDS FORMAT QUERY SEQ_START SEQ_STOP STRAND COMPLEXITY)],
187
($format) = $self->request_format() unless ( defined $format);
188
if( !defined $mode || $mode eq '' ) { $mode = 'single'; }
189
my %params = $self->get_params($mode);
191
$self->throw("must specify a valid retrieval mode 'single' or 'batch' not '$mode'")
193
my $url = URI->new($HOSTBASE . $CGILOCATION{$mode}[1]);
194
unless( $mode eq 'webenv' || defined $uids || defined $query) {
195
$self->throw("Must specify a query or list of uids to fetch");
197
if ($query && $query->can('cookie')) {
198
@params{'WebEnv','query_key'} = $query->cookie;
199
$params{'db'} = $query->db;
202
$params{'id'} = join ',',$query->ids;
204
# for batch retrieval, non-query style
205
elsif ($mode eq 'webenv' && $self->can('cookie')) {
206
@params{'WebEnv','query_key'} = $self->cookie;
209
if( ref($uids) =~ /array/i ) {
210
$uids = join(",", @$uids);
212
$params{'id'} = $uids;
214
$seq_start && ($params{'seq_start'} = $seq_start);
215
$seq_stop && ($params{'seq_stop'} = $seq_stop);
216
$strand && ($params{'strand'} = $strand);
217
if (defined $complexity && ($seq_start || $seq_stop || $strand)) {
218
$self->warn("Complexity set to $complexity; seq_start and seq_stop may not work!")
219
if ($complexity != 1 && ($seq_start || $seq_stop));
220
$self->warn("Complexity set to 0; expect strange results with strand set to 2")
221
if ($complexity == 0 && $strand == 2 && $format eq 'fasta');
223
defined $complexity && ($params{'complexity'} = $complexity);
224
$params{'rettype'} = $format unless $mode eq 'batch';
225
# for now, 'post' is batch retrieval
226
if ($CGILOCATION{$mode}[0] eq 'post') {
227
my $response = $self->ua->request(POST $url,[%params]);
228
$response->proxy_authorization_basic($self->authentication)
229
if ( $self->authentication);
230
$self->_parse_response($response->content);
231
my ($cookie, $querykey) = $self->cookie;
232
my %qualifiers = ('-mode' => 'webenv',
233
'-seq_start' => $seq_start,
234
'-seq_stop' => $seq_stop,
235
'-strand' => $strand,
236
'-complexity' => $complexity,
237
'-format' => $format);
238
return $self->get_request(%qualifiers);
240
$url->query_form(%params);
194
my ( $self, @qualifiers ) = @_;
195
my ( $mode, $uids, $format, $query, $seq_start, $seq_stop, $strand,
198
[qw(MODE UIDS FORMAT QUERY SEQ_START SEQ_STOP STRAND COMPLEXITY)],
201
($format) = $self->request_format() unless ( defined $format );
202
if ( !defined $mode || $mode eq '' ) { $mode = 'single'; }
203
my %params = $self->get_params($mode);
206
"must specify a valid retrieval mode 'single' or 'batch' not '$mode'"
209
my $url = URI->new( $HOSTBASE . $CGILOCATION{$mode}[1] );
210
unless ( $mode eq 'webenv' || defined $uids || defined $query ) {
211
$self->throw("Must specify a query or list of uids to fetch");
213
if ( $query && $query->can('cookie') ) {
214
@params{ 'WebEnv', 'query_key' } = $query->cookie;
215
$params{'db'} = $query->db;
218
$params{'id'} = join ',', $query->ids;
221
# for batch retrieval, non-query style
222
elsif ( $mode eq 'webenv' && $self->can('cookie') ) {
223
@params{ 'WebEnv', 'query_key' } = $self->cookie;
226
if ( ref($uids) =~ /array/i ) {
227
$uids = join( ",", @$uids );
229
$params{'id'} = $uids;
231
$seq_start && ( $params{'seq_start'} = $seq_start );
232
$seq_stop && ( $params{'seq_stop'} = $seq_stop );
233
$strand && ( $params{'strand'} = $strand );
234
if ( defined $complexity && ( $seq_start || $seq_stop || $strand ) ) {
236
"Complexity set to $complexity; seq_start and seq_stop may not work!"
237
) if ( $complexity != 1 && ( $seq_start || $seq_stop ) );
239
"Complexity set to 0; expect strange results with strand set to 2"
240
) if ( $complexity == 0 && $strand == 2 && $format eq 'fasta' );
242
defined $complexity && ( $params{'complexity'} = $complexity );
243
$params{'rettype'} = $format unless $mode eq 'batch';
245
# for now, 'post' is batch retrieval
246
if ( $CGILOCATION{$mode}[0] eq 'post' ) {
247
my $response = $self->ua->request( POST $url, [%params] );
248
$response->proxy_authorization_basic( $self->authentication )
249
if ( $self->authentication );
250
$self->_parse_response( $response->content );
251
my ( $cookie, $querykey ) = $self->cookie;
254
'-seq_start' => $seq_start,
255
'-seq_stop' => $seq_stop,
256
'-strand' => $strand,
257
'-complexity' => $complexity,
260
return $self->get_request(%qualifiers);
263
$url->query_form(%params);
245
269
=head2 get_Stream_by_batch
247
271
Title : get_Stream_by_batch
248
272
Usage : $seq = $db->get_Stream_by_batch($ref);
249
273
Function: Retrieves Seq objects from Entrez 'en masse', rather than one
250
274
at a time. For large numbers of sequences, this is far superior
251
to get_Stream_by_[id/acc]().
275
to get_Stream_by_id or get_Stream_by_acc.
253
277
Returns : a Bio::SeqIO stream object
254
278
Args : $ref : either an array reference, a filename, or a filehandle
255
279
from which to get the list of unique ids/accession numbers.
257
NOTE: deprecated API. Use get_Stream_by_id() instead.
281
NOTE: deprecated API. Use get_Stream_by_id() instead.
366
391
Returns : value from 0-4 indicating level of complexity
367
392
Args : value from 0-4 (optional); if unset server assumes 1
368
393
Throws : if arg is not an integer or falls outside of noted range above
369
Note : From efetch docs:
371
Complexity regulates the display:
373
* 0 - get the whole blob
374
* 1 - get the bioseq for gi of interest (default in Entrez)
375
* 2 - get the minimal bioseq-set containing the gi of interest
376
* 3 - get the minimal nuc-prot containing the gi of interest
377
* 4 - get the minimal pub-set containing the gi of interest
394
Note : From efetch docs, the complexity regulates the display:
396
0 - get the whole blob
397
1 - get the bioseq for gi of interest (default in Entrez)
398
2 - get the minimal bioseq-set containing the gi of interest
399
3 - get the minimal nuc-prot containing the gi of interest
400
4 - get the minimal pub-set containing the gi of interest
382
my ($self, $comp) = @_;
384
$self->throw("Complexity value must be integer between 0 and 4") if
385
$comp !~ /^\d+$/ || $comp < 0 || $comp > 4;
405
my ( $self, $comp ) = @_;
406
if ( defined $comp ) {
407
$self->throw("Complexity value must be integer between 0 and 4")
408
if $comp !~ /^\d+$/ || $comp < 0 || $comp > 4;
386
409
$self->{'_complexity'} = $comp;
388
411
return $self->{'_complexity'};
472
495
sub get_Stream_by_acc {
473
my ($self, $ids ) = @_;
496
my ( $self, $ids ) = @_;
474
497
my $newdb = $self->_check_id($ids);
475
if (defined $newdb && ref($newdb) && $newdb->isa('Bio::DB::RefSeq')) {
476
return $newdb->get_seq_stream('-uids' => $ids, '-mode' => 'single');
478
return $self->get_seq_stream('-uids' => $ids, '-mode' => 'single');
498
if ( defined $newdb && ref($newdb) && $newdb->isa('Bio::DB::RefSeq') ) {
499
return $newdb->get_seq_stream( '-uids' => $ids, '-mode' => 'single' );
502
return $self->get_seq_stream( '-uids' => $ids, '-mode' => 'single' );
485
508
Title : _check_id
488
Returns : A Bio::DB::RefSeq reference or throws
511
Returns : a Bio::DB::RefSeq reference or throws
489
512
Args : $id(s), $string
494
my ($self, $ids) = @_;
496
# NT contigs can not be retrieved
497
$self->throw("NT_ contigs are whole chromosome files which are not part of regular".
498
"database distributions. Go to ftp://ftp.ncbi.nih.gov/genomes/.")
501
# Asking for a RefSeq from EMBL/GenBank
503
if ($self->redirect_refseq) {
505
$self->warn("[$ids] is not a normal sequence database but a RefSeq entry.".
506
" Redirecting the request.\n")
507
if $self->verbose >= 0;
508
return Bio::DB::RefSeq->new();
517
my ( $self, $ids ) = @_;
519
# NT contigs can not be retrieved
520
$self->throw("NT_ contigs are whole chromosome files which are not part of regular"
521
. "database distributions. Go to ftp://ftp.ncbi.nih.gov/genomes/.")
524
# Asking for a RefSeq from EMBL/GenBank
525
if ( $self->redirect_refseq ) {
526
if ( $ids =~ /N._/ ) {
528
"[$ids] is not a normal sequence database but a RefSeq entry."
529
. " Redirecting the request.\n" )
530
if $self->verbose >= 0;
531
return Bio::DB::RefSeq->new();
513
537
=head2 delay_policy
515
539
Title : delay_policy
516
540
Usage : $secs = $self->delay_policy
517
Function: return number of seconds to delay between calls to remote db
541
Function: NCBI requests a delay of 3 seconds between requests. This method
542
implements that policy.
518
543
Returns : number of seconds to delay
521
NOTE: NCBI requests a delay of 3 seconds between requests. This method
522
implements that policy.
526
548
sub delay_policy {
550
return $REQUEST_DELAY;
534
556
Usage : ($cookie,$querynum) = $db->cookie
535
Function: return the NCBI query cookie
557
Function: return the NCBI query cookie; this information is used by
558
Bio::DB::GenBank in conjunction with efetch, ripped from
559
Bio::DB::Query::GenBank
536
560
Returns : list of (cookie,querynum)
539
NOTE: this information is used by Bio::DB::GenBank in
540
conjunction with efetch.
544
# ripped from Bio::DB::Query::GenBank
548
$self->{'_cookie'} = shift;
549
$self->{'_querynum'} = shift;
552
return @{$self}{qw(_cookie _querynum)};
568
$self->{'_cookie'} = shift;
569
$self->{'_querynum'} = shift;
572
return @{$self}{qw(_cookie _querynum)};
556
576
=head2 _parse_response
558
578
Title : _parse_response
559
579
Usage : $db->_parse_response($content)
560
Function: parse out response for cookie
580
Function: parse out response for cookie; this is a trimmed-down version
581
of _parse_response from Bio::DB::Query::GenBank
563
584
Throws : 'unparseable output exception'
567
# trimmed-down version of _parse_response from Bio::DB::Query::GenBank
568
588
sub _parse_response {
571
if (my ($warning) = $content =~ m!<ErrorList>(.+)</ErrorList>!s) {
572
$self->warn("Warning(s) from GenBank: $warning\n");
574
if (my ($error) = $content =~ /<OutputMessage>([^<]+)/) {
575
$self->throw("Error from Genbank: $error");
577
my ($cookie) = $content =~ m!<WebEnv>(\S+)</WebEnv>!;
578
my ($querykey) = $content =~ m!<QueryKey>(\d+)!;
579
$self->cookie(uri_unescape($cookie),$querykey);
591
if ( my ($warning) = $content =~ m!<ErrorList>(.+)</ErrorList>!s ) {
592
$self->warn("Warning(s) from GenBank: $warning\n");
594
if ( my ($error) = $content =~ /<OutputMessage>([^<]+)/ ) {
595
$self->throw("Error from Genbank: $error");
597
my ($cookie) = $content =~ m!<WebEnv>(\S+)</WebEnv>!;
598
my ($querykey) = $content =~ m!<QueryKey>(\d+)!;
599
$self->cookie( uri_unescape($cookie), $querykey );
582
########### DEPRECATED!!!! ###########
584
602
=head2 no_redirect
586
604
Title : no_redirect
587
605
Usage : $db->no_redirect($content)
588
Function: Used to indicate that Bio::DB::GenBank instance retrieves
606
Function: DEPRECATED - Used to indicate that Bio::DB::GenBank instance retrieves
589
607
possible RefSeqs from EBI instead; default behavior is now to
590
608
retrieve directly from NCBI