/* Handling of recursive HTTP retrieving.
   Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.

This file is part of GNU Wget.

GNU Wget is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

GNU Wget is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Wget; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

In addition, as a special exception, the Free Software Foundation
gives permission to link the code of its release of Wget with the
OpenSSL project's "OpenSSL" library (or with modified versions of it
that use the same license as the "OpenSSL" library), and distribute
the linked executables.  You must obey the GNU General Public License
in all respects for all of the code used other than "OpenSSL".  If you
modify this file, you may extend this exception to your version of the
file, but you are not obligated to do so.  If you do not wish to do
so, delete this exception statement from your version. */
#ifdef HAVE_STRING_H
# include <string.h>
#endif /* HAVE_STRING_H */
#ifdef HAVE_UNISTD_H
# include <unistd.h>
#endif /* HAVE_UNISTD_H */

#include <sys/types.h>
extern char *version_string;
extern LARGE_INT total_downloaded_bytes;

extern struct hash_table *dl_url_file_map;
extern struct hash_table *downloaded_html_set;
/* Functions for maintaining the URL queue.  */

struct queue_element {
  const char *url;              /* the URL to download */
  const char *referer;          /* the referring document */
  int depth;                    /* the depth */
  unsigned int html_allowed :1; /* whether the document is allowed to
                                   be treated as HTML. */

  struct queue_element *next;   /* next element in queue */
};

struct url_queue {
  struct queue_element *head;
  struct queue_element *tail;
  int count, maxcount;
};
/* Create a URL queue. */

static struct url_queue *
  struct url_queue *queue = xmalloc (sizeof (*queue));
  memset (queue, '\0', sizeof (*queue));

/* Delete a URL queue. */

url_queue_delete (struct url_queue *queue)

/* Enqueue a URL in the queue.  The queue is FIFO: the items will be
   retrieved ("dequeued") from the queue in the order they were placed
   into it.  */

url_enqueue (struct url_queue *queue,
             const char *url, const char *referer, int depth, int html_allowed)
  struct queue_element *qel = xmalloc (sizeof (*qel));
  qel->referer = referer;
  qel->html_allowed = html_allowed;

  if (queue->count > queue->maxcount)
    queue->maxcount = queue->count;

  DEBUGP (("Enqueuing %s at depth %d\n", url, depth));
  DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));

  queue->tail->next = qel;

  queue->head = queue->tail;
/* Take a URL out of the queue.  Return 1 if this operation succeeded,
   or 0 if the queue is empty. */

url_dequeue (struct url_queue *queue,
             const char **url, const char **referer, int *depth,
  struct queue_element *qel = queue->head;

  queue->head = queue->head->next;

  *referer = qel->referer;
  *html_allowed = qel->html_allowed;

  DEBUGP (("Dequeuing %s at depth %d\n", qel->url, qel->depth));
  DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));
static int download_child_p PARAMS ((const struct urlpos *, struct url *, int,
                                     struct url *, struct hash_table *));
static int descend_redirect_p PARAMS ((const char *, const char *, int,
                                       struct url *, struct hash_table *));
/* Retrieve a part of the web beginning with START_URL.  This used to
   be called "recursive retrieval", because the old function was
   recursive and implemented depth-first search.  retrieve_tree on the
   other hand implements breadth-first traversal of the tree, which
   results in much nicer ordering of downloads.

   The algorithm this function uses is simple:

   1. put START_URL in the queue.
   2. while there are URLs in the queue:

     3. get next URL from the queue.
     4. download it.
     5. if the URL is HTML and its depth does not exceed maximum depth,
        get the list of URLs embedded therein.
     6. for each of those URLs do the following:

       7. if the URL is not one of those downloaded before, and if it
          satisfies the criteria specified by the various command-line
          options, add it to the queue.

   A condensed, illustrative sketch of this loop follows below.  */
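/* Illustrative sketch, not part of the original source: a condensed
   rendering of the numbered steps above.  The helpers process_download()
   and should_enqueue_child() are hypothetical stand-ins for the detailed
   logic in retrieve_tree and download_child_p below; everything else
   (url_queue_new, url_enqueue, url_dequeue, url_queue_delete,
   get_urls_html, free_urlpos, xstrdup, opt.reclevel) comes from this
   file and its headers.  Kept under "#if 0" so it is never compiled.  */
#if 0
static void
breadth_first_outline (const char *start_url)
{
  struct url_queue *queue = url_queue_new ();
  char *url, *referer;
  int depth, html_allowed;

  url_enqueue (queue, xstrdup (start_url), NULL, 0, 1);      /* step 1 */
  while (url_dequeue (queue,                                 /* steps 2-3 */
                      (const char **)&url, (const char **)&referer,
                      &depth, &html_allowed))
    {
      char *file = process_download (url, referer);          /* step 4 (hypothetical helper) */
      if (html_allowed && file && depth < opt.reclevel)      /* step 5 */
        {
          int meta_disallow_follow = 0;
          struct urlpos *child, *children
            = get_urls_html (file, url, &meta_disallow_follow);
          for (child = children; child; child = child->next) /* step 6 */
            if (should_enqueue_child (child, url, depth))    /* step 7 (hypothetical helper) */
              url_enqueue (queue, xstrdup (child->url->url),
                           xstrdup (url), depth + 1,
                           child->link_expect_html);
          free_urlpos (children);
        }
      xfree (url);
      FREE_MAYBE (referer);
    }
  url_queue_delete (queue);
}
#endif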
retrieve_tree (const char *start_url)
  uerr_t status = RETROK;

  /* The queue of URLs we need to load. */
  struct url_queue *queue;

  /* The URLs we do not wish to enqueue, because they are already in
     the queue, but haven't been downloaded yet. */
  struct hash_table *blacklist;

  struct url *start_url_parsed = url_parse (start_url, &up_error_code);

  if (!start_url_parsed)
      logprintf (LOG_NOTQUIET, "%s: %s.\n", start_url,
                 url_error (up_error_code));

  queue = url_queue_new ();
  blacklist = make_string_hash_table (0);

  /* Enqueue the starting URL.  Use start_url_parsed->url rather than
     just URL so we enqueue the canonical form of the URL. */
  url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0, 1);
  string_set_add (blacklist, start_url_parsed->url);
  char *url, *referer, *file = NULL;
  int depth, html_allowed;
  boolean dash_p_leaf_HTML = FALSE;

  if (opt.quota && total_downloaded_bytes > opt.quota)
  if (status == FWRITEERR)

  /* Get the next URL from the queue... */

  if (!url_dequeue (queue,
                    (const char **)&url, (const char **)&referer,
                    &depth, &html_allowed))
  /* ...and download it.  Note that this download is in most cases
     unconditional, as download_child_p already makes sure a file
     doesn't get enqueued twice -- and yet this check is here, and
     not in download_child_p.  This is so that if you run `wget -r
     URL1 URL2', and a random URL is encountered once under URL1
     and again under URL2, but at a different (possibly smaller)
     depth, we want the URL's children to be taken into account
     the second time.  */

  if (dl_url_file_map && hash_table_contains (dl_url_file_map, url))
      file = xstrdup (hash_table_get (dl_url_file_map, url));

      DEBUGP (("Already downloaded \"%s\", reusing it from \"%s\".\n",

      && downloaded_html_set
      && string_set_contains (downloaded_html_set, file))
  char *redirected = NULL;
  int oldrec = opt.recursive;

  status = retrieve_url (url, &file, &redirected, referer, &dt);
  opt.recursive = oldrec;

  if (html_allowed && file && status == RETROK
      && (dt & RETROKF) && (dt & TEXTHTML))

  /* We have been redirected, possibly to another host, or
     different path, or wherever.  Check whether we really
     want to follow it. */

  if (!descend_redirect_p (redirected, url, depth,
                           start_url_parsed, blacklist))

  /* Make sure that the old pre-redirect form gets
     blacklisted. */
  string_set_add (blacklist, url);
      && depth >= opt.reclevel && opt.reclevel != INFINITE_RECURSION)

  if (opt.page_requisites
      && (depth == opt.reclevel || depth == opt.reclevel + 1))

    /* When -p is specified, we are allowed to exceed the
       maximum depth, but only for the "inline" links,
       i.e. those that are needed to display the page.
       Originally this could exceed the depth at most by
       one, but we allow one more level so that the leaf
       pages that contain frames can be loaded
       correctly. */
    dash_p_leaf_HTML = TRUE;

  /* Either -p wasn't specified or it was and we've
     already spent the two extra (pseudo-)levels that it
     affords us, so we need to bail out. */
  DEBUGP (("Not descending further; at depth %d, max. %d.\n",
           depth, opt.reclevel));
  /* If the downloaded document was HTML, parse it and enqueue the
     links it contains. */

  int meta_disallow_follow = 0;
  struct urlpos *children
    = get_urls_html (file, url, &meta_disallow_follow);

  if (opt.use_robots && meta_disallow_follow)
    free_urlpos (children);

  struct urlpos *child = children;
  struct url *url_parsed = url_parse (url, NULL);
  assert (url_parsed != NULL);

  for (; child; child = child->next)
      if (child->ignore_when_downloading)
      if (dash_p_leaf_HTML && !child->link_inline_p)
      if (download_child_p (child, url_parsed, depth, start_url_parsed,
          url_enqueue (queue, xstrdup (child->url->url),
                       xstrdup (url), depth + 1,
                       child->link_expect_html);
          /* We blacklist the URL we have enqueued, because we
             don't want to enqueue (and hence download) the
             same URL twice. */
          string_set_add (blacklist, child->url->url);

  url_free (url_parsed);
  free_urlpos (children);
  if (opt.delete_after || (file && !acceptable (file)))
      /* Either --delete-after was specified, or we loaded this
         otherwise rejected (e.g. by -R) HTML file just so we
         could harvest its hyperlinks -- in either case, delete
         the local file. */
      DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
               opt.delete_after ? "--delete-after" :
               "recursive rejection criteria"));
      logprintf (LOG_VERBOSE,
                 ? _("Removing %s.\n")
                 : _("Removing %s since it should be rejected.\n")),
      logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
      register_delete_file (file);

  FREE_MAYBE (referer);
  /* If anything is left of the queue due to a premature exit, free it
     now. */
  while (url_dequeue (queue,
                      (const char **)&d1, (const char **)&d2, &d3, &d4))

  url_queue_delete (queue);

  if (start_url_parsed)
    url_free (start_url_parsed);
  string_set_free (blacklist);

  if (opt.quota && total_downloaded_bytes > opt.quota)
  else if (status == FWRITEERR)
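/* Illustrative sketch, not part of the original source: the "already
   downloaded" lookup performed near the top of the loop in
   retrieve_tree above.  A URL fetched earlier in this run is mapped
   back to its local file through dl_url_file_map, and
   downloaded_html_set records whether that file was HTML, so the file
   can be re-parsed for links without being fetched again.  Only
   functions visible in this file are used; kept under "#if 0".  */
#if 0
static char *
example_reuse_previous_download (const char *url, int *is_html)
{
  char *file = NULL;

  *is_html = 0;
  if (dl_url_file_map && hash_table_contains (dl_url_file_map, url))
    {
      file = xstrdup (hash_table_get (dl_url_file_map, url));
      if (downloaded_html_set
          && string_set_contains (downloaded_html_set, file))
        *is_html = 1;
    }
  return file;
}
#endif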
/* Based on the context provided by retrieve_tree, decide whether a
   URL is to be descended to.  This is only ever called from
   retrieve_tree, but is in a separate function for clarity.

   The most expensive checks (such as those for robots) are memoized
   by storing these URLs to BLACKLIST.  This may or may not help.  It
   will help if those URLs are encountered many times.  */

download_child_p (const struct urlpos *upos, struct url *parent, int depth,
                  struct url *start_url_parsed, struct hash_table *blacklist)
  struct url *u = upos->url;
  const char *url = u->url;
  int u_scheme_like_http;

  DEBUGP (("Deciding whether to enqueue \"%s\".\n", url));

  if (string_set_contains (blacklist, url))
      DEBUGP (("Already on the black list.\n"));
  /* Several things to check for:
     1. if scheme is not http, and we don't load it
     2. check for relative links (if relative_only is set)
     3. check for domain
     4. check for no-parent
     5. check for excludes && includes
     6. check for suffix
     7. check for same host (if spanhost is unset), with possible
     gethostbyname baggage
     8. check for robots.txt

     Addendum: If the URL is FTP, and it is to be loaded, only the
     domain and suffix settings are "stronger".

     Note that .html files will get loaded regardless of suffix rules
     (but that is remedied later with unlink) unless the depth equals
     the maximum depth.

     More time- and memory- consuming tests should be put later on
     the list.  */
  /* Determine whether URL under consideration has a HTTP-like scheme. */
  u_scheme_like_http = schemes_are_similar_p (u->scheme, SCHEME_HTTP);

  /* 1. Schemes other than HTTP are normally not recursed into. */
  if (!u_scheme_like_http && !(u->scheme == SCHEME_FTP && opt.follow_ftp))
      DEBUGP (("Not following non-HTTP schemes.\n"));

  /* 2. If it is an absolute link and they are not followed, throw it
     out. */
  if (u_scheme_like_http)
    if (opt.relative_only && !upos->link_relative_p)
        DEBUGP (("It doesn't really look like a relative link.\n"));

  /* 3. If its domain is not to be accepted/looked-up, chuck it
     out. */
  if (!accept_domain (u))
      DEBUGP (("The domain was not accepted.\n"));

  /* 4. Check for parent directory.

     If we descended to a different host or changed the scheme, ignore
     opt.no_parent.  Also ignore it for documents needed to display
     the parent page when in -p mode. */
      && schemes_are_similar_p (u->scheme, start_url_parsed->scheme)
      && 0 == strcasecmp (u->host, start_url_parsed->host)
      && u->port == start_url_parsed->port
      && !(opt.page_requisites && upos->link_inline_p))
    if (!frontcmp (start_url_parsed->dir, u->dir))
        DEBUGP (("Going to \"%s\" would escape \"%s\" with no_parent on.\n",
                 u->dir, start_url_parsed->dir));
  /* 5. If the file does not match the acceptance list, or is on the
     rejection list, chuck it out.  The same goes for the directory
     exclusion and inclusion lists. */
  if (opt.includes || opt.excludes)
    if (!accdir (u->dir, ALLABS))
        DEBUGP (("%s (%s) is excluded/not-included.\n", url, u->dir));

  /* 6. Check for acceptance/rejection rules.  We ignore these rules
     for directories (no file name to match) and for HTML documents,
     which might lead to other files that do need to be downloaded.
     That is, unless we've exhausted the recursion depth anyway. */
  if (u->file[0] != '\0'
      && !(has_html_suffix_p (u->file)
           && depth != INFINITE_RECURSION
           && depth < opt.reclevel - 1))
    if (!acceptable (u->file))
        DEBUGP (("%s (%s) does not match acc/rej rules.\n",
  if (schemes_are_similar_p (u->scheme, parent->scheme))
    if (!opt.spanhost && 0 != strcasecmp (parent->host, u->host))
        DEBUGP (("This is not the same hostname as the parent's (%s and %s).\n",
                 u->host, parent->host));

  if (opt.use_robots && u_scheme_like_http)
      struct robot_specs *specs = res_get_specs (u->host, u->port);

      if (res_retrieve_file (url, &rfile))
          specs = res_parse_from_file (rfile);

          /* If we cannot get real specs, at least produce
             dummy ones so that we can register them and stop
             trying to retrieve them. */
          specs = res_parse ("", 0);

      res_register_specs (u->host, u->port, specs);

      /* Now that we have (or don't have) robots.txt specs, we can
         check what they say. */
      if (!res_match_path (specs, u->path))
          DEBUGP (("Not following %s because robots.txt forbids it.\n", url));
          string_set_add (blacklist, url);
  /* The URL has passed all the tests.  It can be placed in the
     download queue. */
  DEBUGP (("Decided to load it.\n"));

  DEBUGP (("Decided NOT to load it.\n"));
/* This function determines whether we will consider downloading the
   children of a URL whose download resulted in a redirection,
   possibly to another host, etc.  It is needed very rarely, and thus
   it is merely a simple-minded wrapper around download_child_p. */

descend_redirect_p (const char *redirected, const char *original, int depth,
                    struct url *start_url_parsed, struct hash_table *blacklist)
  struct url *orig_parsed, *new_parsed;

  orig_parsed = url_parse (original, NULL);
  assert (orig_parsed != NULL);

  new_parsed = url_parse (redirected, NULL);
  assert (new_parsed != NULL);

  upos = xmalloc (sizeof (struct urlpos));
  memset (upos, 0, sizeof (*upos));
  upos->url = new_parsed;

  success = download_child_p (upos, orig_parsed, depth,
                              start_url_parsed, blacklist);

  url_free (orig_parsed);
  url_free (new_parsed);

  DEBUGP (("Redirection \"%s\" failed the test.\n", redirected));