14
15
GNU General Public License for more details.
16
17
You should have received a copy of the GNU General Public License
17
along with Wget; if not, write to the Free Software
18
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20
In addition, as a special exception, the Free Software Foundation
21
gives permission to link the code of its release of Wget with the
22
OpenSSL project's "OpenSSL" library (or with modified versions of it
23
that use the same license as the "OpenSSL" library), and distribute
24
the linked executables. You must obey the GNU General Public License
25
in all respects for all of the code used other than "OpenSSL". If you
26
modify this file, you may extend this exception to your version of the
27
file, but you are not obligated to do so. If you do not wish to do
28
so, delete this exception statement from your version. */
18
along with Wget. If not, see <http://www.gnu.org/licenses/>.
20
Additional permission under GNU GPL version 3 section 7
22
If you modify this program, or any covered work, by linking or
23
combining it with the OpenSSL project's OpenSSL library (or a
24
modified version of that library), containing parts covered by the
25
terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
26
grants you additional permission to convey the resulting work.
27
Corresponding Source for a non-source form of such a combination
28
shall include the source code for the parts of OpenSSL used as well
29
as that of the covered work. */
30
31
#include <config.h>
38
35
#include <stdlib.h>
40
37
#include <assert.h>
98
90
tag_handler_t handler;
100
{ TAG_A, "a", tag_find_urls },
101
{ TAG_APPLET, "applet", tag_find_urls },
102
{ TAG_AREA, "area", tag_find_urls },
103
{ TAG_BASE, "base", tag_handle_base },
104
{ TAG_BGSOUND, "bgsound", tag_find_urls },
105
{ TAG_BODY, "body", tag_find_urls },
106
{ TAG_EMBED, "embed", tag_find_urls },
107
{ TAG_FIG, "fig", tag_find_urls },
108
{ TAG_FORM, "form", tag_handle_form },
109
{ TAG_FRAME, "frame", tag_find_urls },
110
{ TAG_IFRAME, "iframe", tag_find_urls },
111
{ TAG_IMG, "img", tag_find_urls },
112
{ TAG_INPUT, "input", tag_find_urls },
113
{ TAG_LAYER, "layer", tag_find_urls },
114
{ TAG_LINK, "link", tag_handle_link },
115
{ TAG_META, "meta", tag_handle_meta },
116
{ TAG_OBJECT, "object", tag_find_urls },
117
{ TAG_OVERLAY, "overlay", tag_find_urls },
118
{ TAG_SCRIPT, "script", tag_find_urls },
119
{ TAG_TABLE, "table", tag_find_urls },
120
{ TAG_TD, "td", tag_find_urls },
121
{ TAG_TH, "th", tag_find_urls }
92
{ TAG_A, "a", tag_find_urls },
93
{ TAG_APPLET, "applet", tag_find_urls },
94
{ TAG_AREA, "area", tag_find_urls },
95
{ TAG_BASE, "base", tag_handle_base },
96
{ TAG_BGSOUND, "bgsound", tag_find_urls },
97
{ TAG_BODY, "body", tag_find_urls },
98
{ TAG_EMBED, "embed", tag_find_urls },
99
{ TAG_FIG, "fig", tag_find_urls },
100
{ TAG_FORM, "form", tag_handle_form },
101
{ TAG_FRAME, "frame", tag_find_urls },
102
{ TAG_IFRAME, "iframe", tag_find_urls },
103
{ TAG_IMG, "img", tag_find_urls },
104
{ TAG_INPUT, "input", tag_find_urls },
105
{ TAG_LAYER, "layer", tag_find_urls },
106
{ TAG_LINK, "link", tag_handle_link },
107
{ TAG_META, "meta", tag_handle_meta },
108
{ TAG_OBJECT, "object", tag_find_urls },
109
{ TAG_OVERLAY, "overlay", tag_find_urls },
110
{ TAG_SCRIPT, "script", tag_find_urls },
111
{ TAG_TABLE, "table", tag_find_urls },
112
{ TAG_TD, "td", tag_find_urls },
113
{ TAG_TH, "th", tag_find_urls }
124
116
/* tag_url_attributes documents which attributes of which tags contain
145
137
const char *attr_name;
147
139
} tag_url_attributes[] = {
148
{ TAG_A, "href", ATTR_HTML },
149
{ TAG_APPLET, "code", ATTR_INLINE },
150
{ TAG_AREA, "href", ATTR_HTML },
151
{ TAG_BGSOUND, "src", ATTR_INLINE },
152
{ TAG_BODY, "background", ATTR_INLINE },
153
{ TAG_EMBED, "href", ATTR_HTML },
154
{ TAG_EMBED, "src", ATTR_INLINE | ATTR_HTML },
155
{ TAG_FIG, "src", ATTR_INLINE },
156
{ TAG_FRAME, "src", ATTR_INLINE | ATTR_HTML },
157
{ TAG_IFRAME, "src", ATTR_INLINE | ATTR_HTML },
158
{ TAG_IMG, "href", ATTR_INLINE },
159
{ TAG_IMG, "lowsrc", ATTR_INLINE },
160
{ TAG_IMG, "src", ATTR_INLINE },
161
{ TAG_INPUT, "src", ATTR_INLINE },
162
{ TAG_LAYER, "src", ATTR_INLINE | ATTR_HTML },
163
{ TAG_OBJECT, "data", ATTR_INLINE },
164
{ TAG_OVERLAY, "src", ATTR_INLINE | ATTR_HTML },
165
{ TAG_SCRIPT, "src", ATTR_INLINE },
166
{ TAG_TABLE, "background", ATTR_INLINE },
167
{ TAG_TD, "background", ATTR_INLINE },
168
{ TAG_TH, "background", ATTR_INLINE }
140
{ TAG_A, "href", ATTR_HTML },
141
{ TAG_APPLET, "code", ATTR_INLINE },
142
{ TAG_AREA, "href", ATTR_HTML },
143
{ TAG_BGSOUND, "src", ATTR_INLINE },
144
{ TAG_BODY, "background", ATTR_INLINE },
145
{ TAG_EMBED, "href", ATTR_HTML },
146
{ TAG_EMBED, "src", ATTR_INLINE | ATTR_HTML },
147
{ TAG_FIG, "src", ATTR_INLINE },
148
{ TAG_FRAME, "src", ATTR_INLINE | ATTR_HTML },
149
{ TAG_IFRAME, "src", ATTR_INLINE | ATTR_HTML },
150
{ TAG_IMG, "href", ATTR_INLINE },
151
{ TAG_IMG, "lowsrc", ATTR_INLINE },
152
{ TAG_IMG, "src", ATTR_INLINE },
153
{ TAG_INPUT, "src", ATTR_INLINE },
154
{ TAG_LAYER, "src", ATTR_INLINE | ATTR_HTML },
155
{ TAG_OBJECT, "data", ATTR_INLINE },
156
{ TAG_OVERLAY, "src", ATTR_INLINE | ATTR_HTML },
157
{ TAG_SCRIPT, "src", ATTR_INLINE },
158
{ TAG_TABLE, "background", ATTR_INLINE },
159
{ TAG_TD, "background", ATTR_INLINE },
160
{ TAG_TH, "background", ATTR_INLINE }
171
163
/* The lists of interesting tags and attributes are built dynamically,
172
164
from the information above. However, some places in the code refer
173
165
to the attributes not mentioned here. We add them manually. */
174
166
static const char *additional_attributes[] = {
175
"rel", /* used by tag_handle_link */
176
"http-equiv", /* used by tag_handle_meta */
177
"name", /* used by tag_handle_meta */
178
"content", /* used by tag_handle_meta */
179
"action" /* used by tag_handle_form */
167
"rel", /* used by tag_handle_link */
168
"http-equiv", /* used by tag_handle_meta */
169
"name", /* used by tag_handle_meta */
170
"content", /* used by tag_handle_meta */
171
"action" /* used by tag_handle_form */
182
struct hash_table *interesting_tags;
183
struct hash_table *interesting_attributes;
174
static struct hash_table *interesting_tags;
175
static struct hash_table *interesting_attributes;
186
178
init_interesting (void)
209
201
for (ignored = opt.ignore_tags; *ignored; ignored++)
210
hash_table_remove (interesting_tags, *ignored);
202
hash_table_remove (interesting_tags, *ignored);
213
205
/* If --follow-tags is specified, use only those tags. */
214
206
if (opt.follow_tags)
216
208
/* Create a new table intersecting --follow-tags and known_tags,
217
and use it as interesting_tags. */
209
and use it as interesting_tags. */
218
210
struct hash_table *intersect = make_nocase_string_hash_table (0);
220
212
for (followed = opt.follow_tags; *followed; followed++)
222
struct known_tag *t = hash_table_get (interesting_tags, *followed);
224
continue; /* ignore unknown --follow-tags entries. */
225
hash_table_put (intersect, *followed, t);
214
struct known_tag *t = hash_table_get (interesting_tags, *followed);
216
continue; /* ignore unknown --follow-tags entries. */
217
hash_table_put (intersect, *followed, t);
227
219
hash_table_destroy (interesting_tags);
228
220
interesting_tags = intersect;
248
240
for (i = 0; i < tag->nattrs; i++)
249
241
if (!strcasecmp (tag->attrs[i].name, name))
253
return tag->attrs[i].value;
245
return tag->attrs[i].value;
258
250
struct map_context {
259
char *text; /* HTML text. */
260
char *base; /* Base URI of the document, possibly
261
changed through <base href=...>. */
262
const char *parent_base; /* Base of the current document. */
263
const char *document_file; /* File name of this document. */
264
int nofollow; /* whether NOFOLLOW was specified in a
251
char *text; /* HTML text. */
252
char *base; /* Base URI of the document, possibly
253
changed through <base href=...>. */
254
const char *parent_base; /* Base of the current document. */
255
const char *document_file; /* File name of this document. */
256
bool nofollow; /* whether NOFOLLOW was specified in a
265
257
<meta name=robots> tag. */
267
struct urlpos *head, *tail; /* List of URLs that is being
259
struct urlpos *head, *tail; /* List of URLs that is being
271
263
/* Append LINK_URI to the urlpos structure that is being built.
288
280
DEBUGP (("%s: no base, merge will use \"%s\".\n",
289
ctx->document_file, link_uri));
281
ctx->document_file, link_uri));
291
283
if (!link_has_scheme)
293
/* Base URL is unavailable, and the link does not have a
294
location attached to it -- we have to give up. Since
295
this can only happen when using `--force-html -i', print
297
logprintf (LOG_NOTQUIET,
298
_("%s: Cannot resolve incomplete link %s.\n"),
299
ctx->document_file, link_uri);
285
/* Base URL is unavailable, and the link does not have a
286
location attached to it -- we have to give up. Since
287
this can only happen when using `--force-html -i', print
289
logprintf (LOG_NOTQUIET,
290
_("%s: Cannot resolve incomplete link %s.\n"),
291
ctx->document_file, link_uri);
303
295
url = url_parse (link_uri, NULL);
306
DEBUGP (("%s: link \"%s\" doesn't parse.\n",
307
ctx->document_file, link_uri));
298
DEBUGP (("%s: link \"%s\" doesn't parse.\n",
299
ctx->document_file, link_uri));
313
305
/* Merge BASE with LINK_URI, but also make sure the result is
314
canonicalized, i.e. that "../" have been resolved.
315
(url_parse will do that for us.) */
306
canonicalized, i.e. that "../" have been resolved.
307
(url_parse will do that for us.) */
317
309
char *complete_uri = uri_merge (base, link_uri);
319
311
DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",
320
ctx->document_file, base, link_uri, complete_uri));
312
ctx->document_file, base, link_uri, complete_uri));
322
314
url = url_parse (complete_uri, NULL);
325
DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
326
ctx->document_file, complete_uri));
327
xfree (complete_uri);
317
DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
318
ctx->document_file, complete_uri));
319
xfree (complete_uri);
330
322
xfree (complete_uri);
389
381
for (attrind = 0; attrind < tag->nattrs; attrind++)
391
383
/* Find whether TAG/ATTRIND is a combination that contains a
393
385
char *link = tag->attrs[attrind].value;
394
386
const int size = countof (tag_url_attributes);
396
388
/* If you're cringing at the inefficiency of the nested loops,
397
remember that they both iterate over a very small number of
398
items. The worst-case inner loop is for the IMG tag, which
399
has three attributes. */
389
remember that they both iterate over a very small number of
390
items. The worst-case inner loop is for the IMG tag, which
391
has three attributes. */
400
392
for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++)
402
if (0 == strcasecmp (tag->attrs[attrind].name,
403
tag_url_attributes[i].attr_name))
405
struct urlpos *up = append_url (link, tag, attrind, ctx);
408
int flags = tag_url_attributes[i].flags;
409
if (flags & ATTR_INLINE)
410
up->link_inline_p = 1;
411
if (flags & ATTR_HTML)
412
up->link_expect_html = 1;
394
if (0 == strcasecmp (tag->attrs[attrind].name,
395
tag_url_attributes[i].attr_name))
397
struct urlpos *up = append_url (link, tag, attrind, ctx);
400
int flags = tag_url_attributes[i].flags;
401
if (flags & ATTR_INLINE)
402
up->link_inline_p = 1;
403
if (flags & ATTR_HTML)
404
up->link_expect_html = 1;
517
509
char *refresh = find_attr (tag, "content", &attrind);
521
513
for (p = refresh; ISDIGIT (*p); p++)
522
timeout = 10 * timeout + *p - '0';
514
timeout = 10 * timeout + *p - '0';
526
518
while (ISSPACE (*p))
528
520
if (!( TOUPPER (*p) == 'U'
529
&& TOUPPER (*(p + 1)) == 'R'
530
&& TOUPPER (*(p + 2)) == 'L'
521
&& TOUPPER (*(p + 1)) == 'R'
522
&& TOUPPER (*(p + 2)) == 'L'
534
526
while (ISSPACE (*p))
537
529
entry = append_url (p, tag, attrind, ctx);
540
entry->link_refresh_p = 1;
541
entry->refresh_timeout = timeout;
542
entry->link_expect_html = 1;
532
entry->link_refresh_p = 1;
533
entry->refresh_timeout = timeout;
534
entry->link_expect_html = 1;
545
537
else if (name && 0 == strcasecmp (name, "robots"))
547
539
/* Handle stuff like:
548
<meta name="robots" content="index,nofollow"> */
540
<meta name="robots" content="index,nofollow"> */
549
541
char *content = find_attr (tag, "content", NULL);
552
544
if (!strcasecmp (content, "none"))
545
ctx->nofollow = true;
558
/* Find the next occurrence of ',' or the end of
560
char *end = strchr (content, ',');
564
end = content + strlen (content);
565
if (!strncasecmp (content, "nofollow", end - content))
550
/* Find the next occurrence of ',' or the end of
552
char *end = strchr (content, ',');
556
end = content + strlen (content);
557
if (!strncasecmp (content, "nofollow", end - content))
558
ctx->nofollow = true;
671
663
const char *line_beg = text;
672
664
const char *line_end = memchr (text, '\n', text_end - text);
679
671
/* Strip whitespace from the beginning and end of line. */
680
672
while (line_beg < line_end && ISSPACE (*line_beg))
682
674
while (line_end > line_beg && ISSPACE (*(line_end - 1)))
685
677
if (line_beg == line_end)
688
680
/* The URL is in the [line_beg, line_end) region. */
690
682
/* We must copy the URL to a zero-terminated string, and we
691
can't use alloca because we're in a loop. *sigh*. */
683
can't use alloca because we're in a loop. *sigh*. */
692
684
url_text = strdupdelim (line_beg, line_end);
694
686
if (opt.base_href)
696
/* Merge opt.base_href with URL. */
697
char *merged = uri_merge (opt.base_href, url_text);
688
/* Merge opt.base_href with URL. */
689
char *merged = uri_merge (opt.base_href, url_text);
702
694
url = url_parse (url_text, &up_error_code);
705
logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"),
706
file, url_text, url_error (up_error_code));
697
logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"),
698
file, url_text, url_error (up_error_code));
710
702
xfree (url_text);
712
704
entry = xnew0 (struct urlpos);
714
705
entry->url = url;
722
713
read_file_free (fm);