53
54
static struct hash_table *dl_file_url_map;
54
55
struct hash_table *dl_url_file_map;
56
/* List of HTML files downloaded in this Wget run, used for link
57
conversion after Wget is done. The list and the set contain the
58
same information, except the list maintains the order. Perhaps I
59
should get rid of the list, it's there for historical reasons. */
60
static slist *downloaded_html_list;
57
/* Set of HTML files downloaded in this Wget run, used for link
58
conversion after Wget is done. */
61
59
struct hash_table *downloaded_html_set;
63
61
static void convert_links PARAMS ((const char *, struct urlpos *));
81
79
convert_all_links (void)
85
83
int file_count = 0;
87
struct wget_timer *timer = wtimer_new ();
89
/* Destructively reverse downloaded_html_list to get it in the right order.
90
recursive_retrieve() used slist_prepend() consistently. */
91
downloaded_html_list = slist_nreverse (downloaded_html_list);
93
for (html = downloaded_html_list; html; html = html->next)
85
struct ptimer *timer = ptimer_new ();
91
if (downloaded_html_set)
92
cnt = hash_table_count (downloaded_html_set);
95
file_array = alloca_array (char *, cnt);
96
string_set_to_array (downloaded_html_set, file_array);
98
for (i = 0; i < cnt; i++)
95
100
struct urlpos *urls, *cur_url;
97
char *file = html->string;
102
char *file = file_array[i];
99
104
/* Determine the URL of the HTML file. get_urls_html will need
327
332
logprintf (LOG_VERBOSE, "%d-%d\n", to_file_count, to_url_count);
330
/* Construct and return a malloced copy of the relative link from two
331
pieces of information: local name S1 of the referring file and
332
local name S2 of the referred file.
334
So, if S1 is "jagor.srce.hr/index.html" and S2 is
335
"jagor.srce.hr/images/news.gif", the function will return
338
Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
339
"fly.cc.fer.hr/images/fly.gif", the function will return
342
Caveats: S1 should not begin with `/', unless S2 also begins with
343
'/'. S1 should not contain things like ".." and such --
344
construct_relative ("fly/ioccc/../index.html",
345
"fly/images/fly.gif") will fail. (A workaround is to call
346
something like path_simplify() on S1). */
335
/* Construct and return a link that points from BASEFILE to LINKFILE.
336
Both files should be local file names, BASEFILE of the referring
337
file, and LINKFILE of the referred file.
341
cr("foo", "bar") -> "bar"
342
cr("A/foo", "A/bar") -> "bar"
343
cr("A/foo", "A/B/bar") -> "B/bar"
344
cr("A/X/foo", "A/Y/bar") -> "../Y/bar"
345
cr("X/", "Y/bar") -> "../Y/bar" (trailing slash does matter in BASE)
347
Both files should be absolute or relative, otherwise strange
348
results might ensue. The function makes no special efforts to
349
handle "." and ".." in links, so make sure they're not there
350
(e.g. using path_simplify). */
348
construct_relative (const char *s1, const char *s2)
353
construct_relative (const char *basefile, const char *linkfile)
350
int i, cnt, sepdirs1;
355
/* S1 should *not* be absolute, if S2 wasn't. */
358
/* Skip the directories common to both strings. */
361
while (s1[i] && s2[i]
366
if (s1[i] == '/' && s2[i] == '/')
371
for (sepdirs1 = 0; s1[i]; i++)
374
/* Now, construct the file as of:
375
- ../ repeated sepdirs1 times
376
- all the non-mutual directories of S2. */
377
res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
378
for (i = 0; i < sepdirs1; i++)
379
memcpy (res + 3 * i, "../", 3);
380
strcpy (res + 3 * i, s2 + cnt);
360
/* First, skip the initial directory components common to both
363
for (b = basefile, l = linkfile; *b == *l && *b != '\0'; ++b, ++l)
366
start = (b - basefile) + 1;
371
/* With common directories out of the way, the situation we have is
373
b - b1/b2/[...]/bfile
374
l - l1/l2/[...]/lfile
376
The link we're constructing needs to be:
377
lnk - ../../l1/l2/[...]/lfile
379
Where the number of ".."'s equals the number of bN directory
382
/* Count the directory components in B. */
384
for (b = basefile; *b; b++)
390
/* Construct LINK as explained above. */
391
link = (char *)xmalloc (3 * basedirs + strlen (linkfile) + 1);
392
for (i = 0; i < basedirs; i++)
393
memcpy (link + 3 * i, "../", 3);
394
strcpy (link + 3 * i, linkfile);
398
/* Used by write_backup_file to remember which files have been
400
static struct hash_table *converted_files;
385
403
write_backup_file (const char *file, downloaded_file_t downloaded_file_return)
406
421
".html", so we need to compare vs. the original URL plus
407
422
".orig", not the original URL plus ".html.orig". */
408
423
filename_plus_orig_suffix = alloca (filename_len + 1);
409
strcpy(filename_plus_orig_suffix, file);
410
strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");
424
strcpy (filename_plus_orig_suffix, file);
425
strcpy ((filename_plus_orig_suffix + filename_len) - 4, "orig");
412
427
else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */
414
429
/* Append ".orig" to the name. */
415
filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));
416
strcpy(filename_plus_orig_suffix, file);
417
strcpy(filename_plus_orig_suffix + filename_len, ".orig");
430
filename_plus_orig_suffix = alloca (filename_len + sizeof (".orig"));
431
strcpy (filename_plus_orig_suffix, file);
432
strcpy (filename_plus_orig_suffix + filename_len, ".orig");
435
if (!converted_files)
436
converted_files = make_string_hash_table (0);
420
438
/* We can get called twice on the same URL thanks to the
421
439
convert_all_links() call in main(). If we write the .orig file
422
440
each time in such a case, it'll end up containing the first-pass
423
441
conversion, not the original file. So, see if we've already been
424
442
called on this file. */
425
converted_file_ptr = converted_files;
426
while (converted_file_ptr != NULL)
427
if (strcmp(converted_file_ptr->string, file) == 0)
429
already_wrote_backup_file = TRUE;
433
converted_file_ptr = converted_file_ptr->next;
435
if (!already_wrote_backup_file)
443
if (!string_set_contains (converted_files, file))
437
445
/* Rename <file> to <file>.orig before former gets written over. */
438
if (rename(file, filename_plus_orig_suffix) != 0)
446
if (rename (file, filename_plus_orig_suffix) != 0)
439
447
logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),
440
448
file, filename_plus_orig_suffix, strerror (errno));
566
571
"index.html%3Ffoo=bar" would break local browsing, as the latter
567
572
isn't even recognized as an HTML file! However, converting
568
573
"index.html?foo=bar.html" to "index.html%3Ffoo=bar.html" should be
569
safe for both local and HTTP-served browsing. */
574
safe for both local and HTTP-served browsing.
576
We always quote "#" as "%23" and "%" as "%25" because those
577
characters have special meanings in URLs. */
572
580
local_quote_string (const char *file)
574
const char *file_sans_qmark;
577
if (!opt.html_extension)
585
char *any = strpbrk (file, "?#%");
578
587
return html_quote_string (file);
580
qm = count_char (file, '?');
584
const char *from = file;
587
/* qm * 2 because we replace each question mark with "%3F",
588
i.e. replace one char with three, hence two more. */
589
int fsqlen = strlen (file) + qm * 2;
591
to = newname = (char *)alloca (fsqlen + 1);
592
for (; *from; from++)
603
assert (to - newname == fsqlen);
606
file_sans_qmark = newname;
609
file_sans_qmark = file;
611
return html_quote_string (file_sans_qmark);
589
/* Allocate space assuming the worst-case scenario, each character
590
having to be quoted. */
591
to = newname = (char *)alloca (3 * strlen (file) + 1);
592
for (from = file; *from; from++)
606
if (opt.html_extension)
619
return html_quote_string (newname);
614
622
/* Book-keeping code for dl_file_url_map, dl_url_file_map,
827
835
if (!downloaded_html_set)
828
836
downloaded_html_set = make_string_hash_table (0);
829
else if (hash_table_contains (downloaded_html_set, file))
832
/* The set and the list should use the same copy of FILE, but the
833
slist interface insists on strduping the string it gets. Oh
835
837
string_set_add (downloaded_html_set, file);
836
downloaded_html_list = slist_prepend (downloaded_html_list, file);
839
/* Clean up the data structures associated with recursive retrieving
840
(the variables above). */
840
static void downloaded_files_free PARAMS ((void));
842
/* Clean up the data structures associated with this file. */
842
845
convert_cleanup (void)
957
961
downloaded_files_hash = NULL;
965
/* The function returns the pointer to the malloc-ed quoted version of
966
string s. It will recognize and quote numeric and special graphic
967
entities, as per RFC1866:
975
No other entities are recognized or replaced. */
977
html_quote_string (const char *s)
983
/* Pass through the string, and count the new size. */
984
for (i = 0; *s; s++, i++)
988
else if (*s == '<' || *s == '>')
989
i += 3; /* `lt;' and `gt;' */
991
i += 5; /* `quot;' */
995
res = (char *)xmalloc (i + 1);
997
for (p = res; *s; s++)
1010
*p++ = (*s == '<' ? 'l' : 'g');