~ubuntu-branches/ubuntu/dapper/wget/dapper-updates

« back to all changes in this revision

Viewing changes to src/convert.c

Committer: Bazaar Package Importer
Author(s): Noèl Köthe
Date: 2004-02-13 20:26:44 UTC
Revision ID: james.westby@ubuntu.com-20040213202644-skxj93qs15sskqfy

Tags: upstream-1.9.1

Import upstream version 1.9.1

files added:

AUTHORS

COPYING

ChangeLog

ChangeLog-branches

ChangeLog-branches/1.6_branch.ChangeLog

ChangeLog.README

INSTALL

MACHINES

MAILING-LIST

Makefile.cvs

Makefile.in

NEWS

PATCHES

README

README.cvs

TODO

aclocal.m4

config.guess

config.sub

configure

configure.bat

configure.bat.in

configure.in

doc/ChangeLog

doc/ChangeLog-branches

doc/ChangeLog-branches/1.6_branch.ChangeLog

doc/Makefile.in

doc/ansi2knr.1

doc/sample.wgetrc

doc/sample.wgetrc.munged_for_texi_inclusion

doc/texi2pod.pl.in

doc/texinfo.tex

doc/version.texi

doc/wget.info

doc/wget.info-1

doc/wget.info-2

doc/wget.info-3

doc/wget.info-4

doc/wget.texi

install-sh

libtool.m4

ltmain.sh

mkinstalldirs

po/Makefile.in.in

po/POTFILES.in

po/bg.gmo

po/bg.po

po/ca.gmo

po/ca.po

po/cs.gmo

po/cs.po

po/da.gmo

po/da.po

po/de.gmo

po/de.po

po/el.gmo

po/el.po

po/es.gmo

po/es.po

po/et.gmo

po/et.po

po/fr.gmo

po/fr.po

po/gl.gmo

po/gl.po

po/he.gmo

po/he.po

po/hr.gmo

po/hr.po

po/hu.gmo

po/hu.po

po/it.gmo

po/it.po

po/ja.gmo

po/ja.po

po/nl.gmo

po/nl.po

po/no.gmo

po/no.po

po/pl.gmo

po/pl.po

po/pt_BR.gmo

po/pt_BR.po

po/ro.gmo

po/ro.po

po/ru.gmo

po/ru.po

po/sk.gmo

po/sk.po

po/sl.gmo

po/sl.po

po/sv.gmo

po/sv.po

po/tr.gmo

po/tr.po

po/uk.gmo

po/uk.po

po/wget.pot

po/zh_CN.gmo

po/zh_CN.po

po/zh_TW.gmo

po/zh_TW.po

src/ChangeLog

src/ChangeLog-branches

src/ChangeLog-branches/1.6_branch.ChangeLog

src/ChangeLog-branches/1.8_branch.ChangeLog

src/Makefile.in

src/alloca.c

src/ansi2knr.c

src/cmpt.c

src/config.h.in

src/connect.c

src/connect.h

src/convert.c

src/convert.h

src/cookies.c

src/cookies.h

src/ftp-basic.c

src/ftp-ls.c

src/ftp-opie.c

src/ftp.c

src/ftp.h

src/gen-md5.c

src/gen-md5.h

src/gen_sslfunc.c

src/gen_sslfunc.h

src/getopt.c

src/getopt.h

src/gnu-md5.c

src/gnu-md5.h

src/hash.c

src/hash.h

src/headers.c

src/headers.h

src/host.c

src/host.h

src/html-parse.c

src/html-parse.h

src/html-url.c

src/http.c

src/init.c

src/init.h

src/log.c

src/main.c

src/mswindows.c

src/mswindows.h

src/netrc.c

src/netrc.h

src/options.h

src/progress.c

src/progress.h

src/rbuf.c

src/rbuf.h

src/recur.c

src/recur.h

src/res.c

src/res.h

src/retr.c

src/retr.h

src/safe-ctype.c

src/safe-ctype.h

src/snprintf.c

src/sysdep.h

src/url.c

src/url.h

src/utils.c

src/utils.h

src/version.c

src/wget.h

stamp-h.in

util

util/Makefile.in

util/README

util/dist-wget

util/download-netscape.html

util/download.html

util/rmold.pl

util/wget.spec

windows

windows/Makefile.doc

windows/Makefile.in

windows/Makefile.src

windows/Makefile.src.bor

windows/Makefile.top

windows/Makefile.top.bor

windows/Makefile.watcom

windows/README

windows/config.h.bor

windows/config.h.ms

windows/wget.dep

Show diffs side-by-side

added added

removed removed

src/convert.c

/* Conversion of links to local files.

This file is part of GNU Wget.

GNU Wget is free software; you can redistribute it and/or modify

it under the terms of the GNU General Public License as published by

the Free Software Foundation; either version 2 of the License, or

(at your option) any later version.

GNU Wget is distributed in the hope that it will be useful,

but WITHOUT ANY WARRANTY; without even the implied warranty of

MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

GNU General Public License for more details.

You should have received a copy of the GNU General Public License

along with Wget; if not, write to the Free Software

Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

In addition, as a special exception, the Free Software Foundation

gives permission to link the code of its release of Wget with the

OpenSSL project's "OpenSSL" library (or with modified versions of it

that use the same license as the "OpenSSL" library), and distribute

the linked executables. You must obey the GNU General Public License

in all respects for all of the code used other than "OpenSSL". If you

modify this file, you may extend this exception to your version of the

file, but you are not obligated to do so. If you do not wish to do

so, delete this exception statement from your version. */

#include <config.h>

#include <stdio.h>

#include <stdlib.h>

#ifdef HAVE_STRING_H

# include <string.h>

#else

# include <strings.h>

#endif /* HAVE_STRING_H */

#ifdef HAVE_UNISTD_H

# include <unistd.h>

#endif /* HAVE_UNISTD_H */

#include <errno.h>

#include <assert.h>

#include <sys/types.h>

#include "wget.h"

#include "convert.h"

#include "url.h"

#include "recur.h"

#include "utils.h"

#include "hash.h"

/* Maps each local file name to the URL it was downloaded from;
   filled in by register_download().  */
static struct hash_table *dl_file_url_map;

/* Maps each URL to the local file it was saved as; filled in by
   register_download() and register_redirection().  */
struct hash_table *dl_url_file_map;

/* List of HTML files downloaded in this Wget run, used for link
   conversion after Wget is done.  The list and the set contain the
   same information, except the list maintains the order.  Perhaps I
   should get rid of the list, it's there for historical reasons.  */

static slist *downloaded_html_list;

struct hash_table *downloaded_html_set;

/* Forward declaration: convert_all_links() calls convert_links()
   before its definition appears.  */
static void convert_links PARAMS ((const char *, struct urlpos *));

/* This function is called when the retrieval is done to convert the

links that have been downloaded. It has to be called at the end of

the retrieval, because only then does Wget know conclusively which

URLs have been downloaded, and which not, so it can tell which

direction to convert to.

The "direction" means that the URLs to the files that have been

downloaded get converted to the relative URL which will point to

that file. And the other URLs get converted to the remote URL on

the server.

All the downloaded HTMLs are kept in downloaded_html_files, and

downloaded URLs in urls_downloaded. All the information is

extracted from these two lists. */

void

convert_all_links (void)

{

slist *html;

long msecs;

int file_count = 0;

struct wget_timer *timer = wtimer_new ();

/* Destructively reverse downloaded_html_files to get it in the right order.

recursive_retrieve() used slist_prepend() consistently. */

downloaded_html_list = slist_nreverse (downloaded_html_list);

for (html = downloaded_html_list; html; html = html->next)

{

struct urlpos *urls, *cur_url;

char *url;

char *file = html->string;

/* Determine the URL of the HTML file. get_urls_html will need

100

it. */

101

url = hash_table_get (dl_file_url_map, file);

102

if (!url)

103

{

104

DEBUGP (("Apparently %s has been removed.\n", file));

105

continue;

106

}

107

108

DEBUGP (("Scanning %s (from %s)\n", file, url));

109

110

/* Parse the HTML file... */

111

urls = get_urls_html (file, url, NULL);

112

113

/* We don't respect meta_disallow_follow here because, even if

114

the file is not followed, we might still want to convert the

115

links that have been followed from other files. */

116

117

for (cur_url = urls; cur_url; cur_url = cur_url->next)

118

{

119

char *local_name;

120

struct url *u = cur_url->url;

121

122

if (cur_url->link_base_p)

123

{

124

/* Base references have been resolved by our parser, so

125

we turn the base URL into an empty string. (Perhaps

126

we should remove the tag entirely?) */

127

cur_url->convert = CO_NULLIFY_BASE;

128

continue;

129

}

130

131

/* We decide the direction of conversion according to whether

132

a URL was downloaded. Downloaded URLs will be converted

133

ABS2REL, whereas non-downloaded will be converted REL2ABS. */

134

local_name = hash_table_get (dl_url_file_map, u->url);

135

136

/* Decide on the conversion type. */

137

if (local_name)

138

{

139

/* We've downloaded this URL. Convert it to relative

140

form. We do this even if the URL already is in

141

relative form, because our directory structure may

142

not be identical to that on the server (think `-nd',

143

`--cut-dirs', etc.) */

144

cur_url->convert = CO_CONVERT_TO_RELATIVE;

145

cur_url->local_name = xstrdup (local_name);

146

DEBUGP (("will convert url %s to local %s\n", u->url, local_name));

147

}

148

else

149

{

150

/* We haven't downloaded this URL. If it's not already

151

complete (including a full host name), convert it to

152

that form, so it can be reached while browsing this

153

HTML locally. */

154

if (!cur_url->link_complete_p)

155

cur_url->convert = CO_CONVERT_TO_COMPLETE;

156

cur_url->local_name = NULL;

157

DEBUGP (("will convert url %s to complete\n", u->url));

158

}

159

}

160

161

/* Convert the links in the file. */

162

convert_links (file, urls);

163

++file_count;

164

165

/* Free the data. */

166

free_urlpos (urls);

167

}

168

169

msecs = wtimer_elapsed (timer);

170

wtimer_delete (timer);

171

logprintf (LOG_VERBOSE, _("Converted %d files in %.2f seconds.\n"),

172

file_count, (double)msecs / 1000);

173

}

174

175

static void write_backup_file PARAMS ((const char *, downloaded_file_t));

176

static const char *replace_attr PARAMS ((const char *, int, FILE *,

177

const char *));

178

static const char *replace_attr_refresh_hack PARAMS ((const char *, int, FILE *,

179

const char *, int));

180

static char *local_quote_string PARAMS ((const char *));

181

static char *construct_relative PARAMS ((const char *, const char *));

182

183

/* Change the links in one HTML file. LINKS is a list of links in the

184

document, along with their positions and the desired direction of

185

the conversion. */

186

static void

187

convert_links (const char *file, struct urlpos *links)

188

{

189

struct file_memory *fm;

190

FILE *fp;

191

const char *p;

192

downloaded_file_t downloaded_file_return;

193

194

struct urlpos *link;

195

int to_url_count = 0, to_file_count = 0;

196

197

logprintf (LOG_VERBOSE, _("Converting %s... "), file);

198

199

{

200

/* First we do a "dry run": go through the list L and see whether

201

any URL needs to be converted in the first place. If not, just

202

leave the file alone. */

203

int dry_count = 0;

204

struct urlpos *dry = links;

205

for (dry = links; dry; dry = dry->next)

206

if (dry->convert != CO_NOCONVERT)

207

++dry_count;

208

if (!dry_count)

209

{

210

logputs (LOG_VERBOSE, _("nothing to do.\n"));

211

return;

212

}

213

}

214

215

fm = read_file (file);

216

if (!fm)

217

{

218

logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),

219

file, strerror (errno));

220

return;

221

}

222

223

downloaded_file_return = downloaded_file (CHECK_FOR_FILE, file);

224

if (opt.backup_converted && downloaded_file_return)

225

write_backup_file (file, downloaded_file_return);

226

227

/* Before opening the file for writing, unlink the file. This is

228

important if the data in FM is mmaped. In such case, nulling the

229

file, which is what fopen() below does, would make us read all

230

zeroes from the mmaped region. */

231

if (unlink (file) < 0 && errno != ENOENT)

232

{

233

logprintf (LOG_NOTQUIET, _("Unable to delete `%s': %s\n"),

234

file, strerror (errno));

235

read_file_free (fm);

236

return;

237

}

238

/* Now open the file for writing. */

239

fp = fopen (file, "wb");

240

if (!fp)

241

{

242

logprintf (LOG_NOTQUIET, _("Cannot convert links in %s: %s\n"),

243

file, strerror (errno));

244

read_file_free (fm);

245

return;

246

}

247

248

/* Here we loop through all the URLs in file, replacing those of

249

them that are downloaded with relative references. */

250

p = fm->content;

251

for (link = links; link; link = link->next)

252

{

253

char *url_start = fm->content + link->pos;

254

255

if (link->pos >= fm->length)

256

{

257

DEBUGP (("Something strange is going on. Please investigate."));

258

break;

259

}

260

/* If the URL is not to be converted, skip it. */

261

if (link->convert == CO_NOCONVERT)

262

{

263

DEBUGP (("Skipping %s at position %d.\n", link->url->url, link->pos));

264

continue;

265

}

266

267

/* Echo the file contents, up to the offending URL's opening

268

quote, to the outfile. */

269

fwrite (p, 1, url_start - p, fp);

270

p = url_start;

271

272

switch (link->convert)

273

{

274

case CO_CONVERT_TO_RELATIVE:

275

/* Convert absolute URL to relative. */

276

{

277

char *newname = construct_relative (file, link->local_name);

278

char *quoted_newname = local_quote_string (newname);

279

280

if (!link->link_refresh_p)

281

p = replace_attr (p, link->size, fp, quoted_newname);

282

else

283

p = replace_attr_refresh_hack (p, link->size, fp, quoted_newname,

284

link->refresh_timeout);

285

286

DEBUGP (("TO_RELATIVE: %s to %s at position %d in %s.\n",

287

link->url->url, newname, link->pos, file));

288

xfree (newname);

289

xfree (quoted_newname);

290

++to_file_count;

291

break;

292

}

293

case CO_CONVERT_TO_COMPLETE:

294

/* Convert the link to absolute URL. */

295

{

296

char *newlink = link->url->url;

297

char *quoted_newlink = html_quote_string (newlink);

298

299

if (!link->link_refresh_p)

300

p = replace_attr (p, link->size, fp, quoted_newlink);

301

else

302

p = replace_attr_refresh_hack (p, link->size, fp, quoted_newlink,

303

link->refresh_timeout);

304

305

DEBUGP (("TO_COMPLETE: <something> to %s at position %d in %s.\n",

306

newlink, link->pos, file));

307

xfree (quoted_newlink);

308

++to_url_count;

309

break;

310

}

311

case CO_NULLIFY_BASE:

312

/* Change the base href to "". */

313

p = replace_attr (p, link->size, fp, "");

314

break;

315

case CO_NOCONVERT:

316

abort ();

317

break;

318

}

319

}

320

321

/* Output the rest of the file. */

322

if (p - fm->content < fm->length)

323

fwrite (p, 1, fm->length - (p - fm->content), fp);

324

fclose (fp);

325

read_file_free (fm);

326

327

logprintf (LOG_VERBOSE, "%d-%d\n", to_file_count, to_url_count);

328

}


/* Construct and return a malloced copy of the relative link from two
   pieces of information: local name S1 of the referring file and
   local name S2 of the referred file.

   So, if S1 is "jagor.srce.hr/index.html" and S2 is
   "jagor.srce.hr/images/news.gif", the function will return
   "images/news.gif".

   Alternately, if S1 is "fly.cc.fer.hr/ioccc/index.html", and S2 is
   "fly.cc.fer.hr/images/fly.gif", the function will return
   "../images/fly.gif".

   If S2 begins with `/', a copy of S2 is returned unchanged.

   Caveats: S1 should not begin with `/', unless S2 also begins with
   '/'.  S1 should not contain things like ".." and such --
   construct_relative ("fly/ioccc/../index.html",
   "fly/images/fly.gif") will fail.  (A workaround is to call
   something like path_simplify() on S1).  */
static char *
construct_relative (const char *s1, const char *s2)
{
  int i, cnt, sepdirs1;
  char *res;

  if (*s2 == '/')
    return xstrdup (s2);
  /* S1 should *not* be absolute, if S2 wasn't.  */
  assert (*s1 != '/');
  i = cnt = 0;
  /* Skip the directories common to both strings.  CNT ends up as the
     index just past the last shared `/'.  */
  while (1)
    {
      while (s1[i] && s2[i]
             && (s1[i] == s2[i])
             && (s1[i] != '/')
             && (s2[i] != '/'))
        ++i;
      if (s1[i] == '/' && s2[i] == '/')
        cnt = ++i;
      else
        break;
    }
  /* Count the directory separators left in S1; each one needs a
     "../" to climb out of.  */
  for (sepdirs1 = 0; s1[i]; i++)
    if (s1[i] == '/')
      ++sepdirs1;
  /* Now, construct the file as of:
     - ../ repeated sepdirs1 time
     - all the non-mutual directories of S2.  */
  res = (char *)xmalloc (3 * sepdirs1 + strlen (s2 + cnt) + 1);
  for (i = 0; i < sepdirs1; i++)
    memcpy (res + 3 * i, "../", 3);
  strcpy (res + 3 * i, s2 + cnt);
  return res;
}

383

384

static void

385

write_backup_file (const char *file, downloaded_file_t downloaded_file_return)

386

{

387

/* Rather than just writing over the original .html file with the

388

converted version, save the former to *.orig. Note we only do

389

this for files we've _successfully_ downloaded, so we don't

390

clobber .orig files sitting around from previous invocations. */

391

392

/* Construct the backup filename as the original name plus ".orig". */

393

size_t filename_len = strlen(file);

394

char* filename_plus_orig_suffix;

395

boolean already_wrote_backup_file = FALSE;

396

slist* converted_file_ptr;

397

static slist* converted_files = NULL;

398

399

if (downloaded_file_return == FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED)

400

{

401

/* Just write "orig" over "html". We need to do it this way

402

because when we're checking to see if we've downloaded the

403

file before (to see if we can skip downloading it), we don't

404

know if it's a text/html file. Therefore we don't know yet

405

at that stage that -E is going to cause us to tack on

406

".html", so we need to compare vs. the original URL plus

407

".orig", not the original URL plus ".html.orig". */

408

filename_plus_orig_suffix = alloca (filename_len + 1);

409

strcpy(filename_plus_orig_suffix, file);

410

strcpy((filename_plus_orig_suffix + filename_len) - 4, "orig");

411

}

412

else /* downloaded_file_return == FILE_DOWNLOADED_NORMALLY */

413

{

414

/* Append ".orig" to the name. */

415

filename_plus_orig_suffix = alloca (filename_len + sizeof(".orig"));

416

strcpy(filename_plus_orig_suffix, file);

417

strcpy(filename_plus_orig_suffix + filename_len, ".orig");

418

}

419

420

/* We can get called twice on the same URL thanks to the

421

convert_all_links() call in main(). If we write the .orig file

422

each time in such a case, it'll end up containing the first-pass

423

conversion, not the original file. So, see if we've already been

424

called on this file. */

425

converted_file_ptr = converted_files;

426

while (converted_file_ptr != NULL)

427

if (strcmp(converted_file_ptr->string, file) == 0)

428

{

429

already_wrote_backup_file = TRUE;

430

break;

431

}

432

else

433

converted_file_ptr = converted_file_ptr->next;

434

435

if (!already_wrote_backup_file)

436

{

437

/* Rename <file> to <file>.orig before former gets written over. */

438

if (rename(file, filename_plus_orig_suffix) != 0)

439

logprintf (LOG_NOTQUIET, _("Cannot back up %s as %s: %s\n"),

440

file, filename_plus_orig_suffix, strerror (errno));

441

442

/* Remember that we've already written a .orig backup for this file.

443

Note that we never free this memory since we need it till the

444

convert_all_links() call, which is one of the last things the

445

program does before terminating. BTW, I'm not sure if it would be

446

safe to just set 'converted_file_ptr->string' to 'file' below,

447

rather than making a copy of the string... Another note is that I

448

thought I could just add a field to the urlpos structure saying

449

that we'd written a .orig file for this URL, but that didn't work,

450

so I had to make this separate list.

451

-- Dan Harkless <wget@harkless.org>

452

453

This [adding a field to the urlpos structure] didn't work

454

because convert_file() is called from convert_all_links at

455

the end of the retrieval with a freshly built new urlpos

456

list.

457

-- Hrvoje Niksic <hniksic@xemacs.org>

458

459

converted_file_ptr = xmalloc(sizeof(*converted_file_ptr));

460

converted_file_ptr->string = xstrdup(file); /* die on out-of-mem. */

461

converted_file_ptr->next = converted_files;

462

converted_files = converted_file_ptr;

463

}

464

}

465

466

static int find_fragment PARAMS ((const char *, int, const char **,

467

const char **));


/* Replace an attribute's original text with NEW_TEXT.

   P points at the start of the original attribute value (at its
   opening quote, if it has one) and SIZE is the length of that value
   including any quotes.  The replacement is written to FP, always
   quoted; if the original value contained a fragment identifier, it
   is appended after NEW_TEXT.  Returns the position in the input
   just past the original value.  */

static const char *
replace_attr (const char *p, int size, FILE *fp, const char *new_text)
{
  int quote_flag = 0;
  char quote_char = '\"';       /* use "..." for quoting, unless the
                                   original value is quoted, in which
                                   case reuse its quoting char. */
  const char *frag_beg, *frag_end;

  /* Structure of our string is:
       "...old-contents..."
       <---    size    --->  (with quotes)
     OR:
       ...old-contents...
       <---    size   -->    (no quotes)   */

  if (*p == '\"' || *p == '\'')
    {
      quote_char = *p;
      quote_flag = 1;
      ++p;
      size -= 2;                /* disregard opening and closing quote */
    }
  putc (quote_char, fp);
  fputs (new_text, fp);

  /* Look for fragment identifier, if any.  */
  if (find_fragment (p, size, &frag_beg, &frag_end))
    fwrite (frag_beg, 1, frag_end - frag_beg, fp);
  p += size;
  if (quote_flag)
    ++p;
  putc (quote_char, fp);

  return p;
}


/* The same as REPLACE_ATTR, but used when replacing the URL inside a
   refresh attribute value, because we need to prepend
   "timeout_value; URL=" before NEW_TEXT.  TIMEOUT is the number
   written in front of "; URL=".  */

static const char *
replace_attr_refresh_hack (const char *p, int size, FILE *fp,
                           const char *new_text, int timeout)
{
  /* "0; URL=..." */
  char *new_with_timeout = (char *)alloca (numdigit (timeout)
                                           + 6 /* "; URL=" */
                                           + strlen (new_text)
                                           + 1);
  sprintf (new_with_timeout, "%d; URL=%s", timeout, new_text);

  return replace_attr (p, size, fp, new_with_timeout);
}


/* Find the first occurrence of '#' in [BEG, BEG+SIZE) that is not
   immediately preceded by '&'.  If the character is not found,
   return zero.  If the character is found, return 1 and set BP and EP
   to point to the beginning and end of the region, i.e. *BP points at
   the '#' and *EP at BEG+SIZE.

   This is used for finding the fragment identifiers in URLs.  */

static int
find_fragment (const char *beg, int size, const char **bp, const char **ep)
{
  const char *end = beg + size;
  int saw_amp = 0;
  for (; beg < end; beg++)
    {
      switch (*beg)
        {
        case '&':
          saw_amp = 1;
          break;
        case '#':
          if (!saw_amp)
            {
              *bp = beg;
              *ep = end;
              return 1;
            }
          /* A '#' right after '&' is part of "&#...", not a fragment.  */
          /* fallthrough */
        default:
          saw_amp = 0;
        }
    }
  return 0;
}

559

560

/* Quote FILE for use as local reference to an HTML file.

561

562

We quote ? as %3F to avoid passing part of the file name as the

563

parameter when browsing the converted file through HTTP. However,

564

it is safe to do this only when `--html-extension' is turned on.

565

This is because converting "index.html?foo=bar" to

566

"index.html%3Ffoo=bar" would break local browsing, as the latter

567

isn't even recognized as an HTML file! However, converting

568

"index.html?foo=bar.html" to "index.html%3Ffoo=bar.html" should be

569

safe for both local and HTTP-served browsing. */

570

571

static char *

572

local_quote_string (const char *file)

573

{

574

const char *file_sans_qmark;

575

int qm;

576

577

if (!opt.html_extension)

578

return html_quote_string (file);

579

580

qm = count_char (file, '?');

581

582

if (qm)

583

{

584

const char *from = file;

585

char *to, *newname;

586

587

/* qm * 2 because we replace each question mark with "%3F",

588

i.e. replace one char with three, hence two more. */

589

int fsqlen = strlen (file) + qm * 2;

590

591

to = newname = (char *)alloca (fsqlen + 1);

592

for (; *from; from++)

593

{

594

if (*from != '?')

595

*to++ = *from;

596

else

597

{

598

*to++ = '%';

599

*to++ = '3';

600

*to++ = 'F';

601

}

602

}

603

assert (to - newname == fsqlen);

604

*to = '\0';

605

606

file_sans_qmark = newname;

607

}

608

else

609

file_sans_qmark = file;

610

611

return html_quote_string (file_sans_qmark);

612

}


/* Book-keeping code for dl_file_url_map, dl_url_file_map,
   downloaded_html_list, and downloaded_html_set.  Other code calls
   these functions to let us know that a file has been downloaded.  */

/* Lazily create the two URL/file hash tables on first use.  */
#define ENSURE_TABLES_EXIST do {                        \
  if (!dl_file_url_map)                                 \
    dl_file_url_map = make_string_hash_table (0);       \
  if (!dl_url_file_map)                                 \
    dl_url_file_map = make_string_hash_table (0);       \
} while (0)


/* Return 1 if S1 and S2 are the same, except for "/index.html".  The
   cases in which it returns one are (substitute any non-empty
   substring for "foo"):

   m("foo/index.html", "foo/")  ==> 1
   m("foo/", "foo/index.html")  ==> 1
   m("foo", "foo/index.html")   ==> 1
   m("foo", "foo/")             ==> 1
   m("foo", "foo")              ==> 1

   Strings that share no common prefix at all yield 0.  */

static int
match_except_index (const char *s1, const char *s2)
{
  int i;
  const char *lng;

  /* Skip common substring.  */
  for (i = 0; *s1 && *s2 && *s1 == *s2; s1++, s2++, i++)
    ;
  if (i == 0)
    /* Strings differ at the very beginning -- bail out.  We need to
       check this explicitly to avoid `lng - 1' reading outside the
       array.  */
    return 0;

  if (!*s1 && !*s2)
    /* Both strings hit EOF -- strings are equal.  */
    return 1;
  else if (*s1 && *s2)
    /* Strings are randomly different, e.g. "/foo/bar" and "/foo/qux".  */
    return 0;
  else if (*s1)
    /* S1 is the longer one.  */
    lng = s1;
  else
    /* S2 is the longer one.  */
    lng = s2;

  /* foo            */            /* foo/           */
  /* foo/index.html */  /* or */  /* foo/index.html */
  /*    ^           */            /*     ^          */

  if (*lng != '/')
    /* The right-hand case: step back onto the shared `/'.  */
    --lng;

  if (*lng == '/' && *(lng + 1) == '\0')
    /* foo  */
    /* foo/ */
    return 1;

  return 0 == strcmp (lng, "/index.html");
}

678

679

static int

680

dissociate_urls_from_file_mapper (void *key, void *value, void *arg)

681

{

682

char *mapping_url = (char *)key;

683

char *mapping_file = (char *)value;

684

char *file = (char *)arg;

685

686

if (0 == strcmp (mapping_file, file))

687

{

688

hash_table_remove (dl_url_file_map, mapping_url);

689

xfree (mapping_url);

690

xfree (mapping_file);

691

}

692

693

/* Continue mapping. */

694

return 0;

695

}

696

697

/* Remove all associations from various URLs to FILE from dl_url_file_map. */

698

699

static void

700

dissociate_urls_from_file (const char *file)

701

{

702

hash_table_map (dl_url_file_map, dissociate_urls_from_file_mapper,

703

(char *)file);

704

}

705

706

/* Register that URL has been successfully downloaded to FILE. This

707

is used by the link conversion code to convert references to URLs

708

to references to local files. It is also being used to check if a

709

URL has already been downloaded. */

710

711

void

712

register_download (const char *url, const char *file)

713

{

714

char *old_file, *old_url;

715

716

ENSURE_TABLES_EXIST;

717

718

/* With some forms of retrieval, it is possible, although not likely

719

or particularly desirable. If both are downloaded, the second

720

download will override the first one. When that happens,

721

dissociate the old file name from the URL. */

722

723

if (hash_table_get_pair (dl_file_url_map, file, &old_file, &old_url))

724

{

725

if (0 == strcmp (url, old_url))

726

/* We have somehow managed to download the same URL twice.

727

Nothing to do. */

728

return;

729

730

if (match_except_index (url, old_url)

731

&& !hash_table_contains (dl_url_file_map, url))

732

/* The two URLs differ only in the "index.html" ending. For

733

example, one is "http://www.server.com/", and the other is

734

"http://www.server.com/index.html". Don't remove the old

735

one, just add the new one as a non-canonical entry. */

736

goto url_only;

737

738

hash_table_remove (dl_file_url_map, file);

739

xfree (old_file);

740

xfree (old_url);

741

742

/* Remove all the URLs that point to this file. Yes, there can

743

be more than one such URL, because we store redirections as

744

multiple entries in dl_url_file_map. For example, if URL1

745

redirects to URL2 which gets downloaded to FILE, we map both

746

URL1 and URL2 to FILE in dl_url_file_map. (dl_file_url_map

747

only points to URL2.) When another URL gets loaded to FILE,

748

we want both URL1 and URL2 dissociated from it.

749

750

This is a relatively expensive operation because it performs

751

a linear search of the whole hash table, but it should be

752

called very rarely, only when two URLs resolve to the same

753

file name, *and* the "<file>.1" extensions are turned off.

754

In other words, almost never. */

755

dissociate_urls_from_file (file);

756

}

757

758

hash_table_put (dl_file_url_map, xstrdup (file), xstrdup (url));

759

760

url_only:

761

/* A URL->FILE mapping is not possible without a FILE->URL mapping.

762

If the latter were present, it should have been removed by the

763

above `if'. So we could write:

764

765

assert (!hash_table_contains (dl_url_file_map, url));

766

767

The above is correct when running in recursive mode where the

768

same URL always resolves to the same file. But if you do

769

something like:

770

771

wget URL URL

772

773

then the first URL will resolve to "FILE", and the other to

774

"FILE.1". In that case, FILE.1 will not be found in

775

dl_file_url_map, but URL will still point to FILE in

776

dl_url_file_map. */

777

if (hash_table_get_pair (dl_url_file_map, url, &old_url, &old_file))

778

{

779

hash_table_remove (dl_url_file_map, url);

780

xfree (old_url);

781

xfree (old_file);

782

}

783

784

hash_table_put (dl_url_file_map, xstrdup (url), xstrdup (file));

785

}

786

787

/* Register that FROM has been redirected to TO. This assumes that TO

788

is successfully downloaded and already registered using

789

register_download() above. */

790

791

void

792

register_redirection (const char *from, const char *to)

793

{

794

char *file;

795

796

ENSURE_TABLES_EXIST;

797

798

file = hash_table_get (dl_url_file_map, to);

799

assert (file != NULL);

800

if (!hash_table_contains (dl_url_file_map, from))

801

hash_table_put (dl_url_file_map, xstrdup (from), xstrdup (file));

802

}

803

804

/* Register that the file has been deleted. */

805

806

void

807

register_delete_file (const char *file)

808

{

809

char *old_url, *old_file;

810

811

ENSURE_TABLES_EXIST;

812

813

if (!hash_table_get_pair (dl_file_url_map, file, &old_file, &old_url))

814

return;

815

816

hash_table_remove (dl_file_url_map, file);

817

xfree (old_file);

818

xfree (old_url);

819

dissociate_urls_from_file (file);

820

}

821

822

/* Register that FILE is an HTML file that has been downloaded. */

823

824

void

825

register_html (const char *url, const char *file)

826

{

827

if (!downloaded_html_set)

828

downloaded_html_set = make_string_hash_table (0);

829

else if (hash_table_contains (downloaded_html_set, file))

830

return;

831

832

/* The set and the list should use the same copy of FILE, but the

833

slist interface insists on strduping the string it gets. Oh

834

well. */

835

string_set_add (downloaded_html_set, file);

836

downloaded_html_list = slist_prepend (downloaded_html_list, file);

837

}

838

839

/* Cleanup the data structures associated with recursive retrieving

840

(the variables above). */

841

void

842

convert_cleanup (void)

843

{

844

if (dl_file_url_map)

845

{

846

free_keys_and_values (dl_file_url_map);

847

hash_table_destroy (dl_file_url_map);

848

dl_file_url_map = NULL;

849

}

850

if (dl_url_file_map)

851

{

852

free_keys_and_values (dl_url_file_map);

853

hash_table_destroy (dl_url_file_map);

854

dl_url_file_map = NULL;

855

}

856

if (downloaded_html_set)

857

string_set_free (downloaded_html_set);

858

slist_free (downloaded_html_list);

859

downloaded_html_list = NULL;

860

}


/* Book-keeping code for downloaded files that enables extension
   hacks.  */

/* This table should really be merged with dl_file_url_map and
   downloaded_html_list.  This was originally a list, but I changed
   it to a hash table because it was actually taking a lot of time to
   find things in it.  */

static struct hash_table *downloaded_files_hash;

871

872

/* We're storing "modes" of type downloaded_file_t in the hash table.

873

However, our hash tables only accept pointers for keys and values.

874

So when we need a pointer, we use the address of a

875

downloaded_file_t variable of static storage. */

876

877

static downloaded_file_t *

878

downloaded_mode_to_ptr (downloaded_file_t mode)

879

{

880

static downloaded_file_t

881

v1 = FILE_NOT_ALREADY_DOWNLOADED,

882

v2 = FILE_DOWNLOADED_NORMALLY,

883

v3 = FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED,

884

v4 = CHECK_FOR_FILE;

885

886

switch (mode)

887

{

888

case FILE_NOT_ALREADY_DOWNLOADED:

889

return &v1;

890

case FILE_DOWNLOADED_NORMALLY:

891

return &v2;

892

case FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED:

893

return &v3;

894

case CHECK_FOR_FILE:

895

return &v4;

896

}

897

return NULL;

898

}

899

900

/* Remembers which files have been downloaded. In the standard case,

901

should be called with mode == FILE_DOWNLOADED_NORMALLY for each

902

file we actually download successfully (i.e. not for ones we have

903

failures on or that we skip due to -N).

904

905

When we've downloaded a file and tacked on a ".html" extension due

906

to -E, call this function with

907

FILE_DOWNLOADED_AND_HTML_EXTENSION_ADDED rather than

908

FILE_DOWNLOADED_NORMALLY.

909

910

If you just want to check if a file has been previously added

911

without adding it, call with mode == CHECK_FOR_FILE. Please be

912

sure to call this function with local filenames, not remote

913

URLs. */

914

915

downloaded_file_t

916

downloaded_file (downloaded_file_t mode, const char *file)

917

{

918

downloaded_file_t *ptr;

919

920

if (mode == CHECK_FOR_FILE)

921

{

922

if (!downloaded_files_hash)

923

return FILE_NOT_ALREADY_DOWNLOADED;

924

ptr = hash_table_get (downloaded_files_hash, file);

925

if (!ptr)

926

return FILE_NOT_ALREADY_DOWNLOADED;

927

return *ptr;

928

}

929

930

if (!downloaded_files_hash)

931

downloaded_files_hash = make_string_hash_table (0);

932

933

ptr = hash_table_get (downloaded_files_hash, file);

934

if (ptr)

935

return *ptr;

936

937

ptr = downloaded_mode_to_ptr (mode);

938

hash_table_put (downloaded_files_hash, xstrdup (file), &ptr);

939

940

return FILE_NOT_ALREADY_DOWNLOADED;

941

}


/* hash_table_map() callback for downloaded_files_hash: frees KEY.
   VALUE is not freed because it points at static storage handed out
   by downloaded_mode_to_ptr().  Returns 0 to continue mapping.  */
static int
df_free_mapper (void *key, void *value, void *ignored)
{
  xfree (key);
  return 0;
}

949

950

void

951

downloaded_files_free (void)

952

{

953

if (downloaded_files_hash)

954

{

955

hash_table_map (downloaded_files_hash, df_free_mapper, NULL);

956

hash_table_destroy (downloaded_files_hash);

957

downloaded_files_hash = NULL;

958

}

959

}

Older »