/* Handling of recursive HTTP retrieving.
   Copyright (C) 1995, 1996, 1997, 2000, 2001 Free Software Foundation, Inc.

This file is part of GNU Wget.

GNU Wget is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

GNU Wget is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Wget; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

In addition, as a special exception, the Free Software Foundation
gives permission to link the code of its release of Wget with the
OpenSSL project's "OpenSSL" library (or with modified versions of it
that use the same license as the "OpenSSL" library), and distribute
the linked executables.  You must obey the GNU General Public License
in all respects for all of the code used other than "OpenSSL".  If you
modify this file, you may extend this exception to your version of the
file, but you are not obligated to do so.  If you do not wish to do
so, delete this exception statement from your version. */
#ifdef HAVE_STRING_H
# include <string.h>
#endif /* HAVE_STRING_H */
#ifdef HAVE_UNISTD_H
# include <unistd.h>
#endif /* HAVE_UNISTD_H */

#include <sys/types.h>
extern char *version_string;
extern LARGE_INT total_downloaded_bytes;

extern struct hash_table *dl_url_file_map;
extern struct hash_table *downloaded_html_set;
/* Functions for maintaining the URL queue.  */

struct queue_element {
  const char *url;              /* the URL to download */
  const char *referer;          /* the referring document */
  int depth;                    /* the depth */
  unsigned int html_allowed :1; /* whether the document is allowed to
                                   be treated as HTML. */

  struct queue_element *next;   /* next element in queue */
};

struct url_queue {
  struct queue_element *head;
  struct queue_element *tail;
  int count, maxcount;
};
/* Create a URL queue. */

static struct url_queue *
  struct url_queue *queue = xmalloc (sizeof (*queue));
  memset (queue, '\0', sizeof (*queue));

/* Delete a URL queue. */

url_queue_delete (struct url_queue *queue)

/* Enqueue a URL in the queue.  The queue is FIFO: the items will be
   retrieved ("dequeued") from the queue in the order they were placed
   into it.  */

url_enqueue (struct url_queue *queue,
             const char *url, const char *referer, int depth, int html_allowed)
  struct queue_element *qel = xmalloc (sizeof (*qel));
  qel->referer = referer;
  qel->html_allowed = html_allowed;

  if (queue->count > queue->maxcount)
    queue->maxcount = queue->count;

  DEBUGP (("Enqueuing %s at depth %d\n", url, depth));
  DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));

  queue->tail->next = qel;

  queue->head = queue->tail;
/* Take a URL out of the queue.  Return 1 if this operation succeeded,
   or 0 if the queue is empty. */

url_dequeue (struct url_queue *queue,
             const char **url, const char **referer, int *depth,
  struct queue_element *qel = queue->head;

  queue->head = queue->head->next;

  *referer = qel->referer;
  *html_allowed = qel->html_allowed;

  DEBUGP (("Dequeuing %s at depth %d\n", qel->url, qel->depth));
  DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));
static int download_child_p PARAMS ((const struct urlpos *, struct url *, int,
                                     struct url *, struct hash_table *));
static int descend_redirect_p PARAMS ((const char *, const char *, int,
                                       struct url *, struct hash_table *));
/* Retrieve a part of the web beginning with START_URL.  This used to
   be called "recursive retrieval", because the old function was
   recursive and implemented depth-first search.  retrieve_tree on the
   other hand implements breadth-first traversal of the tree, which
   results in much nicer ordering of downloads.

   The algorithm this function uses is simple:

   1. put START_URL in the queue.
   2. while there are URLs in the queue:

     3. get next URL from the queue.
     4. download it.
     5. if the URL is HTML and its depth does not exceed maximum depth,
        get the list of URLs embedded therein.
     6. for each of those URLs do the following:

       7. if the URL is not one of those downloaded before, and if it
          satisfies the criteria specified by the various command-line
          options, add it to the queue.

   A condensed, illustrative sketch of this loop follows below.  */
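/* Illustrative sketch, not part of the original source: a condensed
   rendering of the numbered steps above.  The helpers process_download()
   and should_enqueue_child() are hypothetical stand-ins for the detailed
   logic in retrieve_tree and download_child_p below; everything else
   (url_queue_new, url_enqueue, url_dequeue, url_queue_delete,
   get_urls_html, free_urlpos, xstrdup, opt.reclevel) comes from this
   file and its headers.  Kept under "#if 0" so it is never compiled.  */
#if 0
static void
breadth_first_outline (const char *start_url)
{
  struct url_queue *queue = url_queue_new ();
  char *url, *referer;
  int depth, html_allowed;

  url_enqueue (queue, xstrdup (start_url), NULL, 0, 1);      /* step 1 */
  while (url_dequeue (queue,                                 /* steps 2-3 */
                      (const char **)&url, (const char **)&referer,
                      &depth, &html_allowed))
    {
      char *file = process_download (url, referer);          /* step 4 (hypothetical helper) */
      if (html_allowed && file && depth < opt.reclevel)      /* step 5 */
        {
          int meta_disallow_follow = 0;
          struct urlpos *child, *children
            = get_urls_html (file, url, &meta_disallow_follow);
          for (child = children; child; child = child->next) /* step 6 */
            if (should_enqueue_child (child, url, depth))    /* step 7 (hypothetical helper) */
              url_enqueue (queue, xstrdup (child->url->url),
                           xstrdup (url), depth + 1,
                           child->link_expect_html);
          free_urlpos (children);
        }
      xfree (url);
      FREE_MAYBE (referer);
    }
  url_queue_delete (queue);
}
#endif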
retrieve_tree (const char *start_url)
  uerr_t status = RETROK;

  /* The queue of URLs we need to load. */
  struct url_queue *queue;

  /* The URLs we do not wish to enqueue, because they are already in
     the queue, but haven't been downloaded yet. */
  struct hash_table *blacklist;

  struct url *start_url_parsed = url_parse (start_url, &up_error_code);

  if (!start_url_parsed)
      logprintf (LOG_NOTQUIET, "%s: %s.\n", start_url,
                 url_error (up_error_code));

  queue = url_queue_new ();
  blacklist = make_string_hash_table (0);

  /* Enqueue the starting URL.  Use start_url_parsed->url rather than
     just URL so we enqueue the canonical form of the URL. */
  url_enqueue (queue, xstrdup (start_url_parsed->url), NULL, 0, 1);
  string_set_add (blacklist, start_url_parsed->url);
  char *url, *referer, *file = NULL;
  int depth, html_allowed;
  boolean dash_p_leaf_HTML = FALSE;

  if (opt.quota && total_downloaded_bytes > opt.quota)
  if (status == FWRITEERR)

  /* Get the next URL from the queue... */

  if (!url_dequeue (queue,
                    (const char **)&url, (const char **)&referer,
                    &depth, &html_allowed))
  /* ...and download it.  Note that this download is in most cases
     unconditional, as download_child_p already makes sure a file
     doesn't get enqueued twice -- and yet this check is here, and
     not in download_child_p.  This is so that if you run `wget -r
     URL1 URL2', and a random URL is encountered once under URL1
     and again under URL2, but at a different (possibly smaller)
     depth, we want the URL's children to be taken into account
     the second time.  */

  if (dl_url_file_map && hash_table_contains (dl_url_file_map, url))
      file = xstrdup (hash_table_get (dl_url_file_map, url));

      DEBUGP (("Already downloaded \"%s\", reusing it from \"%s\".\n",

      && downloaded_html_set
      && string_set_contains (downloaded_html_set, file))
  char *redirected = NULL;
  int oldrec = opt.recursive;

  status = retrieve_url (url, &file, &redirected, referer, &dt);
  opt.recursive = oldrec;

  if (html_allowed && file && status == RETROK
      && (dt & RETROKF) && (dt & TEXTHTML))

  /* We have been redirected, possibly to another host, or
     different path, or wherever.  Check whether we really
     want to follow it. */

  if (!descend_redirect_p (redirected, url, depth,
                           start_url_parsed, blacklist))

  /* Make sure that the old pre-redirect form gets
     blacklisted. */
  string_set_add (blacklist, url);
      && depth >= opt.reclevel && opt.reclevel != INFINITE_RECURSION)

  if (opt.page_requisites
      && (depth == opt.reclevel || depth == opt.reclevel + 1))

    /* When -p is specified, we are allowed to exceed the
       maximum depth, but only for the "inline" links,
       i.e. those that are needed to display the page.
       Originally this could exceed the depth at most by
       one, but we allow one more level so that the leaf
       pages that contain frames can be loaded
       correctly. */
    dash_p_leaf_HTML = TRUE;

  /* Either -p wasn't specified or it was and we've
     already spent the two extra (pseudo-)levels that it
     affords us, so we need to bail out. */
  DEBUGP (("Not descending further; at depth %d, max. %d.\n",
           depth, opt.reclevel));
  /* If the downloaded document was HTML, parse it and enqueue the
     links it contains. */

  int meta_disallow_follow = 0;
  struct urlpos *children
    = get_urls_html (file, url, &meta_disallow_follow);

  if (opt.use_robots && meta_disallow_follow)
    free_urlpos (children);

  struct urlpos *child = children;
  struct url *url_parsed = url_parse (url, NULL);
  assert (url_parsed != NULL);

  for (; child; child = child->next)
      if (child->ignore_when_downloading)
      if (dash_p_leaf_HTML && !child->link_inline_p)
      if (download_child_p (child, url_parsed, depth, start_url_parsed,
          url_enqueue (queue, xstrdup (child->url->url),
                       xstrdup (url), depth + 1,
                       child->link_expect_html);
          /* We blacklist the URL we have enqueued, because we
             don't want to enqueue (and hence download) the
             same URL twice. */
          string_set_add (blacklist, child->url->url);

  url_free (url_parsed);
  free_urlpos (children);
  if (opt.delete_after || (file && !acceptable (file)))
      /* Either --delete-after was specified, or we loaded this
         otherwise rejected (e.g. by -R) HTML file just so we
         could harvest its hyperlinks -- in either case, delete
         the local file. */
      DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
               opt.delete_after ? "--delete-after" :
               "recursive rejection criteria"));
      logprintf (LOG_VERBOSE,
                 ? _("Removing %s.\n")
                 : _("Removing %s since it should be rejected.\n")),
      logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
      register_delete_file (file);

  FREE_MAYBE (referer);
  /* If anything is left of the queue due to a premature exit, free it
     now. */
  while (url_dequeue (queue,
                      (const char **)&d1, (const char **)&d2, &d3, &d4))

  url_queue_delete (queue);

  if (start_url_parsed)
    url_free (start_url_parsed);
  string_set_free (blacklist);

  if (opt.quota && total_downloaded_bytes > opt.quota)
  else if (status == FWRITEERR)
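/* Illustrative sketch, not part of the original source: the "already
   downloaded" lookup performed near the top of the loop in
   retrieve_tree above.  A URL fetched earlier in this run is mapped
   back to its local file through dl_url_file_map, and
   downloaded_html_set records whether that file was HTML, so the file
   can be re-parsed for links without being fetched again.  Only
   functions visible in this file are used; kept under "#if 0".  */
#if 0
static char *
example_reuse_previous_download (const char *url, int *is_html)
{
  char *file = NULL;

  *is_html = 0;
  if (dl_url_file_map && hash_table_contains (dl_url_file_map, url))
    {
      file = xstrdup (hash_table_get (dl_url_file_map, url));
      if (downloaded_html_set
          && string_set_contains (downloaded_html_set, file))
        *is_html = 1;
    }
  return file;
}
#endif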
/* Based on the context provided by retrieve_tree, decide whether a
   URL is to be descended to.  This is only ever called from
   retrieve_tree, but is in a separate function for clarity.

   The most expensive checks (such as those for robots) are memoized
   by storing these URLs to BLACKLIST.  This may or may not help.  It
   will help if those URLs are encountered many times.  */

download_child_p (const struct urlpos *upos, struct url *parent, int depth,
                  struct url *start_url_parsed, struct hash_table *blacklist)
  struct url *u = upos->url;
  const char *url = u->url;
  int u_scheme_like_http;

  DEBUGP (("Deciding whether to enqueue \"%s\".\n", url));

  if (string_set_contains (blacklist, url))
      DEBUGP (("Already on the black list.\n"));
  /* Several things to check for:
     1. if scheme is not http, and we don't load it
     2. check for relative links (if relative_only is set)
     3. check for domain
     4. check for no-parent
     5. check for excludes && includes
     6. check for suffix
     7. check for same host (if spanhost is unset), with possible
     gethostbyname baggage
     8. check for robots.txt

     Addendum: If the URL is FTP, and it is to be loaded, only the
     domain and suffix settings are "stronger".

     Note that .html files will get loaded regardless of suffix rules
     (but that is remedied later with unlink) unless the depth equals
     the maximum depth.

     More time- and memory- consuming tests should be put later on
     the list.  */
  /* Determine whether URL under consideration has a HTTP-like scheme. */
  u_scheme_like_http = schemes_are_similar_p (u->scheme, SCHEME_HTTP);

  /* 1. Schemes other than HTTP are normally not recursed into. */
  if (!u_scheme_like_http && !(u->scheme == SCHEME_FTP && opt.follow_ftp))
      DEBUGP (("Not following non-HTTP schemes.\n"));

  /* 2. If it is an absolute link and they are not followed, throw it
     out. */
  if (u_scheme_like_http)
    if (opt.relative_only && !upos->link_relative_p)
        DEBUGP (("It doesn't really look like a relative link.\n"));

  /* 3. If its domain is not to be accepted/looked-up, chuck it
     out. */
  if (!accept_domain (u))
      DEBUGP (("The domain was not accepted.\n"));

  /* 4. Check for parent directory.

     If we descended to a different host or changed the scheme, ignore
     opt.no_parent.  Also ignore it for documents needed to display
     the parent page when in -p mode. */
      && schemes_are_similar_p (u->scheme, start_url_parsed->scheme)
      && 0 == strcasecmp (u->host, start_url_parsed->host)
      && u->port == start_url_parsed->port
      && !(opt.page_requisites && upos->link_inline_p))
    if (!frontcmp (start_url_parsed->dir, u->dir))
        DEBUGP (("Going to \"%s\" would escape \"%s\" with no_parent on.\n",
                 u->dir, start_url_parsed->dir));
  /* 5. If the file does not match the acceptance list, or is on the
     rejection list, chuck it out.  The same goes for the directory
     exclusion and inclusion lists. */
  if (opt.includes || opt.excludes)
    if (!accdir (u->dir, ALLABS))
        DEBUGP (("%s (%s) is excluded/not-included.\n", url, u->dir));

  /* 6. Check for acceptance/rejection rules.  We ignore these rules
     for directories (no file name to match) and for HTML documents,
     which might lead to other files that do need to be downloaded.
     That is, unless we've exhausted the recursion depth anyway. */
  if (u->file[0] != '\0'
      && !(has_html_suffix_p (u->file)
           && depth != INFINITE_RECURSION
           && depth < opt.reclevel - 1))
    if (!acceptable (u->file))
        DEBUGP (("%s (%s) does not match acc/rej rules.\n",
  if (schemes_are_similar_p (u->scheme, parent->scheme))
    if (!opt.spanhost && 0 != strcasecmp (parent->host, u->host))
        DEBUGP (("This is not the same hostname as the parent's (%s and %s).\n",
                 u->host, parent->host));

  if (opt.use_robots && u_scheme_like_http)
      struct robot_specs *specs = res_get_specs (u->host, u->port);

      if (res_retrieve_file (url, &rfile))
          specs = res_parse_from_file (rfile);

          /* If we cannot get real specs, at least produce
             dummy ones so that we can register them and stop
             trying to retrieve them. */
          specs = res_parse ("", 0);

      res_register_specs (u->host, u->port, specs);

      /* Now that we have (or don't have) robots.txt specs, we can
         check what they say. */
      if (!res_match_path (specs, u->path))
          DEBUGP (("Not following %s because robots.txt forbids it.\n", url));
          string_set_add (blacklist, url);
  /* The URL has passed all the tests.  It can be placed in the
     download queue. */
  DEBUGP (("Decided to load it.\n"));

  DEBUGP (("Decided NOT to load it.\n"));
/* This function determines whether we will consider downloading the
   children of a URL whose download resulted in a redirection,
   possibly to another host, etc.  It is needed very rarely, and thus
   it is merely a simple-minded wrapper around download_child_p. */

descend_redirect_p (const char *redirected, const char *original, int depth,
                    struct url *start_url_parsed, struct hash_table *blacklist)
  struct url *orig_parsed, *new_parsed;

  orig_parsed = url_parse (original, NULL);
  assert (orig_parsed != NULL);

  new_parsed = url_parse (redirected, NULL);
  assert (new_parsed != NULL);

  upos = xmalloc (sizeof (struct urlpos));
  memset (upos, 0, sizeof (*upos));
  upos->url = new_parsed;

  success = download_child_p (upos, orig_parsed, depth,
                              start_url_parsed, blacklist);

  url_free (orig_parsed);
  url_free (new_parsed);

  DEBUGP (("Redirection \"%s\" failed the test.\n", redirected));