~ubuntu-branches/ubuntu/lucid/wget/lucid-security

Committer: Bazaar Package Importer
Author(s): Michael Vogt
Date: 2008-05-27 11:49:54 UTC
mfrom: (1.1.3 upstream)
Revision ID: james.westby@ubuntu.com-20080527114954-ame070pjhqtofeaf

Tags: 1.11.2-1ubuntu1

* Merge from debian unstable, remaining changes:
- Add wget-udeb to ship wget.gnu as alternative to busybox wget
implementation.

files added:
DISTFILES

autom4te.cache

doc/texi2pod.pl

doc/wget.1

doc/wget.pod

msdos

msdos/ChangeLog

msdos/Makefile.DJ

msdos/Makefile.WC

msdos/config.h

po/be.po

po/id.po

po/nb.po

po/pt.po

src/gnutls.c

src/http.h

src/spider.c

src/spider.h

src/test.c

src/test.h

tests

tests/ChangeLog

tests/FTPServer.pm

tests/FTPTest.pm

tests/HTTPServer.pm

tests/HTTPTest.pm

tests/Makefile.in

tests/README

tests/Test--no-content-disposition-trivial.px

tests/Test--no-content-disposition.px

tests/Test--spider--no-content-disposition-trivial.px

tests/Test--spider--no-content-disposition.px

tests/Test--spider-HTTP-Content-Disposition.px

tests/Test--spider-fail.px

tests/Test--spider-r--no-content-disposition-trivial.px

tests/Test--spider-r--no-content-disposition.px

tests/Test--spider-r-HTTP-Content-Disposition.px

tests/Test--spider-r.px

tests/Test--spider.px

tests/Test-E-k-K.px

tests/Test-E-k.px

tests/Test-HTTP-Content-Disposition-1.px

tests/Test-HTTP-Content-Disposition-2.px

tests/Test-HTTP-Content-Disposition.px

tests/Test-N--no-content-disposition-trivial.px

tests/Test-N--no-content-disposition.px

tests/Test-N-HTTP-Content-Disposition.px

tests/Test-N-current.px

tests/Test-N-old.px

tests/Test-N.px

tests/Test-O--no-content-disposition-trivial.px

tests/Test-O--no-content-disposition.px

tests/Test-O-HTTP-Content-Disposition.px

tests/Test-O-nonexisting.px

tests/Test-O.px

tests/Test-Restrict-Lowercase.px

tests/Test-Restrict-Uppercase.px

tests/Test-auth-basic.px

tests/Test-c-full.px

tests/Test-c-partial.px

tests/Test-c.px

tests/Test-ftp.px

tests/Test-nonexisting-quiet.px

tests/Test-noop.px

tests/Test-np.px

tests/WgetTest.pm.in

tests/certs

windows/config-compiler.h

windows/config.h

files removed:
PATCHES

README.checkout

TODO

configure.bat.in

debian/patches/wget-de.po-spelling-correction

doc/ansi2knr.1

doc/gpl.texi

doc/sample.wgetrc.munged_for_texi_inclusion

doc/texi2pod.pl.in

po/no.po

src/ansi2knr.c

util/dist-wget

util/download-netscape.html

util/download.html

util/update_po_files.sh

util/wget.spec

windows/Makefile.watcom

windows/config.h.bor

windows/config.h.mingw

windows/config.h.ms

files modified:
AUTHORS

COPYING

ChangeLog

ChangeLog.README

INSTALL

MAILING-LIST

Makefile.in

NEWS

README

autogen.sh

config.guess

config.sub

configure

configure.bat *

configure.in

debian/changelog

debian/compat

debian/control

debian/copyright

debian/docs

debian/patches/00list

debian/rules

doc/ChangeLog

doc/Makefile.in

doc/fdl.texi

doc/texinfo.tex

doc/version.texi

doc/wget.info

doc/wget.texi

m4/lib-ld.m4

m4/lib-link.m4

m4/lib-prefix.m4

m4/wget.m4

po/POTFILES.in

po/bg.po

po/ca.po

po/cs.po

po/da.po

po/de.po

po/el.po

po/en_GB.po

po/eo.po

po/es.po

po/et.po

po/eu.po

po/fi.po

po/fr.po

po/ga.po

po/gl.po

po/he.po

po/hr.po

po/hu.po

po/it.po

po/ja.po

po/nl.po

po/pl.po

po/pt_BR.po

po/ro.po

po/ru.po

po/sk.po

po/sl.po

po/sr.po

po/sv.po

po/tr.po

po/uk.po

po/vi.po

po/zh_CN.po

po/zh_TW.po

src/ChangeLog

src/Makefile.in

src/alloca.c

src/cmpt.c

src/config-post.h

src/config.h.in

src/connect.c

src/connect.h

src/convert.c

src/convert.h

src/cookies.c

src/cookies.h

src/ftp-basic.c

src/ftp-ls.c

src/ftp-opie.c

src/ftp.c

src/ftp.h

src/gen-md5.c

src/gen-md5.h

src/getopt.c

src/getopt.h

src/gnu-md5.c

src/gnu-md5.h

src/hash.c

src/hash.h

src/host.c

src/host.h

src/html-parse.c

src/html-parse.h

src/html-url.c

src/http-ntlm.c

src/http-ntlm.h

src/http.c

src/init.c

src/init.h

src/log.c

src/log.h

src/main.c

src/mswindows.c

src/mswindows.h

src/netrc.c

src/netrc.h

src/openssl.c

src/options.h

src/progress.c

src/progress.h

src/ptimer.c

src/ptimer.h

src/recur.c

src/recur.h

src/res.c

src/res.h

src/retr.c

src/retr.h

src/safe-ctype.c

src/safe-ctype.h

src/snprintf.c

src/ssl.h

src/sysdep.h

src/url.c

src/url.h

src/utils.c

src/utils.h

src/version.c

src/wget.h

src/xmalloc.c

src/xmalloc.h

util/Makefile.in

util/README

util/rmold.pl

windows/ChangeLog

windows/Makefile.doc

windows/Makefile.in

windows/Makefile.src

windows/Makefile.src.bor

windows/Makefile.src.mingw

windows/Makefile.top

windows/Makefile.top.bor

windows/Makefile.top.mingw

windows/README

Show diffs side-by-side

added added

removed removed

src/html-url.c

/* Collect URLs from HTML source.

2007, 2008 Free Software Foundation, Inc.

This file is part of GNU Wget.

GNU Wget is free software; you can redistribute it and/or modify

it under the terms of the GNU General Public License as published by

the Free Software Foundation; either version 2 of the License, or

the Free Software Foundation; either version 3 of the License, or

(at your option) any later version.

GNU Wget is distributed in the hope that it will be useful,

GNU General Public License for more details.

You should have received a copy of the GNU General Public License

along with Wget; if not, write to the Free Software

Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

In addition, as a special exception, the Free Software Foundation

gives permission to link the code of its release of Wget with the

OpenSSL project's "OpenSSL" library (or with modified versions of it

that use the same license as the "OpenSSL" library), and distribute

the linked executables. You must obey the GNU General Public License

in all respects for all of the code used other than "OpenSSL". If you

modify this file, you may extend this exception to your version of the

file, but you are not obligated to do so. If you do not wish to do

so, delete this exception statement from your version. */

along with Wget. If not, see <http://www.gnu.org/licenses/>.

Additional permission under GNU GPL version 3 section 7

If you modify this program, or any covered work, by linking or

combining it with the OpenSSL project's OpenSSL library (or a

modified version of that library), containing parts covered by the

terms of the OpenSSL or SSLeay licenses, the Free Software Foundation

grants you additional permission to convey the resulting work.

Corresponding Source for a non-source form of such a combination

shall include the source code for the parts of OpenSSL used as well

as that of the covered work. */

#include <config.h>

#include <stdio.h>

#ifdef HAVE_STRING_H

# include <string.h>

#else

# include <strings.h>

#endif

#include <string.h>

#include <stdlib.h>

#include <errno.h>

#include <assert.h>

#include "utils.h"

#include "hash.h"

#include "convert.h"

#include "recur.h" /* declaration of get_urls_html */

#ifndef errno

extern int errno;

#endif

#include "recur.h" /* declaration of get_urls_html */

struct map_context;

typedef void (*tag_handler_t) PARAMS ((int, struct taginfo *,

struct map_context *));

typedef void (*tag_handler_t) (int, struct taginfo *, struct map_context *);

#define DECLARE_TAG_HANDLER(fun) \

static void fun PARAMS ((int, struct taginfo *, struct map_context *))

#define DECLARE_TAG_HANDLER(fun) \

static void fun (int, struct taginfo *, struct map_context *)

DECLARE_TAG_HANDLER (tag_find_urls);

DECLARE_TAG_HANDLER (tag_handle_base);

const char *name;

tag_handler_t handler;

} known_tags[] = {

100

{ TAG_A, "a", tag_find_urls },

101

{ TAG_APPLET, "applet", tag_find_urls },

102

{ TAG_AREA, "area", tag_find_urls },

103

{ TAG_BASE, "base", tag_handle_base },

104

{ TAG_BGSOUND, "bgsound", tag_find_urls },

105

{ TAG_BODY, "body", tag_find_urls },

106

{ TAG_EMBED, "embed", tag_find_urls },

107

{ TAG_FIG, "fig", tag_find_urls },

108

{ TAG_FORM, "form", tag_handle_form },

109

{ TAG_FRAME, "frame", tag_find_urls },

110

{ TAG_IFRAME, "iframe", tag_find_urls },

111

{ TAG_IMG, "img", tag_find_urls },

112

{ TAG_INPUT, "input", tag_find_urls },

113

{ TAG_LAYER, "layer", tag_find_urls },

114

{ TAG_LINK, "link", tag_handle_link },

115

{ TAG_META, "meta", tag_handle_meta },

116

{ TAG_OBJECT, "object", tag_find_urls },

117

{ TAG_OVERLAY, "overlay", tag_find_urls },

118

{ TAG_SCRIPT, "script", tag_find_urls },

119

{ TAG_TABLE, "table", tag_find_urls },

120

{ TAG_TD, "td", tag_find_urls },

121

{ TAG_TH, "th", tag_find_urls }

{ TAG_A, "a", tag_find_urls },

{ TAG_APPLET, "applet", tag_find_urls },

{ TAG_AREA, "area", tag_find_urls },

{ TAG_BASE, "base", tag_handle_base },

{ TAG_BGSOUND, "bgsound", tag_find_urls },

{ TAG_BODY, "body", tag_find_urls },

{ TAG_EMBED, "embed", tag_find_urls },

{ TAG_FIG, "fig", tag_find_urls },

100

{ TAG_FORM, "form", tag_handle_form },

101

{ TAG_FRAME, "frame", tag_find_urls },

102

{ TAG_IFRAME, "iframe", tag_find_urls },

103

{ TAG_IMG, "img", tag_find_urls },

104

{ TAG_INPUT, "input", tag_find_urls },

105

{ TAG_LAYER, "layer", tag_find_urls },

106

{ TAG_LINK, "link", tag_handle_link },

107

{ TAG_META, "meta", tag_handle_meta },

108

{ TAG_OBJECT, "object", tag_find_urls },

109

{ TAG_OVERLAY, "overlay", tag_find_urls },

110

{ TAG_SCRIPT, "script", tag_find_urls },

111

{ TAG_TABLE, "table", tag_find_urls },

112

{ TAG_TD, "td", tag_find_urls },

113

{ TAG_TH, "th", tag_find_urls }

122

114

};

123

115

124

116

/* tag_url_attributes documents which attributes of which tags contain

129

121

/* The link is "inline", i.e. needs to be retrieved for this document

130

122

to be correctly rendered. Inline links include inlined images,

131

123

stylesheets, children frames, etc. */

132

#define ATTR_INLINE 1

124

#define ATTR_INLINE 1

133

125

134

126

/* The link is expected to yield HTML contents. It's important not to

135

127

try to follow HTML obtained by following e.g. <img src="...">

136

128

regardless of content-type. Doing this causes infinite loops for

137

129

"images" that return non-404 error pages with links to the same

138

130

image. */

139

#define ATTR_HTML 2

131

#define ATTR_HTML 2

140

132

141

133

/* For tags handled by tag_find_urls: attributes that contain URLs to

142

134

download. */

145

137

const char *attr_name;

146

138

int flags;

147

139

} tag_url_attributes[] = {

148

{ TAG_A, "href", ATTR_HTML },

149

{ TAG_APPLET, "code", ATTR_INLINE },

150

{ TAG_AREA, "href", ATTR_HTML },

151

{ TAG_BGSOUND, "src", ATTR_INLINE },

152

{ TAG_BODY, "background", ATTR_INLINE },

153

{ TAG_EMBED, "href", ATTR_HTML },

154

{ TAG_EMBED, "src", ATTR_INLINE | ATTR_HTML },

155

{ TAG_FIG, "src", ATTR_INLINE },

156

{ TAG_FRAME, "src", ATTR_INLINE | ATTR_HTML },

157

{ TAG_IFRAME, "src", ATTR_INLINE | ATTR_HTML },

158

{ TAG_IMG, "href", ATTR_INLINE },

159

{ TAG_IMG, "lowsrc", ATTR_INLINE },

160

{ TAG_IMG, "src", ATTR_INLINE },

161

{ TAG_INPUT, "src", ATTR_INLINE },

162

{ TAG_LAYER, "src", ATTR_INLINE | ATTR_HTML },

163

{ TAG_OBJECT, "data", ATTR_INLINE },

164

{ TAG_OVERLAY, "src", ATTR_INLINE | ATTR_HTML },

165

{ TAG_SCRIPT, "src", ATTR_INLINE },

166

{ TAG_TABLE, "background", ATTR_INLINE },

167

{ TAG_TD, "background", ATTR_INLINE },

168

{ TAG_TH, "background", ATTR_INLINE }

140

{ TAG_A, "href", ATTR_HTML },

141

{ TAG_APPLET, "code", ATTR_INLINE },

142

{ TAG_AREA, "href", ATTR_HTML },

143

{ TAG_BGSOUND, "src", ATTR_INLINE },

144

{ TAG_BODY, "background", ATTR_INLINE },

145

{ TAG_EMBED, "href", ATTR_HTML },

146

{ TAG_EMBED, "src", ATTR_INLINE | ATTR_HTML },

147

{ TAG_FIG, "src", ATTR_INLINE },

148

{ TAG_FRAME, "src", ATTR_INLINE | ATTR_HTML },

149

{ TAG_IFRAME, "src", ATTR_INLINE | ATTR_HTML },

150

{ TAG_IMG, "href", ATTR_INLINE },

151

{ TAG_IMG, "lowsrc", ATTR_INLINE },

152

{ TAG_IMG, "src", ATTR_INLINE },

153

{ TAG_INPUT, "src", ATTR_INLINE },

154

{ TAG_LAYER, "src", ATTR_INLINE | ATTR_HTML },

155

{ TAG_OBJECT, "data", ATTR_INLINE },

156

{ TAG_OVERLAY, "src", ATTR_INLINE | ATTR_HTML },

157

{ TAG_SCRIPT, "src", ATTR_INLINE },

158

{ TAG_TABLE, "background", ATTR_INLINE },

159

{ TAG_TD, "background", ATTR_INLINE },

160

{ TAG_TH, "background", ATTR_INLINE }

169

161

};

170

162

171

163

/* The lists of interesting tags and attributes are built dynamically,

172

164

from the information above. However, some places in the code refer

173

165

to the attributes not mentioned here. We add them manually. */

174

166

static const char *additional_attributes[] = {

175

"rel", /* used by tag_handle_link */

176

"http-equiv", /* used by tag_handle_meta */

177

"name", /* used by tag_handle_meta */

178

"content", /* used by tag_handle_meta */

179

"action" /* used by tag_handle_form */

167

"rel", /* used by tag_handle_link */

168

"http-equiv", /* used by tag_handle_meta */

169

"name", /* used by tag_handle_meta */

170

"content", /* used by tag_handle_meta */

171

"action" /* used by tag_handle_form */

180

172

};

181

173

182

struct hash_table *interesting_tags;

183

struct hash_table *interesting_attributes;

174

static struct hash_table *interesting_tags;

175

static struct hash_table *interesting_attributes;

184

176

185

177

static void

186

178

init_interesting (void)

207

199

{

208

200

char **ignored;

209

201

for (ignored = opt.ignore_tags; *ignored; ignored++)

210

hash_table_remove (interesting_tags, *ignored);

202

hash_table_remove (interesting_tags, *ignored);

211

203

}

212

204

213

205

/* If --follow-tags is specified, use only those tags. */

214

206

if (opt.follow_tags)

215

207

{

216

208

/* Create a new table intersecting --follow-tags and known_tags,

217

and use it as interesting_tags. */

209

and use it as interesting_tags. */

218

210

struct hash_table *intersect = make_nocase_string_hash_table (0);

219

211

char **followed;

220

212

for (followed = opt.follow_tags; *followed; followed++)

221

{

222

struct known_tag *t = hash_table_get (interesting_tags, *followed);

223

if (!t)

224

continue; /* ignore unknown --follow-tags entries. */

225

hash_table_put (intersect, *followed, t);

226

}

213

{

214

struct known_tag *t = hash_table_get (interesting_tags, *followed);

215

if (!t)

216

continue; /* ignore unknown --follow-tags entries. */

217

hash_table_put (intersect, *followed, t);

218

}

227

219

hash_table_destroy (interesting_tags);

228

220

interesting_tags = intersect;

229

221

}

234

226

hash_table_put (interesting_attributes, additional_attributes[i], "1");

235

227

for (i = 0; i < countof (tag_url_attributes); i++)

236

228

hash_table_put (interesting_attributes,

237

tag_url_attributes[i].attr_name, "1");

229

tag_url_attributes[i].attr_name, "1");

238

230

}

239

231

240

232

/* Find the value of attribute named NAME in the taginfo TAG. If the

248

240

for (i = 0; i < tag->nattrs; i++)

249

241

if (!strcasecmp (tag->attrs[i].name, name))

250

242

{

251

if (attrind)

252

*attrind = i;

253

return tag->attrs[i].value;

243

if (attrind)

244

*attrind = i;

245

return tag->attrs[i].value;

254

246

}

255

247

return NULL;

256

248

}

257

249

258

250

struct map_context {

259

char *text; /* HTML text. */

260

char *base; /* Base URI of the document, possibly

261

changed through <base href=...>. */

262

const char *parent_base; /* Base of the current document. */

263

const char *document_file; /* File name of this document. */

264

int nofollow; /* whether NOFOLLOW was specified in a

251

char *text; /* HTML text. */

252

char *base; /* Base URI of the document, possibly

253

changed through <base href=...>. */

254

const char *parent_base; /* Base of the current document. */

255

const char *document_file; /* File name of this document. */

256

bool nofollow; /* whether NOFOLLOW was specified in a

265

257

266

258

267

struct urlpos *head, *tail; /* List of URLs that is being

268

built. */

259

struct urlpos *head, *tail; /* List of URLs that is being

260

built. */

269

261

};

270

262

271

263

/* Append LINK_URI to the urlpos structure that is being built.

276

268

277

269

static struct urlpos *

278

270

append_url (const char *link_uri,

279

struct taginfo *tag, int attrind, struct map_context *ctx)

271

struct taginfo *tag, int attrind, struct map_context *ctx)

280

272

{

281

273

int link_has_scheme = url_has_scheme (link_uri);

282

274

struct urlpos *newel;

286

278

if (!base)

287

279

{

288

280

DEBUGP (("%s: no base, merge will use \"%s\".\n",

289

ctx->document_file, link_uri));

281

ctx->document_file, link_uri));

290

282

291

283

if (!link_has_scheme)

292

{

293

/* Base URL is unavailable, and the link does not have a

294

location attached to it -- we have to give up. Since

295

this can only happen when using `--force-html -i', print

296

a warning. */

297

logprintf (LOG_NOTQUIET,

298

_("%s: Cannot resolve incomplete link %s.\n"),

299

ctx->document_file, link_uri);

300

return NULL;

301

}

284

{

285

/* Base URL is unavailable, and the link does not have a

286

location attached to it -- we have to give up. Since

287

this can only happen when using `--force-html -i', print

288

a warning. */

289

logprintf (LOG_NOTQUIET,

290

_("%s: Cannot resolve incomplete link %s.\n"),

291

ctx->document_file, link_uri);

292

return NULL;

293

}

302

294

303

295

url = url_parse (link_uri, NULL);

304

296

if (!url)

305

{

306

DEBUGP (("%s: link \"%s\" doesn't parse.\n",

307

ctx->document_file, link_uri));

308

return NULL;

309

}

297

{

298

DEBUGP (("%s: link \"%s\" doesn't parse.\n",

299

ctx->document_file, link_uri));

300

return NULL;

301

}

310

302

}

311

303

else

312

304

{

313

305

/* Merge BASE with LINK_URI, but also make sure the result is

314

canonicalized, i.e. that "../" have been resolved.

315

(parse_url will do that for us.) */

306

canonicalized, i.e. that "../" have been resolved.

307

(parse_url will do that for us.) */

316

308

317

309

char *complete_uri = uri_merge (base, link_uri);

318

310

319

311

DEBUGP (("%s: merge(\"%s\", \"%s\") -> %s\n",

320

ctx->document_file, base, link_uri, complete_uri));

312

ctx->document_file, base, link_uri, complete_uri));

321

313

322

314

url = url_parse (complete_uri, NULL);

323

315

if (!url)

324

{

325

DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",

326

ctx->document_file, complete_uri));

327

xfree (complete_uri);

328

return NULL;

329

}

316

{

317

DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",

318

ctx->document_file, complete_uri));

319

xfree (complete_uri);

320

return NULL;

321

}

330

322

xfree (complete_uri);

331

323

}

332

324

370

362

for (i = 0; i < countof (tag_url_attributes); i++)

371

363

if (tag_url_attributes[i].tagid == tagid)

372

364

{

373

/* We've found the index of tag_url_attributes where the

374

attributes of our tag begin. */

375

first = i;

376

break;

365

/* We've found the index of tag_url_attributes where the

366

attributes of our tag begin. */

367

first = i;

368

break;

377

369

}

378

370

assert (first != -1);

379

371

389

381

for (attrind = 0; attrind < tag->nattrs; attrind++)

390

382

{

391

383

/* Find whether TAG/ATTRIND is a combination that contains a

392

URL. */

384

URL. */

393

385

char *link = tag->attrs[attrind].value;

394

386

const int size = countof (tag_url_attributes);

395

387

396

388

/* If you're cringing at the inefficiency of the nested loops,

397

remember that they both iterate over a very small number of

398

items. The worst-case inner loop is for the IMG tag, which

399

has three attributes. */

389

remember that they both iterate over a very small number of

390

items. The worst-case inner loop is for the IMG tag, which

391

has three attributes. */

400

392

for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++)

401

{

402

if (0 == strcasecmp (tag->attrs[attrind].name,

403

tag_url_attributes[i].attr_name))

404

{

405

struct urlpos *up = append_url (link, tag, attrind, ctx);

406

if (up)

407

{

408

int flags = tag_url_attributes[i].flags;

409

if (flags & ATTR_INLINE)

410

up->link_inline_p = 1;

411

if (flags & ATTR_HTML)

412

up->link_expect_html = 1;

413

}

414

}

415

}

393

{

394

if (0 == strcasecmp (tag->attrs[attrind].name,

395

tag_url_attributes[i].attr_name))

396

{

397

struct urlpos *up = append_url (link, tag, attrind, ctx);

398

if (up)

399

{

400

int flags = tag_url_attributes[i].flags;

401

if (flags & ATTR_INLINE)

402

up->link_inline_p = 1;

403

if (flags & ATTR_HTML)

404

up->link_expect_html = 1;

405

}

406

}

407

}

416

408

}

417

409

}

418

410

452

444

{

453

445

struct urlpos *up = append_url (action, tag, attrind, ctx);

454

446

if (up)

455

up->ignore_when_downloading = 1;

447

up->ignore_when_downloading = 1;

456

448

}

457

449

}

458

450

475

467

{

476

468

struct urlpos *up = append_url (href, tag, attrind, ctx);

477

469

if (up)

478

{

479

char *rel = find_attr (tag, "rel", NULL);

480

if (rel

481

&& (0 == strcasecmp (rel, "stylesheet")

482

|| 0 == strcasecmp (rel, "shortcut icon")))

483

up->link_inline_p = 1;

484

else

485

/* The external ones usually point to HTML pages, such as

486

<link rel="next" href="..."> */

487

up->link_expect_html = 1;

488

}

470

{

471

char *rel = find_attr (tag, "rel", NULL);

472

if (rel

473

&& (0 == strcasecmp (rel, "stylesheet")

474

|| 0 == strcasecmp (rel, "shortcut icon")))

475

up->link_inline_p = 1;

476

else

477

/* The external ones usually point to HTML pages, such as

478

<link rel="next" href="..."> */

479

up->link_expect_html = 1;

480

}

489

481

}

490

482

}

491

483

501

493

if (http_equiv && 0 == strcasecmp (http_equiv, "refresh"))

502

494

{

503

495

/* Some pages use a META tag to specify that the page be

504

refreshed by a new page after a given number of seconds. The

505

general format for this is:

506

507

<meta http-equiv=Refresh content="NUMBER; URL=index2.html">

508

509

So we just need to skip past the "NUMBER; URL=" garbage to

510

get to the URL. */

496

refreshed by a new page after a given number of seconds. The

497

general format for this is:

498

499

<meta http-equiv=Refresh content="NUMBER; URL=index2.html">

500

501

So we just need to skip past the "NUMBER; URL=" garbage to

502

get to the URL. */

511

503

512

504

struct urlpos *entry;

513

505

int attrind;

516

508

517

509

char *refresh = find_attr (tag, "content", &attrind);

518

510

if (!refresh)

519

return;

511

return;

520

512

521

513

for (p = refresh; ISDIGIT (*p); p++)

522

timeout = 10 * timeout + *p - '0';

514

timeout = 10 * timeout + *p - '0';

523

515

if (*p++ != ';')

524

return;

516

return;

525

517

526

518

while (ISSPACE (*p))

527

++p;

519

++p;

528

520

if (!( TOUPPER (*p) == 'U'

529

&& TOUPPER (*(p + 1)) == 'R'

530

&& TOUPPER (*(p + 2)) == 'L'

531

&& *(p + 3) == '='))

532

return;

521

&& TOUPPER (*(p + 1)) == 'R'

522

&& TOUPPER (*(p + 2)) == 'L'

523

&& *(p + 3) == '='))

524

return;

533

525

p += 4;

534

526

while (ISSPACE (*p))

535

++p;

527

++p;

536

528

537

529

entry = append_url (p, tag, attrind, ctx);

538

530

if (entry)

539

{

540

entry->link_refresh_p = 1;

541

entry->refresh_timeout = timeout;

542

entry->link_expect_html = 1;

543

}

531

{

532

entry->link_refresh_p = 1;

533

entry->refresh_timeout = timeout;

534

entry->link_expect_html = 1;

535

}

544

536

}

545

537

else if (name && 0 == strcasecmp (name, "robots"))

546

538

{

547

539

/* Handle stuff like:

548

540

<meta name="robots" content="index,nofollow"> */

549

541

char *content = find_attr (tag, "content", NULL);

550

542

if (!content)

551

return;

543

return;

552

544

if (!strcasecmp (content, "none"))

553

ctx->nofollow = 1;

545

ctx->nofollow = true;

554

546

else

555

{

556

while (*content)

557

{

558

/* Find the next occurrence of ',' or the end of

559

the string. */

560

char *end = strchr (content, ',');

561

if (end)

562

++end;

563

else

564

end = content + strlen (content);

565

if (!strncasecmp (content, "nofollow", end - content))

566

ctx->nofollow = 1;

567

content = end;

568

}

569

}

547

{

548

while (*content)

549

{

550

/* Find the next occurrence of ',' or the end of

551

the string. */

552

char *end = strchr (content, ',');

553

if (end)

554

++end;

555

else

556

end = content + strlen (content);

557

if (!strncasecmp (content, "nofollow", end - content))

558

ctx->nofollow = true;

559

content = end;

560

}

561

}

570

562

}

571

563

}

572

564

591

583

<base href=...> and does the right thing. */

592

584

593

585

struct urlpos *

594

get_urls_html (const char *file, const char *url, int *meta_disallow_follow)

586

get_urls_html (const char *file, const char *url, bool *meta_disallow_follow)

595

587

{

596

588

struct file_memory *fm;

597

589

struct map_context ctx;

611

603

ctx.base = NULL;

612

604

ctx.parent_base = url ? url : opt.base_href;

613

605

ctx.document_file = file;

614

ctx.nofollow = 0;

606

ctx.nofollow = false;

615

607

616

608

if (!interesting_tags)

617

609

init_interesting ();

628

620

flags |= MHT_STRICT_COMMENTS;

629

621

630

622

map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,

631

interesting_tags, interesting_attributes);

623

interesting_tags, interesting_attributes);

632

624

633

625

DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));

634

626

if (meta_disallow_follow)

671

663

const char *line_beg = text;

672

664

const char *line_end = memchr (text, '\n', text_end - text);

673

665

if (!line_end)

674

line_end = text_end;

666

line_end = text_end;

675

667

else

676

++line_end;

668

++line_end;

677

669

text = line_end;

678

670

679

671

/* Strip whitespace from the beginning and end of line. */

680

672

while (line_beg < line_end && ISSPACE (*line_beg))

681

++line_beg;

673

++line_beg;

682

674

while (line_end > line_beg && ISSPACE (*(line_end - 1)))

683

--line_end;

675

--line_end;

684

676

685

677

if (line_beg == line_end)

686

continue;

678

continue;

687

679

688

680

/* The URL is in the [line_beg, line_end) region. */

689

681

690

682

/* We must copy the URL to a zero-terminated string, and we

691

can't use alloca because we're in a loop. *sigh*. */

683

can't use alloca because we're in a loop. *sigh*. */

692

684

url_text = strdupdelim (line_beg, line_end);

693

685

694

686

if (opt.base_href)

695

{

696

/* Merge opt.base_href with URL. */

697

char *merged = uri_merge (opt.base_href, url_text);

698

xfree (url_text);

699

url_text = merged;

700

}

687

{

688

/* Merge opt.base_href with URL. */

689

char *merged = uri_merge (opt.base_href, url_text);

690

xfree (url_text);

691

url_text = merged;

692

}

701

693

702

694

url = url_parse (url_text, &up_error_code);

703

695

if (!url)

704

{

705

logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"),

706

file, url_text, url_error (up_error_code));

707

xfree (url_text);

708

continue;

709

}

696

{

697

logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"),

698

file, url_text, url_error (up_error_code));

699

xfree (url_text);

700

continue;

701

}

710

702

xfree (url_text);

711

703

712

704

entry = xnew0 (struct urlpos);

713

entry->next = NULL;

714

705

entry->url = url;

715

706

716

707

if (!head)

717

head = entry;

708

head = entry;

718

709

else

719

tail->next = entry;

710

tail->next = entry;

720

711

tail = entry;

721

712

}

722

713

read_file_free (fm);

Older »