~ubuntu-branches/ubuntu/precise/wget/precise-proposed

Viewing changes to src/url.c

Committer: Bazaar Package Importer
Author(s): Noèl Köthe
Date: 2005-06-26 16:46:25 UTC
mfrom: (1.1.1 upstream) (2.1.1 sarge)
Revision ID: james.westby@ubuntu.com-20050626164625-jjcde8hyztx7xq7o

Tags: 1.10-2

http://bugs.debian.org/314728

http://bugs.debian.org/163243

http://bugs.debian.org/313883

* wget-fix_error--save-headers patch from upstream
  (closes: Bug#314728)
* don't pattern-match server redirects patch from upstream
  (closes: Bug#163243)
* correct de.po typos
  (closes: Bug#313883)
* wget-E_html_behind_file_counting fix problem with adding the
  numbers after the html extension
* updated Standards-Version: to 3.6.2

files added:
ChangeLog-branches/1.8_branch.ChangeLog

ChangeLog-branches/1.9_branch.ChangeLog

debian/patches

debian/patches/00list

debian/patches/00template

debian/patches/wget-E_html_behind_file_counting

debian/patches/wget-de.po-spelling-correction

debian/patches/wget-doc-remove-usr-local-in-sample.wgetrc

debian/patches/wget-doc-remove-usr-local-in-wget.texi

debian/patches/wget-dont_pattern_match_server_redirects

debian/patches/wget-fix_error--save-headers

debian/patches/wget-fr.po-spelling-correction

debian/patches/wget-passive_ftp-default

doc/ChangeLog-branches/1.8_branch.ChangeLog

doc/ChangeLog-branches/1.9_branch.ChangeLog

doc/fdl.texi

po/en_GB.po

po/eo.po

po/eu.po

po/fi.po

po/ga.po

po/sr.po

po/vi.po

src/ChangeLog-branches/1.9_branch.ChangeLog

src/config-post.h

src/http-ntlm.c

src/http-ntlm.h

src/log.h

src/openssl.c

src/ptimer.c

src/ptimer.h

src/ssl.h

src/xmalloc.c

src/xmalloc.h

windows/ChangeLog

windows/Makefile.src.mingw

windows/Makefile.top.mingw

windows/config.h.mingw

files removed:
MACHINES

doc/wget.info-1

doc/wget.info-2

doc/wget.info-3

doc/wget.info-4

po/bg.gmo

po/ca.gmo

po/cs.gmo

po/da.gmo

po/de.gmo

po/el.gmo

po/es.gmo

po/et.gmo

po/fr.gmo

po/gl.gmo

po/he.gmo

po/hr.gmo

po/hu.gmo

po/it.gmo

po/ja.gmo

po/nl.gmo

po/no.gmo

po/pl.gmo

po/pt_BR.gmo

po/ro.gmo

po/ru.gmo

po/sk.gmo

po/sl.gmo

po/sv.gmo

po/tr.gmo

po/uk.gmo

po/wget.pot

po/zh_CN.gmo

po/zh_TW.gmo

src/gen_sslfunc.c

src/gen_sslfunc.h

src/headers.c

src/headers.h

src/rbuf.c

src/rbuf.h

files modified:
AUTHORS

ChangeLog

INSTALL

MAILING-LIST

Makefile.cvs

Makefile.in

NEWS

README

TODO

aclocal.m4

config.guess

config.sub

configure

configure.bat

configure.bat.in

configure.in

debian/changelog

debian/control

debian/rules

doc/ChangeLog

doc/Makefile.in

doc/sample.wgetrc

doc/sample.wgetrc.munged_for_texi_inclusion

doc/texi2pod.pl.in

doc/version.texi

doc/wget.info

doc/wget.texi

libtool.m4

ltmain.sh

po/Makefile.in.in

po/POTFILES.in

po/bg.po

po/ca.po

po/cs.po

po/da.po

po/de.po

po/el.po

po/es.po

po/et.po

po/fr.po

po/gl.po

po/he.po

po/hr.po

po/hu.po

po/it.po

po/ja.po

po/nl.po

po/no.po

po/pl.po

po/pt_BR.po

po/ro.po

po/ru.po

po/sk.po

po/sl.po

po/sv.po

po/tr.po

po/uk.po

po/zh_CN.po

po/zh_TW.po

src/ChangeLog

src/Makefile.in

src/alloca.c

src/cmpt.c

src/config.h.in

src/connect.c

src/connect.h

src/convert.c

src/convert.h

src/cookies.c

src/cookies.h

src/ftp-basic.c

src/ftp-ls.c

src/ftp-opie.c

src/ftp.c

src/ftp.h

src/gen-md5.c

src/gen-md5.h

src/getopt.h

src/gnu-md5.h

src/hash.c

src/hash.h

src/host.c

src/host.h

src/html-parse.c

src/html-url.c

src/http.c

src/init.c

src/init.h

src/log.c

src/main.c

src/mswindows.c

src/mswindows.h

src/netrc.c

src/options.h

src/progress.c

src/progress.h

src/recur.c

src/recur.h

src/res.c

src/retr.c

src/retr.h

src/snprintf.c

src/sysdep.h

src/url.c

src/utils.c

src/utils.h

src/version.c

src/wget.h

util/dist-wget

windows/Makefile.doc

windows/Makefile.src

windows/Makefile.src.bor

windows/Makefile.top

windows/Makefile.top.bor

windows/Makefile.watcom

windows/README

windows/config.h.bor

windows/config.h.ms

windows/wget.dep

Show diffs side-by-side

added added

removed removed

src/url.c

/* URL handling.

Free Software Foundation, Inc.

This file is part of GNU Wget.

#include "wget.h"

#include "utils.h"

#include "url.h"

#include "host.h" /* for is_valid_ipv6_address */

#ifndef errno

extern int errno;

struct scheme_data

{

char *leading_string;

const char *name;

const char *leading_string;

int default_port;

int enabled;

};

/* Supported schemes: */

static struct scheme_data supported_schemes[] =

{

{ "http://", DEFAULT_HTTP_PORT, 1 },

{ "http", "http://", DEFAULT_HTTP_PORT, 1 },

#ifdef HAVE_SSL

{ "https://", DEFAULT_HTTPS_PORT, 1 },

{ "https", "https://", DEFAULT_HTTPS_PORT, 1 },

#endif

{ "ftp://", DEFAULT_FTP_PORT, 1 },

{ "ftp", "ftp://", DEFAULT_FTP_PORT, 1 },

/* SCHEME_INVALID */

{ NULL, -1, 0 }

{ NULL, NULL, -1, 0 }

};

/* Forward declarations: */

static int path_simplify PARAMS ((char *));

/* Support for encoding and decoding of URL strings. We determine

whether a character is unsafe through static table lookup. This

code assumes ASCII character set and 8-bit chars. */

/* Support for escaping and unescaping of URL strings. */

/* Table of "reserved" and "unsafe" characters. Those terms are

rfc1738-speak, as such largely obsoleted by rfc2396 and later

specs, but the general idea remains.

A reserved character is the one that you can't decode without

changing the meaning of the URL. For example, you can't decode

"/foo/%2f/bar" into "/foo///bar" because the number and contents of

path components is different. Non-reserved characters can be

changed, so "/foo/%78/bar" is safe to change to "/foo/x/bar". The

unsafe characters are loosely based on rfc1738, plus "$" and ",",

as recommended by rfc2396, and minus "~", which is very frequently

used (and sometimes unrecognized as %7E by broken servers).

An unsafe character is the one that should be encoded when URLs are

placed in foreign environments. E.g. space and newline are unsafe

in HTTP contexts because HTTP uses them as separator and line

terminator, so they must be encoded to %20 and %0A respectively.

"*" is unsafe in shell context, etc.

100

101

We determine whether a character is unsafe through static table

102

lookup. This code assumes ASCII character set and 8-bit chars. */

103

104

enum {

/* rfc1738 reserved chars, preserved from encoding. */

105

/* rfc1738 reserved chars + "$" and ",". */

106

urlchr_reserved = 1,

107

/* rfc1738 unsafe chars, plus some more. */

108

/* rfc1738 unsafe chars, plus non-printables. */

109

urlchr_unsafe = 2

110

};

111

118

#define U urlchr_unsafe

119

#define RU R|U

120

100

const static unsigned char urlchr_table[256] =

121

static const unsigned char urlchr_table[256] =

101

122

{

102

123

U, U, U, U, U, U, U, U, /* NUL SOH STX ETX EOT ENQ ACK BEL */

103

124

U, U, U, U, U, U, U, U, /* BS HT LF VT FF CR SO SI */

104

125

U, U, U, U, U, U, U, U, /* DLE DC1 DC2 DC3 DC4 NAK SYN ETB */

105

126

U, U, U, U, U, U, U, U, /* CAN EM SUB ESC FS GS RS US */

106

U, 0, U, RU, 0, U, R, 0, /* SP ! " # $ % & ' */

107

0, 0, 0, R, 0, 0, 0, R, /* ( ) * + , - . / */

127

U, 0, U, RU, R, U, R, 0, /* SP ! " # $ % & ' */

128

0, 0, 0, R, R, 0, 0, R, /* ( ) * + , - . / */

108

129

0, 0, 0, 0, 0, 0, 0, 0, /* 0 1 2 3 4 5 6 7 */

109

130

0, 0, RU, R, U, R, U, R, /* 8 9 : ; < = > ? */

110

131

RU, 0, 0, 0, 0, 0, 0, 0, /* @ A B C D E F G */

114

135

U, 0, 0, 0, 0, 0, 0, 0, /* ` a b c d e f g */

115

136

0, 0, 0, 0, 0, 0, 0, 0, /* h i j k l m n o */

116

137

0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */

117

0, 0, 0, U, U, U, U, U, /* x y z { | } ~ DEL */

138

0, 0, 0, U, U, U, 0, U, /* x y z { | } ~ DEL */

118

139

119

140

U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,

120

141

U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U,

154

175

}

155

176

else

156

177

{

178

char c;

157

179

/* Do nothing if '%' is not followed by two hex digits. */

158

180

if (!h[1] || !h[2] || !(ISXDIGIT (h[1]) && ISXDIGIT (h[2])))

159

181

goto copychar;

160

*t = X2DIGITS_TO_NUM (h[1], h[2]);

182

c = X2DIGITS_TO_NUM (h[1], h[2]);

183

/* Don't unescape %00 because there is no way to insert it

184

into a C string without effectively truncating it. */

185

if (c == '\0')

186

goto copychar;

187

*t = c;

161

188

h += 2;

162

189

}

163

190

}

228

255

return url_escape_1 (s, urlchr_unsafe, 1);

229

256

}

230

257

231

enum copy_method { CM_DECODE, CM_ENCODE, CM_PASSTHROUGH };

232

233

/* Decide whether to encode, decode, or pass through the char at P.

234

This used to be a macro, but it got a little too convoluted. */

235

static inline enum copy_method

236

decide_copy_method (const char *p)

258

/* Decide whether the char at position P needs to be encoded. (It is

259

not enough to pass a single char *P because the function may need

260

to inspect the surrounding context.)

261

262

Return 1 if the char should be escaped as %XX, 0 otherwise. */

263

264

static inline int

265

char_needs_escaping (const char *p)

237

266

{

238

267

if (*p == '%')

239

268

{

240

269

if (ISXDIGIT (*(p + 1)) && ISXDIGIT (*(p + 2)))

241

{

242

/* %xx sequence: decode it, unless it would decode to an

243

unsafe or a reserved char; in that case, leave it as

244

is. */

245

char preempt = X2DIGITS_TO_NUM (*(p + 1), *(p + 2));

246

if (URL_UNSAFE_CHAR (preempt) || URL_RESERVED_CHAR (preempt))

247

return CM_PASSTHROUGH;

248

else

249

return CM_DECODE;

250

}

270

return 0;

251

271

else

252

272

/* Garbled %.. sequence: encode `%'. */

253

return CM_ENCODE;

273

return 1;

254

274

}

255

275

else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p))

256

return CM_ENCODE;

276

return 1;

257

277

else

258

return CM_PASSTHROUGH;

278

return 0;

259

279

}

260

280

261

281

/* Translate a %-escaped (but possibly non-conformant) input string S

265

285

266

286

After a URL has been run through this function, the protocols that

267

287

use `%' as the quote character can use the resulting string as-is,

268

while those that don't call url_unescape() to get to the intended

269

data. This function is also stable: after an input string is

270

transformed the first time, all further transformations of the

271

result yield the same result string.

288

while those that don't can use url_unescape to get to the intended

289

data. This function is stable: once the input is transformed,

290

further transformations of the result yield the same output.

272

291

273

292

Let's discuss why this function is needed.

274

293

275

Imagine Wget is to retrieve `http://abc.xyz/abc def'. Since a raw

276

space character would mess up the HTTP request, it needs to be

277

quoted, like this:

294

Imagine Wget is asked to retrieve `http://abc.xyz/abc def'. Since

295

a raw space character would mess up the HTTP request, it needs to

296

be quoted, like this:

278

297

279

298

GET /abc%20def HTTP/1.0

280

299

281

It appears that the unsafe chars need to be quoted, for example

282

with url_escape. But what if we're requested to download

300

It would appear that the unsafe chars need to be quoted, for

301

example with url_escape. But what if we're requested to download

283

302

`abc%20def'? url_escape transforms "%" to "%25", which would leave

284

303

us with `abc%2520def'. This is incorrect -- since %-escapes are

285

304

part of URL syntax, "%20" is the correct way to denote a literal

286

space on the Wget command line. This leaves us in the conclusion

287

that in that case Wget should not call url_escape, but leave the

288

`%20' as is.

305

space on the Wget command line. This leads to the conclusion that

306

in that case Wget should not call url_escape, but leave the `%20'

307

as is. This is clearly contradictory, but it only gets worse.

289

308

290

And what if the requested URI is `abc%20 def'? If we call

291

url_escape, we end up with `/abc%2520%20def', which is almost

292

certainly not intended. If we don't call url_escape, we are left

293

with the embedded space and cannot complete the request. What the

294

user meant was for Wget to request `/abc%20%20def', and this is

295

where reencode_escapes kicks in.

309

What if the requested URI is `abc%20 def'? If we call url_escape,

310

we end up with `/abc%2520%20def', which is almost certainly not

311

intended. If we don't call url_escape, we are left with the

312

embedded space and cannot complete the request. What the user

313

meant was for Wget to request `/abc%20%20def', and this is where

314

reencode_escapes kicks in.

296

315

297

316

Wget used to solve this by first decoding %-quotes, and then

298

317

encoding all the "unsafe" characters found in the resulting string.

306

325

literal plus. reencode_escapes correctly translates the above to

307

326

"a%2B+b", i.e. returns the original string.

308

327

309

This function uses an algorithm proposed by Anon Sricharoenchai:

310

311

1. Encode all URL_UNSAFE and the "%" that are not followed by 2

312

hexdigits.

313

314

2. Decode all "%XX" except URL_UNSAFE, URL_RESERVED (";/?:@=&") and

315

"+".

316

317

...except that this code conflates the two steps, and decides

318

whether to encode, decode, or pass through each character in turn.

319

The function still uses two passes, but their logic is the same --

320

the first pass exists merely for the sake of allocation. Another

321

small difference is that we include `+' to URL_RESERVED.

328

This function uses a modified version of the algorithm originally

329

proposed by Anon Sricharoenchai:

330

331

* Encode all "unsafe" characters, except those that are also

332

"reserved", to %XX. See urlchr_table for which characters are

333

unsafe and reserved.

334

335

* Encode the "%" characters not followed by two hex digits to

336

"%25".

337

338

* Pass through all other characters and %XX escapes as-is. (Up to

339

Wget 1.10 this decoded %XX escapes corresponding to "safe"

340

characters, but that was obtrusive and broke some servers.)

322

341

323

342

Anon's test case:

324

343

325

344

"http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"

326

345

327

"http://abc.xyz/%20%3F%2561%25aa%25%20a?a=a+a%2Ba&b=b%26c%3Dc"

346

"http://abc.xyz/%20%3F%25%36%31%25aa%25%20a?a=%61+a%2Ba&b=b%26c%3Dc"

328

347

329

348

Simpler test cases:

330

349

345

364

int oldlen, newlen;

346

365

347

366

int encode_count = 0;

348

int decode_count = 0;

349

367

350

/* First, pass through the string to see if there's anything to do,

368

/* First pass: inspect the string to see if there's anything to do,

351

369

and to calculate the new length. */

352

370

for (p1 = s; *p1; p1++)

353

{

354

switch (decide_copy_method (p1))

355

{

356

case CM_ENCODE:

357

++encode_count;

358

break;

359

case CM_DECODE:

360

++decode_count;

361

break;

362

case CM_PASSTHROUGH:

363

break;

364

}

365

}

371

if (char_needs_escaping (p1))

372

++encode_count;

366

373

367

if (!encode_count && !decode_count)

374

if (!encode_count)

368

375

/* The string is good as it is. */

369

return (char *)s; /* C const model sucks. */

376

return (char *) s; /* C const model sucks. */

370

377

371

378

oldlen = p1 - s;

372

/* Each encoding adds two characters (hex digits), while each

373

decoding removes two characters. */

374

newlen = oldlen + 2 * (encode_count - decode_count);

379

/* Each encoding adds two characters (hex digits). */

380

newlen = oldlen + 2 * encode_count;

375

381

newstr = xmalloc (newlen + 1);

376

382

383

/* Second pass: copy the string to the destination address, encoding

384

chars when needed. */

377

385

p1 = s;

378

386

p2 = newstr;

379

387

380

388

while (*p1)

381

{

382

switch (decide_copy_method (p1))

383

{

384

case CM_ENCODE:

385

{

386

unsigned char c = *p1++;

387

*p2++ = '%';

388

*p2++ = XNUM_TO_DIGIT (c >> 4);

389

*p2++ = XNUM_TO_DIGIT (c & 0xf);

390

}

391

break;

392

case CM_DECODE:

393

*p2++ = X2DIGITS_TO_NUM (p1[1], p1[2]);

394

p1 += 3; /* skip %xx */

395

break;

396

case CM_PASSTHROUGH:

397

*p2++ = *p1++;

398

}

399

}

389

if (char_needs_escaping (p1))

390

{

391

unsigned char c = *p1++;

392

*p2++ = '%';

393

*p2++ = XNUM_TO_DIGIT (c >> 4);

394

*p2++ = XNUM_TO_DIGIT (c & 0xf);

395

}

396

else

397

*p2++ = *p1++;

398

400

399

*p2 = '\0';

401

400

assert (p2 - newstr == newlen);

402

401

return newstr;

457

456

supported_schemes[scheme].enabled = 0;

458

457

}

459

458

460

/* Skip the username and password, if present here. The function

461

should *not* be called with the complete URL, but with the part

462

right after the scheme.

463

464

If no username and password are found, return 0. */

465

466

static int

459

/* Skip the username and password, if present in the URL. The

460

function should *not* be called with the complete URL, but with the

461

portion after the scheme.

462

463

If no username and password are found, return URL. */

464

465

static const char *

467

466

url_skip_credentials (const char *url)

468

467

{

469

468

/* Look for '@' that comes before terminators, such as '/', '?',

470

469

'#', or ';'. */

471

470

const char *p = (const char *)strpbrk (url, "@/?#;");

472

471

if (!p || *p != '@')

473

return 0;

474

return p + 1 - url;

472

return url;

473

return p + 1;

475

474

}

476

475

477

476

/* Parse credentials contained in [BEG, END). The region is expected

524

523

{

525

524

const char *p;

526

525

527

if (url_has_scheme (url))

526

if (url_scheme (url) != SCHEME_INVALID)

528

527

return NULL;

529

528

530

529

/* Look for a ':' or '/'. The former signifies NcFTP syntax, the

535

534

if (p == url)

536

535

return NULL;

537

536

537

/* If we're looking at "://", it means the URL uses a scheme we

538

don't support, which may include "https" when compiled without

539

SSL support. Don't bogusly rewrite such URLs. */

540

if (p[0] == ':' && p[1] == '/' && p[2] == '/')

541

return NULL;

542

538

543

if (*p == ':')

539

544

{

540

545

const char *pp;

579

584

help because the check for literal accept is in the

580

585

preprocessor.) */

581

586

582

#ifdef __GNUC__

587

#if defined(__GNUC__) && __GNUC__ >= 3

583

588

584

589

#define strpbrk_or_eos(s, accept) ({ \

585

590

char *SOE_p = strpbrk (s, accept); \

586

591

if (!SOE_p) \

587

SOE_p = (char *)s + strlen (s); \

592

SOE_p = strchr (s, '\0'); \

588

593

SOE_p; \

589

594

})

590

595

591

#else /* not __GNUC__ */

596

#else /* not __GNUC__ or old gcc */

592

597

593

static char *

598

static inline char *

594

599

strpbrk_or_eos (const char *s, const char *accept)

595

600

{

596

601

char *p = strpbrk (s, accept);

597

602

if (!p)

598

p = (char *)s + strlen (s);

603

p = strchr (s, '\0');

599

604

return p;

600

605

}

601

#endif

606

#endif /* not __GNUC__ or old gcc */

602

607

603

608

/* Turn STR into lowercase; return non-zero if a character was

604

609

actually changed. */

616

621

return change;

617

622

}

618

623

619

static char *parse_errors[] = {

624

static const char *parse_errors[] = {

620

625

#define PE_NO_ERROR 0

621

626

N_("No error"),

622

627

#define PE_UNSUPPORTED_SCHEME 1

635

640

N_("Invalid IPv6 numeric address")

636

641

};

637

642

638

#ifdef ENABLE_IPV6

639

/* The following two functions were adapted from glibc. */

640

641

static int

642

is_valid_ipv4_address (const char *str, const char *end)

643

{

644

int saw_digit, octets;

645

int val;

646

647

saw_digit = 0;

648

octets = 0;

649

val = 0;

650

651

while (str < end) {

652

int ch = *str++;

653

654

if (ch >= '0' && ch <= '9') {

655

val = val * 10 + (ch - '0');

656

657

if (val > 255)

658

return 0;

659

if (saw_digit == 0) {

660

if (++octets > 4)

661

return 0;

662

saw_digit = 1;

663

}

664

} else if (ch == '.' && saw_digit == 1) {

665

if (octets == 4)

666

return 0;

667

val = 0;

668

saw_digit = 0;

669

} else

670

return 0;

671

}

672

if (octets < 4)

673

return 0;

674

675

return 1;

676

}

677

678

static const int NS_INADDRSZ = 4;

679

static const int NS_IN6ADDRSZ = 16;

680

static const int NS_INT16SZ = 2;

681

682

static int

683

is_valid_ipv6_address (const char *str, const char *end)

684

{

685

static const char xdigits[] = "0123456789abcdef";

686

const char *curtok;

687

int tp;

688

const char *colonp;

689

int saw_xdigit;

690

unsigned int val;

691

692

tp = 0;

693

colonp = NULL;

694

695

if (str == end)

696

return 0;

697

698

/* Leading :: requires some special handling. */

699

if (*str == ':')

700

{

701

++str;

702

if (str == end || *str != ':')

703

return 0;

704

}

705

706

curtok = str;

707

saw_xdigit = 0;

708

val = 0;

709

710

while (str < end) {

711

int ch = *str++;

712

const char *pch;

713

714

/* if ch is a number, add it to val. */

715

pch = strchr(xdigits, ch);

716

if (pch != NULL) {

717

val <<= 4;

718

val |= (pch - xdigits);

719

if (val > 0xffff)

720

return 0;

721

saw_xdigit = 1;

722

continue;

723

}

724

725

/* if ch is a colon ... */

726

if (ch == ':') {

727

curtok = str;

728

if (saw_xdigit == 0) {

729

if (colonp != NULL)

730

return 0;

731

colonp = str + tp;

732

continue;

733

} else if (str == end) {

734

return 0;

735

}

736

if (tp > NS_IN6ADDRSZ - NS_INT16SZ)

737

return 0;

738

tp += NS_INT16SZ;

739

saw_xdigit = 0;

740

val = 0;

741

continue;

742

}

743

744

/* if ch is a dot ... */

745

if (ch == '.' && (tp <= NS_IN6ADDRSZ - NS_INADDRSZ) &&

746

is_valid_ipv4_address(curtok, end) == 1) {

747

tp += NS_INADDRSZ;

748

saw_xdigit = 0;

749

break;

750

}

751

752

return 0;

753

}

754

755

if (saw_xdigit == 1) {

756

if (tp > NS_IN6ADDRSZ - NS_INT16SZ)

757

return 0;

758

tp += NS_INT16SZ;

759

}

760

761

if (colonp != NULL) {

762

if (tp == NS_IN6ADDRSZ)

763

return 0;

764

tp = NS_IN6ADDRSZ;

765

}

766

767

if (tp != NS_IN6ADDRSZ)

768

return 0;

769

770

return 1;

771

}

772

#endif

773

774

643

/* Parse a URL.

775

644

776

645

Return a new struct url if successful, NULL on error. In case of

803

672

if (scheme == SCHEME_INVALID)

804

673

{

805

674

error_code = PE_UNSUPPORTED_SCHEME;

806

goto error;

675

goto err;

807

676

}

808

677

809

678

url_encoded = reencode_escapes (url);

811

680

812

681

p += strlen (supported_schemes[scheme].leading_string);

813

682

uname_b = p;

814

p += url_skip_credentials (p);

683

p = url_skip_credentials (p);

815

684

uname_e = p;

816

685

817

686

/* scheme://user:pass@host[:port]... */

841

710

if (!host_e)

842

711

{

843

712

error_code = PE_UNTERMINATED_IPV6_ADDRESS;

844

goto error;

713

goto err;

845

714

}

846

715

847

716

#ifdef ENABLE_IPV6

849

718

if (!is_valid_ipv6_address(host_b, host_e))

850

719

{

851

720

error_code = PE_INVALID_IPV6_ADDRESS;

852

goto error;

721

goto err;

853

722

}

854

723

855

724

/* Continue parsing after the closing ']'. */

856

725

p = host_e + 1;

857

726

#else

858

727

error_code = PE_IPV6_NOT_SUPPORTED;

859

goto error;

728

goto err;

860

729

#endif

861

730

}

862

731

else

868

737

if (host_b == host_e)

869

738

{

870

739

error_code = PE_EMPTY_HOST;

871

goto error;

740

goto err;

872

741

}

873

742

874

743

port = scheme_default_port (scheme);

883

752

p = strpbrk_or_eos (p, "/;?#");

884

753

port_e = p;

885

754

886

if (port_b == port_e)

887

{

888

/* http://host:/whatever */

889

/* ^ */

890

error_code = PE_BAD_PORT_NUMBER;

891

goto error;

892

}

893

894

for (port = 0, pp = port_b; pp < port_e; pp++)

895

{

896

if (!ISDIGIT (*pp))

755

/* Allow empty port, as per rfc2396. */

756

if (port_b != port_e)

757

{

758

for (port = 0, pp = port_b; pp < port_e; pp++)

897

759

{

898

/* http://host:12randomgarbage/blah */

899

/* ^ */

900

error_code = PE_BAD_PORT_NUMBER;

901

goto error;

760

if (!ISDIGIT (*pp))

761

{

762

/* http://host:12randomgarbage/blah */

763

/* ^ */

764

error_code = PE_BAD_PORT_NUMBER;

765

goto err;

766

}

767

port = 10 * port + (*pp - '0');

768

/* Check for too large port numbers here, before we have

769

a chance to overflow on bogus port values. */

770

if (port > 65535)

771

{

772

error_code = PE_BAD_PORT_NUMBER;

773

goto err;

774

}

902

775

}

903

904

port = 10 * port + (*pp - '0');

905

776

}

906

777

}

907

778

958

829

if (!parse_credentials (uname_b, uname_e - 1, &user, &passwd))

959

830

{

960

831

error_code = PE_INVALID_USER_NAME;

961

goto error;

832

goto err;

962

833

}

963

834

}

964

835

965

u = (struct url *)xmalloc (sizeof (struct url));

966

memset (u, 0, sizeof (*u));

967

836

u = xnew0 (struct url);

968

837

u->scheme = scheme;

969

838

u->host = strdupdelim (host_b, host_e);

970

839

u->port = port;

977

846

978

847

host_modified = lowercase_str (u->host);

979

848

849

/* Decode %HH sequences in host name. This is important not so much

850

to support %HH sequences in host names (which other browsers

851

don't), but to support binary characters (which will have been

852

converted to %HH by reencode_escapes). */

853

if (strchr (u->host, '%'))

854

{

855

url_unescape (u->host);

856

host_modified = 1;

857

}

858

980

859

if (params_b)

981

860

u->params = strdupdelim (params_b, params_e);

982

861

if (query_b)

1001

880

else

1002

881

u->url = url_encoded;

1003

882

}

1004

url_encoded = NULL;

1005

883

1006

884

return u;

1007

885

1008

error:

886

err:

1009

887

/* Cleanup in case of error: */

1010

888

if (url_encoded && url_encoded != url)

1011

889

xfree (url_encoded);

1115

993

url_full_path (const struct url *url)

1116

994

{

1117

995

int length = full_path_length (url);

1118

char *full_path = (char *)xmalloc(length + 1);

996

char *full_path = (char *) xmalloc (length + 1);

1119

997

1120

998

full_path_write (url, full_path);

1121

999

full_path[length] = '\0';

1123

1001

return full_path;

1124

1002

}

1125

1003

1004

/* Unescape CHR in an otherwise escaped STR. Used to selectively

1005

unescape certain characters, such as "/" and ":". STR is modified

1006

in place; nothing is returned. */

1007

1008

static void

1009

unescape_single_char (char *str, char chr)

1010

{

1011

const char c1 = XNUM_TO_DIGIT (chr >> 4);

1012

const char c2 = XNUM_TO_DIGIT (chr & 0xf);

1013

char *h = str; /* hare */

1014

char *t = str; /* tortoise */

1015

for (; *h; h++, t++)

1016

{

1017

if (h[0] == '%' && h[1] == c1 && h[2] == c2)

1018

{

1019

*t = chr;

1020

h += 2;

1021

}

1022

else

1023

*t = *h;

1024

}

1025

*t = '\0';

1026

}

1027

1126

1028

/* Escape unsafe and reserved characters, except for the slash

1127

1029

characters. */

1128

1030

1130

1032

url_escape_dir (const char *dir)

1131

1033

{

1132

1034

char *newdir = url_escape_1 (dir, urlchr_unsafe | urlchr_reserved, 1);

1133

char *h, *t;

1134

1035

if (newdir == dir)

1135

1036

return (char *)dir;

1136

1037

1137

/* Unescape slashes in NEWDIR. */

1138

1139

h = newdir; /* hare */

1140

t = newdir; /* tortoise */

1141

1142

for (; *h; h++, t++)

1143

{

1144

/* url_escape_1 having converted '/' to "%2F" exactly. */

1145

if (*h == '%' && h[1] == '2' && h[2] == 'F')

1146

{

1147

*t = '/';

1148

h += 2;

1149

}

1150

else

1151

*t = *h;

1152

}

1153

*t = '\0';

1154

1038

unescape_single_char (newdir, '/');

1155

1039

return newdir;

1156

1040

}

1157

1041

1188

1072

*p++ = '/';

1189

1073

memcpy (p, efile, filelen);

1190

1074

p += filelen;

1191

*p++ = '\0';

1075

*p = '\0';

1192

1076

}

1193

1077

1194

1078

u->path = newpath;

1229

1113

xfree (url->path);

1230

1114

xfree (url->url);

1231

1115

1232

FREE_MAYBE (url->params);

1233

FREE_MAYBE (url->query);

1234

FREE_MAYBE (url->fragment);

1235

FREE_MAYBE (url->user);

1236

FREE_MAYBE (url->passwd);

1116

xfree_null (url->params);

1117

xfree_null (url->query);

1118

xfree_null (url->fragment);

1119

xfree_null (url->user);

1120

xfree_null (url->passwd);

1237

1121

1238

1122

xfree (url->dir);

1239

1123

xfree (url->file);

1242

1126

}

1243

1127

1244

1128

/* Create all the necessary directories for PATH (a file). Calls

1245

mkdirhier() internally. */

1129

make_directory internally. */

1246

1130

int

1247

1131

mkalldirs (const char *path)

1248

1132

{

1249

1133

const char *p;

1250

1134

char *t;

1251

struct stat st;

1135

struct_stat st;

1252

1136

int res;

1253

1137

1254

1138

p = path + strlen (path);

1370

1254

translate file name back to URL, this would become

1371

1255

crucial. Right now, it's better to be minimal in escaping. */

1372

1256

1373

const static unsigned char filechr_table[256] =

1257

static const unsigned char filechr_table[256] =

1374

1258

{

1375

1259

UWC, C, C, C, C, C, C, C, /* NUL SOH STX ETX EOT ENQ ACK BEL */

1376

1260

C, C, C, C, C, C, C, C, /* BS HT LF VT FF CR SO SI */

1448

1332

e = unescaped + strlen (unescaped);

1449

1333

}

1450

1334

1335

/* Defang ".." when found as component of path. Remember that path

1336

comes from the URL and might contain malicious input. */

1337

if (e - b == 2 && b[0] == '.' && b[1] == '.')

1338

{

1339

b = "%2E%2E";

1340

e = b + 6;

1341

}

1342

1451

1343

/* Walk the PATHEL string and check how many characters we'll need

1452

to add for file quoting. */

1344

to quote. */

1453

1345

quoted = 0;

1454

1346

for (p = b; p < e; p++)

1455

1347

if (FILE_CHAR_TEST (*p, mask))

1456

1348

++quoted;

1457

1349

1458

/* e-b is the string length. Each quoted char means two additional

1350

/* Calculate the length of the output string. e-b is the input

1351

string length. Each quoted char introduces two additional

1459

1352

characters in the string, hence 2*quoted. */

1460

1353

outlen = (e - b) + (2 * quoted);

1461

1354

GROW (dest, outlen);

1462

1355

1463

1356

if (!quoted)

1464

1357

{

1465

/* If there's nothing to quote, we don't need to go through the

1466

string the second time. */

1358

/* If there's nothing to quote, we can simply append the string

1359

without processing it again. */

1467

1360

memcpy (TAIL (dest), b, outlen);

1468

1361

}

1469

1362

else

1530

1423

char *

1531

1424

url_file_name (const struct url *u)

1532

1425

{

1533

struct growable fnres;

1426

struct growable fnres; /* stands for "file name result" */

1534

1427

1535

char *u_file, *u_query;

1428

const char *u_file, *u_query;

1536

1429

char *fname, *unique;

1537

1430

1538

1431

fnres.base = NULL;

1548

1441

directory structure. */

1549

1442

if (opt.dirstruct)

1550

1443

{

1444

if (opt.protocol_directories)

1445

{

1446

if (fnres.tail)

1447

append_char ('/', &fnres);

1448

append_string (supported_schemes[u->scheme].name, &fnres);

1449

}

1551

1450

if (opt.add_hostdir)

1552

1451

{

1553

1452

if (fnres.tail)

1554

1453

append_char ('/', &fnres);

1555

append_string (u->host, &fnres);

1454

if (0 != strcmp (u->host, ".."))

1455

append_string (u->host, &fnres);

1456

else

1457

/* Host name can come from the network; malicious DNS may

1458

allow ".." to be resolved, causing us to write to

1459

"../<file>". Defang such host names. */

1460

append_string ("%2E%2E", &fnres);

1556

1461

if (u->port != scheme_default_port (u->scheme))

1557

1462

{

1558

1463

char portstr[24];

1602

1507

xfree (fname);

1603

1508

return unique;

1604

1509

}

1605

1606

/* Measure the path portion of URL: the number of characters that
   precede the first '?', ';' or '#'.  When none of those delimiters
   occurs, the path extends to the end of the string.  */
static int
path_length (const char *url)
{
  /* strcspn counts the leading characters that are not delimiters,
     which is exactly the distance from URL to its path terminator.  */
  return (int) strcspn (url, "?;#");
}

1615

1616

/* Find the last occurrence of character C in the half-open range
   [B, E) and return a pointer to it, or NULL if C does not occur in
   that range.  This is the equivalent of strrchr (b, c), except that
   it takes an explicit END pointer instead of requiring the string to
   be zero-terminated (i.e. what memrchr would do where available).

   The character at E itself is never examined, and the character at
   B is: E is decremented before each comparison so that the scan
   covers exactly E-1 down to B.  An empty range (E <= B) yields
   NULL.  */
static const char *
find_last_char (const char *b, const char *e, char c)
{
  while (e > b)
    if (*--e == c)
      return e;
  return NULL;
}

1628

1510

1629

1511

/* Resolve "." and ".." elements of PATH by destructively modifying

1630

1512

PATH and return non-zero if PATH has been modified, zero otherwise.

1635

1517

"back up one element". Single leading and trailing slashes are

1636

1518

preserved.

1637

1519

1638

This function does not handle URL escapes explicitly. If you're

1639

passing paths from URLs, make sure to unquote "%2e" and "%2E" to

1640

".", so that this function can find the dots. (Wget's URL parser

1641

calls reencode_escapes, which see.)

1642

1643

1520

For example, "a/b/c/./../d/.." will yield "a/b/". More exhaustive

1644

1521

test examples are provided below. If you change anything in this

1645

1522

function, run test_path_simplify to make sure you haven't broken a

1648

1525

static int

1649

1526

path_simplify (char *path)

1650

1527

{

1651

char *h, *t, *end;

1652

1653

/* Preserve the leading '/'. */

1654

if (path[0] == '/')

1655

++path;

1656

1657

h = path; /* hare */

1658

t = path; /* tortoise */

1659

end = path + strlen (path);

1528

char *h = path; /* hare */

1529

char *t = path; /* tortoise */

1530

char *beg = path; /* boundary for backing the tortoise */

1531

char *end = path + strlen (path);

1660

1532

1661

1533

while (h < end)

1662

1534

{

1670

1542

else if (h[0] == '.' && h[1] == '.' && (h[2] == '/' || h[2] == '\0'))

1671

1543

{

1672

1544

/* Handle "../" by retreating the tortoise by one path

1673

element -- but not past beginning of PATH. */

1674

if (t > path)

1545

element -- but not past beginning. */

1546

if (t > beg)

1675

1547

{

1676

1548

/* Move backwards until T hits the beginning of the

1677

1549

previous path element or the beginning of path. */

1678

for (--t; t > path && t[-1] != '/'; t--)

1550

for (--t; t > beg && t[-1] != '/'; t--)

1679

1551

;

1680

1552

}

1553

else

1554

{

1555

/* If we're at the beginning, copy the "../" literally and

1556

move the beginning so a later ".." doesn't remove

1557

it. */

1558

beg = t + 3;

1559

goto regular;

1560

}

1681

1561

h += 3;

1682

1562

}

1683

else if (*h == '/')

1684

{

1685

/* Ignore empty path elements. Supporting them well is hard

1686

(where do you save "http://x.com///y.html"?), and they

1687

don't bring any practical gain. Plus, they break our

1688

filesystem-influenced assumptions: allowing them would

1689

make "x/y//../z" simplify to "x/y/z", whereas most people

1690

would expect "x/z". */

1691

++h;

1692

}

1693

1563

else

1694

1564

{

1565

regular:

1695

1566

/* A regular path element. If H hasn't advanced past T,

1696

1567

simply skip to the next path element. Otherwise, copy

1697

1568

the path element until the next slash. */

1720

1591

return t != h;

1721

1592

}

1722

1593

1594

/* Measure the path portion of URL: the number of characters that
   precede the first '?', ';' or '#'.  When none of those delimiters
   occurs, the path extends to the end of the string.  */

static int
path_length (const char *url)
{
  /* strcspn counts the leading characters that are not delimiters,
     which is exactly the distance from URL to its path terminator.  */
  return (int) strcspn (url, "?;#");
}

1604

1605

/* Find the last occurrence of character C in the half-open range
   [B, E) and return a pointer to it, or NULL if C does not occur in
   that range.  We might want to use memrchr (a GNU extension) under
   GNU libc.

   The character at E itself is never examined, and the character at
   B is: E is decremented before each comparison so that the scan
   covers exactly E-1 down to B.  An empty range (E <= B) yields
   NULL.  */

static const char *
find_last_char (const char *b, const char *e, char c)
{
  while (e > b)
    if (*--e == c)
      return e;
  return NULL;
}

1617

1723

1618

/* Merge BASE with LINK and return the resulting URI.

1724

1619

1725

1620

Either of the URIs may be absolute or relative, complete with the

1727

1622

foreseeable cases. It only employs minimal URL parsing, without

1728

1623

knowledge of the specifics of schemes.

1729

1624

1730

Perhaps this function should call path_simplify so that the callers

1731

don't have to call url_parse unconditionally. */

1625

I briefly considered making this function call path_simplify after

1626

the merging process, as rfc1738 seems to suggest. This is a bad

1627

idea for several reasons: 1) it complexifies the code, and 2)

1628

url_parse has to simplify path anyway, so it's wasteful to boot. */

1732

1629

1733

1630

char *

1734

1631

uri_merge (const char *base, const char *link)

1878

1775

const char *last_slash = find_last_char (base, end, '/');

1879

1776

if (!last_slash)

1880

1777

{

1881

/* No slash found at all. Append LINK to what we have,

1882

but we'll need a slash as a separator.

1883

1884

Example: if base == "foo" and link == "qux/xyzzy", then

1885

we cannot just append link to base, because we'd get

1886

"fooqux/xyzzy", whereas what we want is

1887

"foo/qux/xyzzy".

1888

1889

To make sure the / gets inserted, we set

1890

need_explicit_slash to 1. We also set start_insert

1891

to end + 1, so that the length calculations work out

1892

correctly for one more (slash) character. Accessing

1893

that character is fine, since it will be the

1894

delimiter, '\0' or '?'. */

1895

/* example: "foo?..." */

1896

/* ^ ('?' gets changed to '/') */

1897

start_insert = end + 1;

1898

need_explicit_slash = 1;

1778

/* No slash found at all. Replace what we have with LINK. */

1779

start_insert = base;

1899

1780

}

1900

1781

else if (last_slash && last_slash >= base + 2

1901

1782

&& last_slash[-2] == ':' && last_slash[-1] == '/')

1949

1830

{

1950

1831

int size;

1951

1832

char *result, *p;

1952

char *quoted_user = NULL, *quoted_passwd = NULL;

1833

char *quoted_host, *quoted_user = NULL, *quoted_passwd = NULL;

1953

1834

1954

1835

int scheme_port = supported_schemes[url->scheme].default_port;

1955

char *scheme_str = supported_schemes[url->scheme].leading_string;

1836

const char *scheme_str = supported_schemes[url->scheme].leading_string;

1956

1837

int fplen = full_path_length (url);

1957

1838

1958

int brackets_around_host = 0;

1839

int brackets_around_host;

1959

1840

1960

1841

assert (scheme_str != NULL);

1961

1842

1972

1853

}

1973

1854

}

1974

1855

1975

if (strchr (url->host, ':'))

1976

brackets_around_host = 1;

1856

/* In the unlikely event that the host name contains non-printable

1857

characters, quote it for displaying to the user. */

1858

quoted_host = url_escape_allow_passthrough (url->host);

1859

1860

/* Undo the quoting of colons that URL escaping performs. IPv6

1861

addresses may legally contain colons, and in that case must be

1862

placed in square brackets. */

1863

if (quoted_host != url->host)

1864

unescape_single_char (quoted_host, ':');

1865

brackets_around_host = strchr (quoted_host, ':') != NULL;

1977

1866

1978

1867

size = (strlen (scheme_str)

1979

+ strlen (url->host)

1868

+ strlen (quoted_host)

1980

1869

+ (brackets_around_host ? 2 : 0)

1981

1870

+ fplen

1982

1871

+ 1);

2005

1894

2006

1895

if (brackets_around_host)

2007

1896

*p++ = '[';

2008

APPEND (p, url->host);

1897

APPEND (p, quoted_host);

2009

1898

if (brackets_around_host)

2010

1899

*p++ = ']';

2011

1900

if (url->port != scheme_port)

2022

1911

2023

1912

if (quoted_user && quoted_user != url->user)

2024

1913

xfree (quoted_user);

2025

if (quoted_passwd && !hide_password

2026

&& quoted_passwd != url->passwd)

1914

if (quoted_passwd && !hide_password && quoted_passwd != url->passwd)

2027

1915

xfree (quoted_passwd);

1916

if (quoted_host != url->host)

1917

xfree (quoted_host);

2028

1918

2029

1919

return result;

2030

1920

}

2074

1964

if (modified != expected_change)

2075

1965

{

2076

1966

if (expected_change == 1)

1967

printf ("Expected modification with path_simplify(\"%s\").\n",

1968

test);

1969

else

2077

1970

printf ("Expected no modification with path_simplify(\"%s\").\n",

2078

1971

test);

2079

else

2080

printf ("Expected modification with path_simplify(\"%s\").\n",

2081

test);

2082

1972

}

2083

1973

xfree (test_copy);

2084

1974

}

2090

1980

char *test, *result;

2091

1981

int should_modify;

2092

1982

} tests[] = {

2093

{ "", "", 0 },

2094

{ ".", "", 1 },

2095

{ "..", "", 1 },

2096

{ "foo", "foo", 0 },

2097

{ "foo/bar", "foo/bar", 0 },

2098

{ "foo///bar", "foo/bar", 1 },

2099

{ "foo/.", "foo/", 1 },

2100

{ "foo/./", "foo/", 1 },

2101

{ "foo./", "foo./", 0 },

2102

{ "foo/../bar", "bar", 1 },

2103

{ "foo/../bar/", "bar/", 1 },

2104

{ "foo/bar/..", "foo/", 1 },

2105

{ "foo/bar/../x", "foo/x", 1 },

2106

{ "foo/bar/../x/", "foo/x/", 1 },

2107

{ "foo/..", "", 1 },

2108

{ "foo/../..", "", 1 },

2109

{ "a/b/../../c", "c", 1 },

2110

{ "./a/../b", "b", 1 }

1983

{ "", "", 0 },

1984

{ ".", "", 1 },

1985

{ "./", "", 1 },

1986

{ "..", "..", 0 },

1987

{ "../", "../", 0 },

1988

{ "foo", "foo", 0 },

1989

{ "foo/bar", "foo/bar", 0 },

1990

{ "foo///bar", "foo///bar", 0 },

1991

{ "foo/.", "foo/", 1 },

1992

{ "foo/./", "foo/", 1 },

1993

{ "foo./", "foo./", 0 },

1994

{ "foo/../bar", "bar", 1 },

1995

{ "foo/../bar/", "bar/", 1 },

1996

{ "foo/bar/..", "foo/", 1 },

1997

{ "foo/bar/../x", "foo/x", 1 },

1998

{ "foo/bar/../x/", "foo/x/", 1 },

1999

{ "foo/..", "", 1 },

2000

{ "foo/../..", "..", 1 },

2001

{ "foo/../../..", "../..", 1 },

2002

{ "foo/../../bar/../../baz", "../../baz", 1 },

2003

{ "a/b/../../c", "c", 1 },

2004

{ "./a/../b", "b", 1 }

2111

2005

};

2112

2006

int i;

2113

2007

2118

2012

int expected_change = tests[i].should_modify;

2119

2013

run_test (test, expected_result, expected_change);

2120

2014

}

2121

2122

/* Now run all the tests with a leading slash before the test case,

2123

to prove that the slash is being preserved. */

2124

for (i = 0; i < countof (tests); i++)

2125

{

2126

char *test, *expected_result;

2127

int expected_change = tests[i].should_modify;

2128

2129

test = xmalloc (1 + strlen (tests[i].test) + 1);

2130

sprintf (test, "/%s", tests[i].test);

2131

2132

expected_result = xmalloc (1 + strlen (tests[i].result) + 1);

2133

sprintf (expected_result, "/%s", tests[i].result);

2134

2135

run_test (test, expected_result, expected_change);

2136

2137

xfree (test);

2138

xfree (expected_result);

2139

}

2140

2015

}

2141

2016

#endif

Older »