1
1
/* Support for Robot Exclusion Standard (RES).
2
Copyright (C) 2001, 2006, 2007, 2008 Free Software Foundation, Inc.
2
Copyright (C) 2001, 2006, 2007, 2008, 2009 Free Software Foundation,
4
5
This file is part of Wget.
267
265
lineend to a location preceding the first comment. Real line
268
266
ending remains in lineend_real. */
269
267
for (lineend = p; lineend < lineend_real; lineend++)
270
if ((lineend == p || ISSPACE (*(lineend - 1)))
268
if ((lineend == p || c_isspace (*(lineend - 1)))
271
269
&& *lineend == '#')
274
272
/* Ignore trailing whitespace in the same way. */
275
while (lineend > p && ISSPACE (*(lineend - 1)))
273
while (lineend > p && c_isspace (*(lineend - 1)))
278
276
assert (!EOL (p));
281
while (!EOL (p) && (ISALNUM (*p) || *p == '-'))
279
while (!EOL (p) && (c_isalnum (*p) || *p == '-'))
416
414
advance the pointer. */
418
416
#define DECODE_MAYBE(c, ptr) do { \
419
if (c == '%' && ISXDIGIT (ptr[1]) && ISXDIGIT (ptr[2])) \
417
if (c == '%' && c_isxdigit (ptr[1]) && c_isxdigit (ptr[2])) \
421
419
char decoded = X2DIGITS_TO_NUM (ptr[1], ptr[2]); \
422
420
if (decoded != '/') \
466
464
if (matches (specs->paths[i].path, path))
468
466
bool allowedp = specs->paths[i].allowedp;
469
DEBUGP (("%s path %s because of rule `%s'.\n",
467
DEBUGP (("%s path %s because of rule %s.\n",
470
468
allowedp ? "Allowing" : "Rejecting",
471
path, specs->paths[i].path));
469
path, quote (specs->paths[i].path)));
535
533
Return true if robots were retrieved OK, false otherwise. */
538
res_retrieve_file (const char *url, char **file)
536
res_retrieve_file (const char *url, char **file, struct iri *iri)
538
struct iri *i = iri_new ();
541
540
char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
542
541
int saved_ts_val = opt.timestamping;
543
int saved_sp_val = opt.spider;
542
int saved_sp_val = opt.spider, url_err;
543
struct url * url_parsed;
545
/* Copy server URI encoding for a possible IDNA transformation, no need to
546
encode the full URI in UTF-8 because "robots.txt" is plain ASCII */
547
set_uri_encoding (i, iri->uri_encoding, false);
548
i->utf8_encode = false;
545
550
logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
547
552
opt.timestamping = false;
548
553
opt.spider = false;
549
err = retrieve_url (robots_url, file, NULL, NULL, NULL, false);
555
url_parsed = url_parse (robots_url, &url_err, iri, true);
558
char *error = url_error (robots_url, url_err);
559
logprintf (LOG_NOTQUIET, "%s: %s.\n", robots_url, error);
565
err = retrieve_url (url_parsed, robots_url, file, NULL, NULL, NULL,
567
url_free(url_parsed);
550
570
opt.timestamping = saved_ts_val;
551
opt.spider = saved_sp_val;
571
opt.spider = saved_sp_val;
552
572
xfree (robots_url);
554
575
if (err != RETROK && *file != NULL)
605
626
{ "http://www.yoyodyne.com/somepath/", false },
606
627
{ "http://www.yoyodyne.com/somepath/robots.txt", false },
609
for (i = 0; i < sizeof(test_array)/sizeof(test_array[0]); ++i)
630
for (i = 0; i < sizeof(test_array)/sizeof(test_array[0]); ++i)
611
mu_assert ("test_is_robots_txt_url: wrong result",
632
mu_assert ("test_is_robots_txt_url: wrong result",
612
633
is_robots_txt_url (test_array[i].url) == test_array[i].expected_result);