2
* (c) 2002 Mikulas Patocka
3
* This file is part of the Links project, released under GPL.
5
* Modified by Karl Dahlke for integration with edbrowse.
15
bool need_slash_after_host;
18
"file", 0, true, true, false}, {
19
"http", 80, false, true, true}, {
20
"https", 443, false, true, true}, {
21
"proxy", 3128, false, true, true}, {
22
"ftp", 21, false, true, true}, {
23
"rtsp", 554, false, true, true}, {
24
"pnm", 7070, false, true, true}, {
25
"finger", 79, false, true, true}, {
26
"smb", 139, false, true, true}, {
27
"mailto", 0, false, false, false}, {
28
"telnet", 23, false, false, false}, {
29
"tn3270", 0, false, false, false}, {
30
"javascript", 0, true, false, false}, {
34
static bool free_syntax;
37
protocolByName(const char *p, int l)
40
for(i = 0; protocols[i].prot; i++)
41
if(memEqualCI(protocols[i].prot, p, l))
44
} /* protocolByName */
46
/* Unpercent the host component of a url. Christ what a pain! */
48
unpercentURL(char *url)
55
if(c == '%' && isxdigit(u[0]) && isxdigit(u[1])) {
56
c = fromHex(u[0], u[1]);
60
c = ' '; /* should never happen */
69
if(w[-2] != ':' && w[-2] != '/')
75
/* Decide if it looks like a web url. */
77
httpDefault(const char *url)
79
static const char *const domainSuffix[] = {
80
"com", "biz", "info", "net", "org", "gov", "edu", "us", "uk", "au",
81
"ca", "de", "jp", "nz", 0
84
const char *s, *lastdot, *end = url + strcspn(url, "/?#\1");
85
if(end - url > 7 && stringEqual(end - 7, ".browse"))
87
s = strrchr(url, ':');
89
const char *colon = s;
91
while(isdigitByte(*s))
96
/* need at least two embedded dots */
98
for(s = url + 1; s < end - 1; ++s)
99
if(*s == '.' && s[-1] != '.' && s[1] != '.')
103
/* All digits and dots, like an IP address, is ok. */
105
for(s = url; s < end; ++s)
106
if(!isdigitByte(*s) && *s != '.')
111
/* Look for standard domain suffix */
114
for(n = 0; domainSuffix[n]; ++n)
115
if(memEqualCI(lastdot, domainSuffix[n], len) && !domainSuffix[n][len])
117
/* www.anything.xx is ok */
118
if(len == 2 && memEqualCI(url, "www.", 4))
124
parseURL(const char *url, const char **proto, int *prlen, const char **user, int *uslen, const char **pass, int *palen, /* ftp protocol */
125
const char **host, int *holen,
126
const char **portloc, int *port,
127
const char **data, int *dalen, const char **post)
163
/* Find the leading protocol:// */
165
p = strchr(url, ':');
167
/* You have to have something after the colon */
173
while(isspaceByte(*q))
177
a = protocolByName(url, p - url);
184
if(p[1] != '/' || p[2] != '/') {
185
if(protocols[a].need_slashes) {
187
setError("%s:// expected", protocols[a].prot);
190
/* We got one out of two slashes, I'm going to call it good */
196
} else { /* nothing yet */
197
if(p && p - url < 12 && p[1] == '/') {
198
for(q = url; q < p; ++q)
201
if(q == p) { /* some protocol we don't know */
203
memcpy(qprot, url, p - url);
205
setError("unrecognized protocol %s", qprot);
209
if(httpDefault(url)) {
210
static const char http[] = "http://";
223
if(free_syntax = protocols[a].free_syntax) {
231
q = p + strcspn(p, "@?#/\1");
232
if(*q == '@') { /* user:password@host */
233
const char *pp = strchr(p, ':');
234
if(!pp || pp > q) { /* no password */
252
q = p + strcspn(p, ":?#/\1");
257
if(*q == ':') { /* port specified */
259
const char *cc, *pp = q + strcspn(q, "/?#\1");
260
n = strtol(q + 1, (char **)&cc, 10);
261
if(cc != pp || !isdigitByte(q[1])) {
262
setError("invalid :port specifier at the end of the domain");
269
q = pp; /* up to the slash */
272
*port = protocols[a].port;
275
/* Skip past /, but not ? or # */
280
/* post data is handled separately */
281
q = p + strcspn(p, "\1");
287
*post = *q ? q + 1 : NULL;
292
isURL(const char *url)
294
int j = parseURL(url, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
300
/* Helper functions to return pieces of the URL.
301
* Makes a copy, so you can have your 0 on the end.
302
* Return 0 for an error, and "" if that piece is missing. */
305
getProtURL(const char *url)
310
int rc = parseURL(url, &s, &l, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
318
static char hostbuf[400];
320
getHostURL(const char *url)
326
int rc = parseURL(url, 0, 0, 0, 0, 0, 0, &s, &l, 0, 0, 0, 0, 0);
333
if(l >= sizeof (hostbuf)) {
334
setError("domain name too long");
337
memcpy(hostbuf, s, l);
338
if(l && hostbuf[l - 1] == '.')
341
/* domain names must be ascii, with no spaces */
343
for(s = t = hostbuf; (c = *s); ++s) {
347
if(c == '.' && d == '.')
356
getHostPassURL(const char *url)
359
const char *h, *z, *u;
361
int rc = parseURL(url, 0, 0, &u, 0, 0, 0, &h, &hl, 0, 0, 0, 0, 0);
369
z = u, hl += h - u, t += h - u;
370
if(hl >= sizeof (hostbuf)) {
371
setError("domain name too long");
374
memcpy(hostbuf, z, hl);
376
/* domain names must be ascii */
380
} /* getHostPassURL */
383
getUserURL(const char *url)
385
static char buf[MAXUSERPASS];
388
int rc = parseURL(url, 0, 0, &s, &l, 0, 0, 0, 0, 0, 0, 0, 0, 0);
395
if(l >= sizeof (buf)) {
396
setError("user name too long");
405
getPassURL(const char *url)
407
static char buf[MAXUSERPASS];
410
int rc = parseURL(url, 0, 0, 0, 0, &s, &l, 0, 0, 0, 0, 0, 0, 0);
417
if(l >= sizeof (buf)) {
418
setError("password too long");
427
getDataURL(const char *url)
430
int rc = parseURL(url, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, &s, 0, 0);
437
getDirURL(const char *url, const char **start_p, const char **end_p)
439
const char *dir = getDataURL(url);
441
static const char myslash[] = "/";
442
if(!dir || dir == url)
446
if(!strchr("#?\1", *dir)) {
448
errorPrint("1misplaced / in getDirURL(%s)", url);
450
end = strpbrk(dir, "#?\1");
452
end = dir + strlen(dir);
453
while(end > dir && end[-1] != '/')
462
*end_p = myslash + 1;
466
getPortLocURL(const char *url, const char **portloc, int *port)
468
int rc = parseURL(url, 0, 0, 0, 0, 0, 0, 0, 0, portloc, port, 0, 0, 0);
474
} /* getPortLocURL */
477
getPortURL(const char *url)
480
int rc = parseURL(url, 0, 0, 0, 0, 0, 0, 0, 0, 0, &port, 0, 0, 0);
489
isProxyURL(const char *url)
491
return ((url[0] | 0x20) == 'p');
494
/* Don't let a web page fetch itself. */
495
static char *histURL[MAXFETCH];
496
static int histFrom[MAXFETCH];
499
fetchHistory(const char *prev, const char *next)
503
/* zero is a reset */
504
debugPrint(4, "fetch hist %s : %s", prev, next);
506
for(i = 0; i < n_fetch; ++i)
512
if(memEqualCI(prev, "http://", 7))
514
for(i = 0; i < n_fetch; ++i)
515
if(stringEqual(prev, histURL[i])) {
520
if(n_fetch >= MAXFETCH) {
522
("too many fetches from the internet, you may want to disable `redirect html'");
525
/* Have we seen this one before? */
526
if(memEqualCI(next, "http://", 7))
528
for(i = 0; i < n_fetch; ++i)
529
if(stringEqual(next, histURL[i]))
531
if(i == n_fetch) { /* new */
532
histURL[i] = cloneString(next);
537
/* Oops, we've already fetched this page. */
538
while(from >= 0 && from != i)
539
from = histFrom[from];
542
setError("web page indirectly fetches itself, an infinite loop");
555
squashDirectories(char *url)
557
char *dd = (char *)getDataURL(url);
559
if(memEqualCI(url, "javascript:", 11))
565
if(strchr("#?\1", *dd))
569
errorPrint("@misplaced / in %s", url);
570
end = dd + strcspn(dd, "?#\1");
572
s = strstr(dd, "/./");
577
s = strstr(dd, "/../");
586
for(t = s - 1; *t != '/'; --t) ;
590
} /* squashDirectories */
593
resolveURL(const char *base, const char *rel)
595
char *n; /* new url */
601
n = allocMem(strlen(base) + strlen(rel) + 12);
602
debugPrint(5, "resolve(%s|%s)", base, rel);
604
/* # alone means do nothing. */
608
debugPrint(5, "= %s", n);
611
/* We could have changed the base url via the <base> tag,
612
* so this #ref could actually refer to some other web page.
613
* Best to run through standard procedure. */
615
for(q = n; *q && *q != '\1' && *q != '#'; q++) ;
619
if(rel[0] == '?' || rel[0] == '\1') {
621
for(q = n; *q && *q != '\1' && *q != '#' && *q != '?'; q++) ;
625
if(rel[0] == '/' && rel[1] == '/') {
626
if(s = strstr(base, "//")) {
627
strncpy(n, base, s - base);
634
if(parseURL(rel, &s, &l, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) > 0) {
637
/* It didn't have http in front of it before, put it on now. */
639
strcpy(n + l, "://");
643
} /* relative is already a url */
646
s = getDataURL(base);
652
if(s - base >= 7 && stringEqual(s - 7, ".browse"))
654
if(s > base && s[-1] == '/')
656
} else if(!strchr("#?\1", *s))
663
/* This is a relative change, paste it on after the last slash */
665
if(parseURL(base, 0, 0, 0, 0, 0, 0, &p, 0, 0, 0, 0, 0, 0) > 0 && p)
667
for(p = 0; *s; ++s) {
670
if(strchr("#?\1", *s))
686
squashDirectories(n);
690
/* This routine could be, should be, more sophisticated */
692
sameURL(const char *s, const char *t)
696
/* It's ok if one says http and the other implies it. */
697
if(memEqualCI(s, "http://", 7))
699
if(memEqualCI(t, "http://", 7))
701
u = s + strcspn(s, "#");
702
v = t + strcspn(t, "#?\1");
703
if(u - s >= 7 && stringEqual(u - 7, ".browse"))
705
if(v - t >= 7 && stringEqual(v - 7, ".browse"))
710
return !memcmp(s, t, l);
713
/* Find some helpful text to print, in place of an image. */
714
/* Text longer than 80 chars isn't helpful, so we return a static buffer. */
716
altText(const char *base)
722
debugPrint(5, "altText(%s)", base);
725
if(stringEqual(base, "#"))
727
if(memEqualCI(base, "javascript", 10))
732
strncpy(buf, base, sizeof (buf) - 1);
733
spaceCrunch(buf, true, true);
735
if(len && !isalnumByte(buf[len - 1]))
737
while(len && !isalnumByte(buf[0]))
738
strcpy(buf, buf + 1), --len;
740
/* see if it's a phrase/sentence or a pathname/url */
741
/* Do this by counting spaces */
742
for(n = 0, s = buf; *s; ++s)
746
return buf; /* looks like words */
747
/* Ok, now we believe it's a pathname or url */
748
/* Get rid of everything after ? or # */
749
s = strpbrk(buf, "#?\1");
752
/* get rid of common suffix */
753
s = strrchr(buf, '.');
755
/* get rid of trailing .html */
756
static const char *const suffix[] = {
757
"html", "htm", "shtml", "shtm", "php", "asp", "cgi", "rm",
762
n = stringInListCI(suffix, s + 1);
763
if(n >= 0 || s[1] == 0)
766
/* Get rid of everything up to the last slash, leaving the file name */
767
s = strrchr(buf, '/');
771
ss = strrchr(buf, '/');
774
if(ss > buf && ss[-1] == '/')
781
} /* more than ten characters */
783
/* If we don't have enough letters, forget it */
787
for(n = 0, s = buf; *s; ++s)
791
return 0; /* not enough letters */
795
/* get post data ready for a url. */
797
encodePostData(const char *s)
807
post = initString(&l);
815
if(strchr("-._~*()!", c))
817
sprintf(buf, "%%%02X", (uchar) c);
818
stringAndString(&post, &l, buf);
821
stringAndChar(&post, &l, c);
824
} /* encodePostData */
827
dohex(char c, const char **sp)
837
if(!isxdigit(d) || !isxdigit(e))
838
return c; /* should never happen */
841
d = ' '; /* don't allow nulls */
847
decodePostData(const char *data, const char *name, int seqno)
849
const char *s, *n, *t;
850
char *new = 0, *w = 0;
855
errorPrint("@decodePostData(0,0)");
857
for(s = data; *s; s = (*t ? t + 1 : t)) {
862
/* select attribute by number */
865
w = new = allocMem(t - s + 1);
870
while(s < t && (c = *s) != '=') {
874
/* I don't know if this is supposed to be case insensitive all the time,
875
* though there are situations when it must be, as in
876
* mailto:address?Subject=blah-blah */
878
if(!((c ^ *n) & 0xdf))
891
if(s == t) { /* no equals, just a string */
899
++s; /* skip past equals */
905
w = new = allocMem(t - s + 1);
908
/* At this point we have a match */
919
} /* decodePostData */
922
decodeMailURL(const char *url, char **addr_p, char **subj_p, char **body_p)
926
if(memEqualCI(url, "mailto:", 7))
928
s = url + strcspn(url, "/?");
930
*addr_p = pullString1(url, s);
935
s = strchr(url, '?');
940
*subj_p = decodePostData(url, "subject", 0);
942
*body_p = decodePostData(url, "body", 0);
943
} /* decodeMailURL */