1
# -*- Mode:Python; indent-tabs-mode:nil; tab-width:4 -*-
3
"""Parse (absolute and relative) URLs.

See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding,
UC Irvine, June 1995.
"""
9
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
10
"urlsplit", "urlunsplit"]
12
# A classification of schemes ('' means apply by default)
13
uses_relative = ['ftp', 'ftps', 'http', 'gopher', 'nntp',
14
'wais', 'file', 'https', 'shttp', 'mms',
15
'prospero', 'rtsp', 'rtspu', '', 'sftp', 'imap', 'imaps']
16
uses_netloc = ['ftp', 'ftps', 'http', 'gopher', 'nntp', 'telnet',
17
'wais', 'file', 'mms', 'https', 'shttp',
18
'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
19
'svn', 'svn+ssh', 'sftp', 'imap', 'imaps']
20
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
21
'telnet', 'wais', 'snews', 'sip', 'sips', 'imap', 'imaps']
22
uses_params = ['ftp', 'ftps', 'hdl', 'prospero', 'http',
23
'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
24
'mms', '', 'sftp', 'imap', 'imaps']
25
uses_query = ['http', 'wais', 'https', 'shttp', 'mms',
26
'gopher', 'rtsp', 'rtspu', 'sip', 'sips', 'imap', 'imaps', '']
27
uses_fragment = ['ftp', 'ftps', 'hdl', 'http', 'gopher', 'news',
28
'nntp', 'wais', 'https', 'shttp', 'snews',
29
'file', 'prospero', '']
31
# Characters valid in scheme names
32
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
33
'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
41
"""Clear the parse cache."""
46
def _rsplit(str, delim, numsplit):
    """Split ``str`` on ``delim``, working from the right-hand end.

    At most ``numsplit`` splits are made; everything to the left of the
    last split performed is returned unsplit as the first element.  This
    mirrors ``str.rsplit(delim, numsplit)`` for non-negative ``numsplit``
    without requiring that method to exist.

    Arguments:
        str: the string to split (the name shadows the builtin; it is
            kept because it is part of the existing signature).
        delim: the separator string.
        numsplit: maximum number of splits; 0 or less returns ``[str]``.

    Returns a list of strings.
    """
    parts = str.split(delim)
    if len(parts) <= numsplit + 1:
        # Fewer delimiters than the limit allows: keep every piece.
        return parts
    # Glue the surplus left-hand pieces back together so that only the
    # right-most ``numsplit`` delimiters act as split points.
    cut = len(parts) - numsplit
    return [delim.join(parts[:cut])] + parts[cut:]
55
class BaseResult(tuple):
56
"""Base class for the parsed result objects.
58
This provides the attributes shared by the two derived result
59
objects as read-only properties. The derived classes are
60
responsible for checking the right number of arguments were
61
supplied to the constructor.
67
# Attributes that access the basic components of the URL:
71
scheme = property(get_scheme)
75
netloc = property(get_netloc)
79
path = property(get_path)
83
query = property(get_query)
85
def get_fragment(self):
87
fragment = property(get_fragment)
89
# Additional attributes that provide access to parsed-out portions
92
def get_username(self):
95
userinfo = _rsplit(netloc, "@", 1)[0]
97
userinfo = userinfo.split(":", 1)[0]
100
username = property(get_username)
102
def get_password(self):
105
userinfo = _rsplit(netloc, "@", 1)[0]
107
return userinfo.split(":", 1)[1]
109
password = property(get_password)
111
def get_hostname(self):
112
netloc = self.netloc.split('@')[-1]
113
if '[' in netloc and ']' in netloc:
114
return netloc.split(']')[0][1:].lower()
116
return netloc.split(':')[0].lower()
120
return netloc.lower()
121
hostname = property(get_hostname)
124
netloc = self.netloc.split('@')[-1].split(']')[-1]
126
port = netloc.split(":", 1)[1]
129
port = property(get_port)
132
class SplitResult(BaseResult):
136
def __new__(cls, scheme, netloc, path, query, fragment):
137
return BaseResult.__new__(
138
cls, (scheme, netloc, path, query, fragment))
141
return urlunsplit(self)
144
class ParseResult(BaseResult):
148
def __new__(cls, scheme, netloc, path, params, query, fragment):
149
return BaseResult.__new__(
150
cls, (scheme, netloc, path, params, query, fragment))
152
def get_params(self):
154
params = property(get_params)
157
return urlunparse(self)
160
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes.

    ``scheme`` and ``allow_fragments`` are handed unchanged to
    urlsplit().  Parameters are only separated from the path when the
    resulting scheme is listed in ``uses_params`` and the path contains
    a ';'; in every other case the params component is ''."""
    scheme, netloc, url, query, fragment = urlsplit(
        url, scheme, allow_fragments)
    if scheme in uses_params and ';' in url:
        url, params = _splitparams(url)
    else:
        # Always bind params so the constructor call below cannot hit
        # an unassigned local.
        params = ''
    return ParseResult(scheme, netloc, url, params, query, fragment)
174
def _splitparams(url):
    """Separate the ';params' suffix from the last segment of a path.

    Only a ';' that appears after the final '/' starts the parameters;
    a ';' inside an earlier segment is left as part of the path.

    Returns a ``(path, params)`` pair; ``params`` is '' when the last
    segment contains no ';'.
    """
    # rfind() yields -1 when there is no '/', so adding one makes the
    # search start at the beginning of the string in that case.
    i = url.find(';', url.rfind('/') + 1)
    if i < 0:
        return url, ''
    return url[:i], url[i+1:]
183
def _splitnetloc(url, start=0):
    """Cut the network location out of ``url``, beginning at ``start``.

    The netloc ends at a '/', '?' or '#'.  The delimiters are tried in
    that order and the first one that occurs at all is used, even if a
    later-listed delimiter appears further to the left.  When none is
    present the netloc runs to the end of the string.

    Returns ``(url[start:delim], url[delim:])``; the delimiter itself
    stays at the front of the second element.
    """
    for c in '/?#': # the order is important!
        delim = url.find(c, start)
        if delim >= 0:
            break
    else:
        # No terminator found: everything after ``start`` is netloc.
        delim = len(url)
    return url[start:delim], url[delim:]
192
def urlsplit(url, scheme='', allow_fragments=True):
193
"""Parse a URL into 5 components:
194
<scheme>://<netloc>/<path>?<query>#<fragment>
195
Return a 5-tuple: (scheme, netloc, path, query, fragment).
196
Note that we don't break the components up in smaller bits
197
(e.g. netloc is a single string) and we don't expand % escapes."""
198
allow_fragments = bool(allow_fragments)
199
key = url, scheme, allow_fragments
200
cached = _parse_cache.get(key, None)
203
if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
205
netloc = query = fragment = ''
208
if url[:i] == 'http': # optimize the common case
209
scheme = url[:i].lower()
212
netloc, url = _splitnetloc(url, 2)
213
if allow_fragments and '#' in url:
214
url, fragment = url.split('#', 1)
216
url, query = url.split('?', 1)
217
v = SplitResult(scheme, netloc, url, query, fragment)
218
_parse_cache[key] = v
221
if c not in scheme_chars:
224
scheme, url = url[:i].lower(), url[i+1:]
225
if scheme in uses_netloc and url[:2] == '//':
226
netloc, url = _splitnetloc(url, 2)
227
if allow_fragments and scheme in uses_fragment and '#' in url:
228
url, fragment = url.split('#', 1)
229
if scheme in uses_query and '?' in url:
230
url, query = url.split('?', 1)
231
v = SplitResult(scheme, netloc, url, query, fragment)
232
_parse_cache[key] = v
235
def urlunparse(data):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent).

    ``data`` is a 6-item sequence
    (scheme, netloc, url, params, query, fragment).  The params are
    re-attached to the path with ';' and the rest is delegated to
    urlunsplit()."""
    # Unpack in the body rather than in the signature; callers still
    # pass a single 6-item sequence.
    scheme, netloc, url, params, query, fragment = data
    if params:
        # Skip the ';' entirely for empty params so no redundant
        # delimiter is emitted.
        url = "%s;%s" % (url, params)
    return urlunsplit((scheme, netloc, url, query, fragment))
244
def urlunsplit(data):
    """Combine the pieces of a split URL into a single string.

    ``data`` is a 5-item sequence
    (scheme, netloc, url, query, fragment).  Empty scheme, query and
    fragment components are omitted together with their delimiter.

    Returns the assembled URL string.
    """
    # Unpack in the body rather than in the signature; callers still
    # pass a single 5-item sequence.
    scheme, netloc, url, query, fragment = data
    if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
        # A netloc must be followed by an absolute path.
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url
256
def urljoin(base, url, allow_fragments=True):
257
"""Join a base URL and a possibly relative URL to form an absolute
258
interpretation of the latter."""
263
bscheme, bnetloc, bpath, bparams, bquery, bfragment = urlparse(base, '', allow_fragments) #@UnusedVariable
264
scheme, netloc, path, params, query, fragment = urlparse(url, bscheme, allow_fragments)
265
if scheme != bscheme or scheme not in uses_relative:
267
if scheme in uses_netloc:
269
return urlunparse((scheme, netloc, path,
270
params, query, fragment))
273
return urlunparse((scheme, netloc, path,
274
params, query, fragment))
275
if not (path or params or query):
276
return urlunparse((scheme, netloc, bpath,
277
bparams, bquery, fragment))
278
segments = bpath.split('/')[:-1] + path.split('/')
279
# XXX The stuff below is bogus in various ways...
280
if segments[-1] == '.':
282
while '.' in segments:
286
n = len(segments) - 1
288
if (segments[i] == '..'
289
and segments[i-1] not in ('', '..')):
290
del segments[i-1:i+1]
295
if segments == ['', '..']:
297
elif len(segments) >= 2 and segments[-1] == '..':
299
return urlunparse((scheme, netloc, '/'.join(segments),
300
params, query, fragment))
303
"""Removes any existing fragment from URL.
305
Returns a tuple of the defragmented URL and the fragment. If
306
the URL contained no fragments, the second element is the
310
s, n, p, a, q, frag = urlparse(url)
311
defrag = urlunparse((s, n, p, a, q, ''))
321
http:g = <URL:http://a/b/c/g>
322
http: = <URL:http://a/b/c/d>
323
g = <URL:http://a/b/c/g>
324
./g = <URL:http://a/b/c/g>
325
g/ = <URL:http://a/b/c/g/>
326
/g = <URL:http://a/g>
328
?y = <URL:http://a/b/c/d?y>
329
g?y = <URL:http://a/b/c/g?y>
330
g?y/./x = <URL:http://a/b/c/g?y/./x>
331
. = <URL:http://a/b/c/>
332
./ = <URL:http://a/b/c/>
333
.. = <URL:http://a/b/>
334
../ = <URL:http://a/b/>
335
../g = <URL:http://a/b/g>
336
../.. = <URL:http://a/>
337
../../g = <URL:http://a/g>
338
../../../g = <URL:http://a/../g>
339
./../g = <URL:http://a/b/g>
340
./g/. = <URL:http://a/b/c/g/>
341
/./g = <URL:http://a/./g>
342
g/./h = <URL:http://a/b/c/g/h>
343
g/../h = <URL:http://a/b/c/h>
344
http:g = <URL:http://a/b/c/g>
345
http: = <URL:http://a/b/c/d>
346
http:?y = <URL:http://a/b/c/d?y>
347
http:g?y = <URL:http://a/b/c/g?y>
348
http:g?y/./x = <URL:http://a/b/c/g?y/./x>
362
from cStringIO import StringIO
364
from StringIO import StringIO
365
fp = StringIO(test_input)
373
parts = urlparse(url)
374
print '%-10s : %s' % (url, parts)
375
abs = urljoin(base, url)
378
wrapped = '<URL:%s>' % abs
379
print '%-10s = %s' % (url, wrapped)
380
if len(words) == 3 and words[1] == '=':
381
if wrapped != words[2]:
382
print 'EXPECTED', words[2], '!!!!!!!!!!'
384
if __name__ == '__main__':