1
"""Parse (absolute and relative) URLs.
3
urlparse module is based upon the following RFC specifications.
5
RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding
6
and L. Masinter, January 2005.
8
RFC 2732 : "Format for Literal IPv6 Addresses in URL's by R.Hinden, B.Carpenter
9
and L.Masinter, December 1999.
11
RFC 2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T.
12
Berners-Lee, R. Fielding, and L. Masinter, August 1998.
14
RFC 2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zwinski, July 1998.
16
RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June
19
RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M.
20
McCahill, December 1994
22
RFC 3986 is considered the current standard and any future changes to
23
urlparse module should conform with it. The urlparse module is
24
currently not entirely compliant with this RFC due to defacto
25
scenarios for parsing, and for backward compatibility purposes, some
26
parsing quirks from older RFCs are retained. The testcases in
27
test_urlparse.py provides a good indicator of parsing behavior.
31
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
32
"urlsplit", "urlunsplit", "parse_qs", "parse_qsl"]
# A classification of schemes ('' means apply by default).
# Schemes that support relative-reference resolution via urljoin().
uses_relative = [
    'ftp', 'http', 'gopher', 'nntp', 'imap', 'wais', 'file', 'https',
    'shttp', 'mms', 'prospero', 'rtsp', 'rtspu', '', 'sftp',
]
# Schemes that use a //netloc authority component.
uses_netloc = [
    'ftp', 'http', 'gopher', 'nntp', 'telnet', 'imap', 'wais', 'file',
    'mms', 'https', 'shttp', 'snews', 'prospero', 'rtsp', 'rtspu',
    'rsync', '', 'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh',
]
# Schemes whose paths are opaque (no hierarchical structure).
non_hierarchical = [
    'gopher', 'hdl', 'mailto', 'news', 'telnet', 'wais', 'imap',
    'snews', 'sip', 'sips',
]
# Schemes whose last path segment may carry ';'-delimited parameters.
# NOTE: the literal was truncated in this copy; the final line is restored
# from the upstream module so it is syntactically complete.
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', '', 'sftp']
# Schemes that support a ?query component.
uses_query = [
    'http', 'wais', 'imap', 'https', 'shttp', 'mms', 'gopher',
    'rtsp', 'rtspu', 'sip', 'sips', '',
]
# Schemes that support a #fragment component.
uses_fragment = [
    'ftp', 'hdl', 'http', 'gopher', 'news', 'nntp', 'wais', 'https',
    'shttp', 'snews', 'file', 'prospero', '',
]
53
# Characters valid in scheme names
54
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
55
'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
63
"""Clear the parse cache."""
67
class ResultMixin(object):
68
"""Shared methods for the parsed result objects."""
74
userinfo = netloc.rsplit("@", 1)[0]
76
userinfo = userinfo.split(":", 1)[0]
84
userinfo = netloc.rsplit("@", 1)[0]
86
return userinfo.split(":", 1)[1]
91
netloc = self.netloc.split('@')[-1]
92
if '[' in netloc and ']' in netloc:
93
return netloc.split(']')[0][1:].lower()
95
return netloc.split(':')[0].lower()
103
netloc = self.netloc.split('@')[-1].split(']')[-1]
105
port = netloc.split(':')[1]
from collections import namedtuple


class SplitResult(namedtuple('SplitResult', 'scheme netloc path query fragment'), ResultMixin):
    """5-tuple returned by urlsplit(), with the ResultMixin accessors."""

    # No per-instance __dict__; all state lives in the tuple.
    __slots__ = ()

    def geturl(self):
        """Reassemble the original URL from the split components."""
        return urlunsplit(self)
class ParseResult(namedtuple('ParseResult', 'scheme netloc path params query fragment'), ResultMixin):
    """6-tuple returned by urlparse(), with the ResultMixin accessors."""

    # No per-instance __dict__; all state lives in the tuple.
    __slots__ = ()

    def geturl(self):
        """Reassemble the original URL from the parsed components."""
        return urlunparse(self)
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    # Delegate the 5-way split, then peel ';params' off the path for
    # schemes that use them.  (Renamed the local from `tuple`, which
    # shadowed the builtin.)
    split = urlsplit(url, scheme, allow_fragments)
    scheme, netloc, url, query, fragment = split
    if scheme in uses_params and ';' in url:
        url, params = _splitparams(url)
    else:
        # No parameter section for this scheme; keep the slot empty.
        params = ''
    return ParseResult(scheme, netloc, url, params, query, fragment)
142
def _splitparams(url):
144
i = url.find(';', url.rfind('/'))
149
return url[:i], url[i+1:]
151
def _splitnetloc(url, start=0):
152
delim = len(url) # position of end of domain part of url, default is end
153
for c in '/?#': # look for delimiters; the order is NOT important
154
wdelim = url.find(c, start) # find first of this delim
155
if wdelim >= 0: # if found
156
delim = min(delim, wdelim) # use earliest delim position
157
return url[start:delim], url[delim:] # return (domain, rest)
def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    allow_fragments = bool(allow_fragments)
    # Cache key includes the argument types so str/unicode inputs don't
    # collide with each other.
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE: # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    i = url.find(':')
    if i > 0:
        if url[:i] == 'http': # optimize the common case
            scheme = url[:i].lower()
            url = url[i+1:]
            if url[:2] == '//':
                netloc, url = _splitnetloc(url, 2)
                # A bracket without its mate means a malformed IPv6 literal.
                if (('[' in netloc and ']' not in netloc) or
                        (']' in netloc and '[' not in netloc)):
                    raise ValueError("Invalid IPv6 URL")
            if allow_fragments and '#' in url:
                url, fragment = url.split('#', 1)
            if '?' in url:
                url, query = url.split('?', 1)
            v = SplitResult(scheme, netloc, url, query, fragment)
            _parse_cache[key] = v
            return v
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            try:
                # make sure "url" is not actually a port number (in which case
                # "scheme" is really part of the path)
                _testportnum = int(url[i+1:])
            except ValueError:
                scheme, url = url[:i].lower(), url[i+1:]

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        if (('[' in netloc and ']' not in netloc) or
                (']' in netloc and '[' not in netloc)):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and scheme in uses_fragment and '#' in url:
        url, fragment = url.split('#', 1)
    if scheme in uses_query and '?' in url:
        url, query = url.split('?', 1)
    v = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = v
    return v
def urlunparse(data):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent)."""
    scheme, netloc, url, params, query, fragment = data
    if params:
        # Re-attach the ';params' section to the path before unsplitting.
        url = "%s;%s" % (url, params)
    return urlunsplit((scheme, netloc, url, query, fragment))
def urlunsplit(data):
    """Combine the elements of a tuple as returned by urlsplit() into a
    complete URL as a string. The data argument can be any five-item iterable.
    This may result in a slightly different, but equivalent URL, if the URL that
    was parsed originally had unnecessary delimiters (for example, a ? with an
    empty query; the RFC states that these are equivalent)."""
    scheme, netloc, url, query, fragment = data
    # Emit '//netloc' when there is an authority, or when the scheme
    # expects one and the path doesn't already begin with '//'.
    if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
        if url and url[:1] != '/': url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter."""
    # Trivial cases: an empty base or an empty reference.
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        # Different scheme, or one that doesn't support relative refs:
        # the reference stands on its own.
        return url
    if scheme in uses_netloc:
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        # Inherit the authority from the base.
        netloc = bnetloc
    if path[:1] == '/':
        # Absolute path: keep it as-is.
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path and not params:
        # Reference is query/fragment-only: inherit path (and query when
        # the reference has none) from the base.
        path = bpath
        params = bparams
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    # Merge the base's directory with the relative path, then resolve
    # '.' and '..' segments.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    if '#' in url:
        # Round-trip through urlparse/urlunparse with the fragment blanked.
        s, n, p, a, q, frag = urlparse(url)
        defrag = urlunparse((s, n, p, a, q, ''))
        return defrag, frag
    else:
        return url, ''
308
# unquote method for parse_qs and parse_qsl
309
# Cannot use directly from urllib as it would create a circular reference
310
# because urllib uses urlparse methods (urljoin). If you update this function,
311
# update it also in urllib. This code duplication does not existin in Python3.
313
_hexdig = '0123456789ABCDEFabcdef'
314
_hextochr = dict((a+b, chr(int(a+b,16)))
315
for a in _hexdig for b in _hexdig)
def unquote(s):
    """unquote('abc%20def') -> 'abc def'."""
    res = s.split('%')
    # fastpath: no percent-escapes at all.
    if len(res) == 1:
        return s
    s = res[0]
    for item in res[1:]:
        try:
            s += _hextochr[item[:2]] + item[2:]
        except KeyError:
            # Not a valid two-digit escape: leave the '%' literal.
            s += '%' + item
        except UnicodeDecodeError:
            # Mixing a unicode string with a non-ASCII byte (Python 2);
            # fall back to the equivalent unicode code point.
            s += unichr(int(item[:2], 16)) + item[2:]
    return s
def parse_qs(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query given as a string argument.

        Arguments:

        qs: percent-encoded query string to be parsed

        keep_blank_values: flag indicating whether blank values in
            percent-encoded queries should be treated as blank strings.
            A true value indicates that blanks should be retained as
            blank strings.  The default false value indicates that
            blank values are to be ignored and treated as if they were
            not included.

        strict_parsing: flag indicating what to do with parsing errors.
            If false (the default), errors are silently ignored.
            If true, errors raise a ValueError exception.
    """
    # Group repeated field names into lists.  (Renamed the local from
    # `dict`, which shadowed the builtin.)
    result = {}
    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
        if name in result:
            result[name].append(value)
        else:
            result[name] = [value]
    return result
359
def parse_qsl(qs, keep_blank_values=0, strict_parsing=0):
360
"""Parse a query given as a string argument.
364
qs: percent-encoded query string to be parsed
366
keep_blank_values: flag indicating whether blank values in
367
percent-encoded queries should be treated as blank strings. A
368
true value indicates that blanks should be retained as blank
369
strings. The default false value indicates that blank values
370
are to be ignored and treated as if they were not included.
372
strict_parsing: flag indicating what to do with parsing errors. If
373
false (the default), errors are silently ignored. If true,
374
errors raise a ValueError exception.
376
Returns a list, as G-d intended.
378
pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
380
for name_value in pairs:
381
if not name_value and not strict_parsing:
383
nv = name_value.split('=', 1)
386
raise ValueError, "bad query field: %r" % (name_value,)
387
# Handle case of a control-name with no equal sign
388
if keep_blank_values:
392
if len(nv[1]) or keep_blank_values:
393
name = unquote(nv[0].replace('+', ' '))
394
value = unquote(nv[1].replace('+', ' '))
395
r.append((name, value))