1
# Copyright 2008 Canonical Ltd.
3
# This file is part of launchpadlib.
5
# launchpadlib is free software: you can redistribute it and/or modify
6
# it under the terms of the GNU Lesser General Public License as
7
# published by the Free Software Foundation, either version 3 of the
8
# License, or (at your option) any later version.
10
# launchpadlib is distributed in the hope that it will be useful, but
11
# WITHOUT ANY WARRANTY; without even the implied warranty of
12
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
# Lesser General Public License for more details.
15
# You should have received a copy of the GNU Lesser General Public
16
# License along with launchpadlib. If not, see
17
# <http://www.gnu.org/licenses/>.
19
"""Functions for working with generic syntax URIs."""
31
# Default port numbers for different URI schemes
32
# The registered URI schemes come from
33
# http://www.iana.org/assignments/uri-schemes.html
34
# The default ports come from the relevant RFCs
68
'xmlrpc.beeps': '602',
76
# Common but unregistered schemes
86
# Regular expressions adapted from the ABNF in the RFC
88
# Each pattern below corresponds to one ABNF production of RFC 3986.  The
# pieces are written in lower case, composed bottom-up with
# %-interpolation, and compiled with re.IGNORECASE at the end.
scheme_re = r"(?P<scheme>[a-z][-a-z0-9+.]*)"

userinfo_re = r"(?P<userinfo>(?:[-a-z0-9._~!$&\'()*+,;=:]|%[0-9a-f]{2})*)"
# The following regular expression will match some IP address style
# host names that the RFC would not (e.g. leading zeros on the
# components), but is significantly simpler.
# Alternatives, in order: IPv4address, reg-name, IP-literal.
# NOTE(review): the bracketed IP-literal alternative is a deliberately
# loose approximation of the RFC's IPv6address / IPvFuture grammar.
host_re = (r"(?P<host>[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}|"
           r"(?:[-a-z0-9._~!$&\'()*+,;=]|%[0-9a-f]{2})*|"
           r"\[[0-9a-z:.]+\])")
port_re = r"(?P<port>[0-9]*)"

authority_re = r"(?P<authority>(?:%s@)?%s(?::%s)?)" % (
    userinfo_re, host_re, port_re)

path_abempty_re = r"(?:/(?:[-a-z0-9._~!$&\'()*+,;=:@]|%[0-9a-f]{2})*)*"
# path-noscheme: the first segment may not contain a colon, so that a
# relative reference cannot be mistaken for "scheme:...".
path_noscheme_re = (r"(?:[-a-z0-9._~!$&\'()*+,;=@]|%[0-9a-f]{2})+"
                    r"(?:/(?:[-a-z0-9._~!$&\'()*+,;=:@]|%[0-9a-f]{2})*)*")
path_rootless_re = (r"(?:[-a-z0-9._~!$&\'()*+,;=:@]|%[0-9a-f]{2})+"
                    r"(?:/(?:[-a-z0-9._~!$&\'()*+,;=:@]|%[0-9a-f]{2})*)*")
path_absolute_re = r"/(?:%s)?" % path_rootless_re
# path-empty is zero characters.
path_empty_re = r""

hier_part_re = r"(?P<hierpart>//%s%s|%s|%s|%s)" % (
    authority_re, path_abempty_re, path_absolute_re, path_rootless_re,
    path_empty_re)

relative_part_re = r"(?P<relativepart>//%s%s|%s|%s|%s)" % (
    authority_re, path_abempty_re, path_absolute_re, path_noscheme_re,
    path_empty_re)

# Additionally we also permit square braces in the query portion to
# accommodate real-world URIs.
query_re = r"(?P<query>(?:[-a-z0-9._~!$&\'()*+,;=:@/?\[\]]|%[0-9a-f]{2})*)"
fragment_re = r"(?P<fragment>(?:[-a-z0-9._~!$&\'()*+,;=:@/?]|%[0-9a-f]{2})*)"

# Both full patterns are anchored at the end with "$"; callers anchor the
# start by using .match().
uri_re = r"%s:%s(?:\?%s)?(?:#%s)?$" % (
    scheme_re, hier_part_re, query_re, fragment_re)

relative_ref_re = r"%s(?:\?%s)?(?:#%s)?$" % (
    relative_part_re, query_re, fragment_re)

uri_pat = re.compile(uri_re, re.IGNORECASE)
relative_ref_pat = re.compile(relative_ref_re, re.IGNORECASE)
133
def merge(basepath, relpath, has_authority):
    """Merge two URI path components into a single path component.

    Follows rules specified in Section 5.2.3 of RFC 3986.

    The algorithm in the RFC treats the empty basepath edge case
    differently for URIs with and without an authority section, which
    is why the third argument is necessary.

    :param basepath: path component of the base URI.
    :param relpath: path component of the relative reference.
    :param has_authority: True if the base URI has an authority section.
    :return: the merged path (dot segments are NOT removed here).
    """
    if has_authority and basepath == '':
        # A base URI with an authority but an empty path behaves as if
        # its path were "/".
        return '/' + relpath
    # Otherwise replace everything after the last "/" of the base path.
    # When the base path has no "/", rfind() returns -1 and the slice is
    # empty, so the result is just relpath.
    slash = basepath.rfind('/')
    return basepath[:slash+1] + relpath
148
def remove_dot_segments(path):
    """Remove '.' and '..' segments from a URI path.

    Follows the rules specified in Section 5.2.4 of RFC 3986.

    :param path: a URI path component.
    :return: the path with all '.' and '..' segments resolved.
    """
    # `path` plays the role of the RFC's input buffer; `output` collects
    # the finished segments (each including its leading "/", if any).
    output = []
    while path:
        if path.startswith('../'):
            # Rule A: drop a leading "../".
            path = path[3:]
        elif path.startswith('./'):
            # Rule A: drop a leading "./".
            path = path[2:]
        elif path.startswith('/./') or path == '/.':
            # Rule B: "/./" or a trailing "/." collapses to "/".
            path = '/' + path[3:]
        elif path.startswith('/../') or path == '/..':
            # Rule C: "/../" or a trailing "/.." collapses to "/" and
            # discards the most recently emitted segment, if there is one.
            path = '/' + path[4:]
            if output:
                del output[-1]
        elif path in ['.', '..']:
            # Rule D: a lone "." or ".." is simply removed.
            path = ''
        else:
            # Rule E: move the first segment (with its leading "/", if
            # present) from the input buffer to the output.
            if path.startswith('/'):
                slash = path.find('/', 1)
            else:
                slash = path.find('/')
            if slash < 0:
                slash = len(path)
            output.append(path[:slash])
            path = path[slash:]
    return ''.join(output)
179
def normalise_unreserved(string):
    """Return a version of 'string' where no unreserved characters are encoded.

    Unreserved characters are defined in Section 2.3 of RFC 3986.

    Percent encoded sequences are normalised to upper case.

    A '%' that is not followed by two hexadecimal digits is not a valid
    escape and is left in the result exactly as given.

    :param string: a URI component that may contain percent escapes.
    :return: the normalised component.
    """
    hexdigits = '0123456789abcdefABCDEF'
    unreserved = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                  'abcdefghijklmnopqrstuvwxyz'
                  '0123456789-._~')
    # After splitting on '%', every piece except the first begins with
    # what should be the two hex digits of an escape sequence.
    result = string.split('%')
    for index, item in enumerate(result):
        if index == 0:
            continue
        escape = item[:2]
        if (len(escape) != 2 or escape[0] not in hexdigits
                or escape[1] not in hexdigits):
            # Not a well-formed escape: put the '%' back untouched.
            result[index] = '%' + item
            continue
        ch = int(escape, 16)
        if chr(ch) in unreserved:
            # Unreserved characters never need to be escaped.
            result[index] = chr(ch) + item[2:]
        else:
            result[index] = '%%%02X%s' % (ch, item[2:])
    return ''.join(result)
204
class InvalidURIError(Exception):
    """Raised when a string or set of components is not a valid URI."""
class URI:
    """A class that represents a URI.

    This class can represent arbitrary URIs that conform to the
    generic syntax described in RFC 3986.

    The parsed components are available as the attributes ``scheme``,
    ``userinfo``, ``host``, ``port``, ``path``, ``query`` and
    ``fragment``.
    """

    def __init__(self, uri=None, scheme=None, userinfo=None, host=None,
                 port=None, path=None, query=None, fragment=None):
        """Create a URI instance.

        Can be called with either a string URI or the component parts
        of the URI as keyword arguments.

        In either case, all arguments are expected to be appropriately
        URI encoded.

        :raises InvalidURIError: if the string does not parse as a URI,
            if required components are missing, or if a scheme that
            needs a host name has none.
        """
        assert (uri is not None and scheme is None and userinfo is None and
                host is None and port is None and path is None and
                query is None and fragment is None) or uri is None, (
            "URI() must be called with a single string argument or "
            "with URI components given as keyword arguments.")

        if uri is not None:
            # NOTE: `unicode` is the Python 2 text type; text input is
            # reduced to an ASCII byte string before matching.
            if isinstance(uri, unicode):
                try:
                    uri = uri.encode('ASCII')
                except UnicodeEncodeError:
                    raise InvalidURIError(
                        'URIs must consist of ASCII characters')
            match = uri_pat.match(uri)
            if match is None:
                raise InvalidURIError('"%s" is not a valid URI' % uri)
            self.scheme = match.group('scheme')
            self.userinfo = match.group('userinfo')
            self.host = match.group('host')
            self.port = match.group('port')
            hierpart = match.group('hierpart')
            authority = match.group('authority')
            if authority is None:
                # No "//authority": the whole hier-part is the path.
                self.path = hierpart
            else:
                # Skip past the //authority part
                self.path = hierpart[2+len(authority):]
            self.query = match.group('query')
            self.fragment = match.group('fragment')
        else:
            if scheme is None:
                raise InvalidURIError('URIs must have a scheme')
            if host is None and (userinfo is not None or port is not None):
                raise InvalidURIError(
                    'host must be given if userinfo or port are')
            if path is None:
                raise InvalidURIError('URIs must have a path')
            self.scheme = scheme
            self.userinfo = userinfo
            self.host = host
            self.port = port
            self.path = path
            self.query = query
            self.fragment = fragment

        self._normalise()

        # These schemes are meaningless without a host, so reject both a
        # missing and an empty one.
        if (self.scheme in ['http', 'https', 'ftp', 'gopher', 'telnet',
                            'imap', 'mms', 'rtsp', 'svn', 'svn+ssh',
                            'bzr', 'bzr+http', 'bzr+ssh'] and
                not self.host):
            raise InvalidURIError('%s URIs must have a host name' %
                                  self.scheme)

    def _normalise(self):
        """Perform normalisation of URI components."""
        self.scheme = self.scheme.lower()

        if self.userinfo is not None:
            self.userinfo = normalise_unreserved(self.userinfo)
        if self.host is not None:
            self.host = normalise_unreserved(self.host.lower())
        # An empty port and the scheme's default port are both dropped.
        if self.port == '':
            self.port = None
        elif self.port is not None:
            if self.port == _default_port.get(self.scheme):
                self.port = None
        # With an authority present, an empty path is equivalent to "/".
        if self.host is not None and self.path == '':
            self.path = '/'
        self.path = normalise_unreserved(remove_dot_segments(self.path))

        if self.query is not None:
            self.query = normalise_unreserved(self.query)
        if self.fragment is not None:
            self.fragment = normalise_unreserved(self.fragment)

    @property
    def authority(self):
        """The authority part of the URI"""
        if self.host is None:
            return None
        authority = self.host
        if self.userinfo is not None:
            authority = '%s@%s' % (self.userinfo, authority)
        if self.port is not None:
            authority = '%s:%s' % (authority, self.port)
        return authority

    @property
    def hier_part(self):
        """The hierarchical part of the URI"""
        authority = self.authority
        if authority is None:
            return self.path
        else:
            return '//%s%s' % (authority, self.path)

    def __str__(self):
        uri = '%s:%s' % (self.scheme, self.hier_part)
        if self.query is not None:
            uri += '?%s' % self.query
        if self.fragment is not None:
            uri += '#%s' % self.fragment
        return uri

    def __repr__(self):
        return '%s(%r)' % (self.__class__.__name__, str(self))

    def __eq__(self, other):
        if isinstance(other, self.__class__):
            return (self.scheme == other.scheme and
                    self.authority == other.authority and
                    self.path == other.path and
                    self.query == other.query and
                    self.fragment == other.fragment)
        else:
            return NotImplemented

    def __ne__(self, other):
        equal = self.__eq__(other)
        # NotImplemented is a singleton: compare by identity.
        if equal is NotImplemented:
            return NotImplemented
        else:
            return not equal

    def __hash__(self):
        # Hash exactly the values __eq__ compares, so that equal URIs
        # hash equally.
        return hash((self.scheme, self.authority, self.path,
                     self.query, self.fragment))

    def replace(self, **parts):
        """Replace one or more parts of the URI, returning the result."""
        if not parts:
            return self
        baseparts = dict(
            scheme=self.scheme,
            userinfo=self.userinfo,
            host=self.host,
            port=self.port,
            path=self.path,
            query=self.query,
            fragment=self.fragment)
        baseparts.update(parts)
        return self.__class__(**baseparts)

    def resolve(self, reference):
        """Resolve the given URI reference relative to this URI.

        Uses the rules from Section 5.2 of RFC 3986 to resolve the new
        URI.

        :raises InvalidURIError: if `reference` is neither a URI nor a
            valid relative reference.
        """
        # If the reference is a full URI, then return it as is.
        try:
            return self.__class__(reference)
        except InvalidURIError:
            pass

        match = relative_ref_pat.match(reference)
        if match is None:
            raise InvalidURIError("Invalid relative reference")

        parts = dict(scheme=self.scheme)
        authority = match.group('authority')
        if authority is not None:
            # The reference carries its own authority: only the scheme
            # is inherited from this URI.
            parts['userinfo'] = match.group('userinfo')
            parts['host'] = match.group('host')
            parts['port'] = match.group('port')
            # Skip over the //authority part
            parts['path'] = remove_dot_segments(
                match.group('relativepart')[2+len(authority):])
            parts['query'] = match.group('query')
        else:
            path = match.group('relativepart')
            query = match.group('query')
            if path == '':
                # Empty path: keep our path, and our query too unless
                # the reference supplies one.
                parts['path'] = self.path
                if query is not None:
                    parts['query'] = query
                else:
                    parts['query'] = self.query
            else:
                if path.startswith('/'):
                    parts['path'] = remove_dot_segments(path)
                else:
                    parts['path'] = merge(self.path, path,
                                          has_authority=self.host is not None)
                    parts['path'] = remove_dot_segments(parts['path'])
                parts['query'] = query
            parts['userinfo'] = self.userinfo
            parts['host'] = self.host
            parts['port'] = self.port
        parts['fragment'] = match.group('fragment')

        return self.__class__(**parts)

    def append(self, path):
        """Append the given path to this URI.

        The path must not start with a slash, but a slash is added to
        the base URI (before appending the path), in case it doesn't end
        with one.
        """
        assert not path.startswith('/')
        return self.ensureSlash().resolve(path)

    def contains(self, other):
        """Returns True if the URI 'other' is contained by this one."""
        if (self.scheme != other.scheme or
                self.authority != other.authority):
            return False
        if self.path == other.path:
            return True
        # Compare with trailing slashes so that "/foo" contains
        # "/foo/bar" but not "/foobar".
        basepath = self.path
        if not basepath.endswith('/'):
            basepath += '/'
        otherpath = other.path
        if not otherpath.endswith('/'):
            otherpath += '/'
        return otherpath.startswith(basepath)

    def underDomain(self, domain):
        """Return True if the given domain name is a parent of the URL's host."""
        if len(domain) == 0:
            return True
        if self.host is None:
            # A URI without a host is not under any non-empty domain.
            return False
        our_segments = self.host.split('.')
        domain_segments = domain.split('.')
        return our_segments[-len(domain_segments):] == domain_segments

    def ensureSlash(self):
        """Return a URI with the path normalised to end with a slash."""
        if self.path.endswith('/'):
            return self
        else:
            return self.replace(path=self.path + '/')

    def ensureNoSlash(self):
        """Return a URI with the path normalised to not end with a slash."""
        if self.path.endswith('/'):
            return self.replace(path=self.path.rstrip('/'))
        else:
            return self
463
# Regular expression for finding URIs in a body of text:
465
# From RFC 3986 ABNF for URIs:
467
# URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
468
# hier-part = "//" authority path-abempty
473
# authority = [ userinfo "@" ] host [ ":" port ]
474
# userinfo = *( unreserved / pct-encoded / sub-delims / ":" )
475
# host = IP-literal / IPv4address / reg-name
476
# reg-name = *( unreserved / pct-encoded / sub-delims )
479
# path-abempty = *( "/" segment )
480
# path-absolute = "/" [ segment-nz *( "/" segment ) ]
481
# path-rootless = segment-nz *( "/" segment )
482
# path-empty = 0<pchar>
485
# segment-nz = 1*pchar
486
# pchar = unreserved / pct-encoded / sub-delims / ":" / "@"
488
# query = *( pchar / "/" / "?" )
489
# fragment = *( pchar / "/" / "?" )
491
# unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
492
# pct-encoded = "%" HEXDIG HEXDIG
493
# sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
494
# / "*" / "+" / "," / ";" / "="
496
# We only match a set of known scheme names. We don't handle
499
# We will simplify "unreserved / pct-encoded / sub-delims" as the
500
# following regular expression:
501
# [-a-zA-Z0-9._~%!$&'()*+,;=]
503
# We also require that the path-rootless form not begin with a
504
# colon to avoid matching strings like "http::foo" (to avoid bug
507
# The path-empty pattern is not matched either, due to false
510
# Some allowed URI punctuation characters will be trimmed if they
511
# appear at the end of the URI since they may be incidental in the
514
# apport has at one time produced query strings containing square
515
# braces (that are not percent-encoded). In RFC 3986 they seem to be
516
# allowed by section 2.2 "Reserved Characters", yet section 3.4
517
# "Query" appears to provide a strict definition of the query string
518
# that would forbid square braces. Either way, links with
519
# non-percent-encoded square braces are being used on Launchpad so
520
# it's probably best to accommodate them.
522
# Verbose-mode pattern used to spot URI-like strings inside free text.
# It follows the hier-part alternatives of the ABNF, except that
# path-rootless may not begin with a colon and path-empty is not matched.
possible_uri_re = r'''
\b
(?:about|gopher|http|https|sftp|news|ftp|mailto|file|irc|jabber|xmpp)
:
(?:
  (?:
    # "//" authority path-abempty
    //
    (?: # userinfo
      [%(unreserved)s:]*
      @
    )?
    (?: # host
      \d+\.\d+\.\d+\.\d+ |
      [%(unreserved)s]*
    )
    (?: # port
      : \d*
    )?
    (?: / [%(unreserved)s:@]* )*
  ) | (?:
    # path-absolute
    /
    (?: [%(unreserved)s:@]+
        (?: / [%(unreserved)s:@]* )* )?
  ) | (?:
    # path-rootless
    [%(unreserved)s@]
    [%(unreserved)s:@]*
    (?: / [%(unreserved)s:@]* )*
  )
)
(?: # query
  \?
  [%(unreserved)s:@/\?\[\]]*
)?
(?: # fragment
  \#
  [%(unreserved)s:@/\?]*
)?
''' % {'unreserved': "-a-zA-Z0-9._~%!$&'()*+,;="}

possible_uri_pat = re.compile(possible_uri_re, re.IGNORECASE | re.VERBOSE)
# Punctuation that is stripped from the end of a candidate URI.
uri_trailers_pat = re.compile(r'([,.?:);>]+)$')
567
def find_uris_in_text(text):
568
"""Scan a block of text for URIs, and yield the ones found."""
569
for match in possible_uri_pat.finditer(text):
570
uri_string = match.group()
571
# remove characters from end of URI that are not likely to be
573
uri_string = uri_trailers_pat.sub('', uri_string)
575
uri = URI(uri_string)
576
except InvalidURIError: