~geser/launchpadlib/toc : revision 38

1

2

3

# This file is part of launchpadlib.

4

#

5

# launchpadlib is free software: you can redistribute it and/or modify

6

# it under the terms of the GNU Lesser General Public License as

7

# published by the Free Software Foundation, either version 3 of the

8

# License, or (at your option) any later version.

9

#

10

# launchpadlib is distributed in the hope that it will be useful, but

11

# WITHOUT ANY WARRANTY; without even the implied warranty of

12

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

13

# Lesser General Public License for more details.

14

#

15

# You should have received a copy of the GNU Lesser General Public

16

# License along with launchpadlib. If not, see

17

# <http://www.gnu.org/licenses/>.

18

19

"""Functions for working with generic syntax URIs."""

20

21

# On Python 2 this makes every class in the module that declares no
# explicit base a new-style class.
__metaclass__ = type

# The public names of this module.
__all__ = [
    'URI',
    'InvalidURIError',
    'find_uris_in_text',
    'possible_uri_re',
]

27

28

import re

29

30

31

# Default port numbers for different URI schemes

32

# The registered URI schemes come from

33

# http://www.iana.org/assignments/uri-schemes.html

34

# The default ports come from the relevant RFCs

35

36

# Maps a lower-case URI scheme name to its default port.  Ports are kept
# as strings so they can be compared directly with the port text taken
# from a parsed URI.
_default_port = {}

# Official schemes
_default_port.update({
    'acap': '674',
    'dav': '80',
    'dict': '2628',
    'dns': '53',
    'ftp': '21',
    'go': '1096',
    'gopher': '70',
    'h323': '1720',
    'http': '80',
    'https': '443',
    'imap': '143',
    'ipp': '631',
    'iris.beep': '702',
    'ldap': '389',
    'mtqp': '1038',
    'mupdate': '3905',
    'nfs': '2049',
    'nntp': '119',
    'pop': '110',
    'rtsp': '554',
    'sip': '5060',
    'sips': '5061',
    'snmp': '161',
    'soap.beep': '605',
    'soap.beeps': '605',
    'telnet': '23',
    'tftp': '69',
    'tip': '3372',
    'vemmi': '575',
    'xmlrpc.beep': '602',
    'xmlrpc.beeps': '602',
    'z39.50r': '210',
    'z39.50s': '210',
})

# Historical schemes
_default_port.update({
    'prospero': '1525',
    'wais': '210',
})

# Common but unregistered schemes
_default_port.update({
    'bzr+http': '80',
    'bzr+ssh': '22',
    'irc': '6667',
    'sftp': '22',
    'ssh': '22',
    'svn': '3690',
    'svn+ssh': '22',
})

85

86

# Regular expressions adapted from the ABNF in the RFC

87

88

# Shared fragments.  Every character class used below starts from the
# RFC's "unreserved / sub-delims" set, and a percent-encoded octet is
# always offered as the alternative to a literal character.
_unreserved_subdelims = r"-a-z0-9._~!$&\'()*+,;="
_pct_encoded = r"%[0-9a-f]{2}"
# "pchar", and "pchar" without the colon (used for the first segment of
# a path that has no scheme in front of it).
_pchar = "(?:[" + _unreserved_subdelims + ":@]|" + _pct_encoded + ")"
_pchar_nocolon = "(?:[" + _unreserved_subdelims + "@]|" + _pct_encoded + ")"
# Zero or more "/segment" pieces.
_slash_segments = "(?:/" + _pchar + "*)*"

scheme_re = r"(?P<scheme>[a-z][-a-z0-9+.]*)"

userinfo_re = (
    "(?P<userinfo>(?:[" + _unreserved_subdelims + ":]|" + _pct_encoded + ")*)")

# The following regular expression will match some IP address style
# host names that the RFC would not (e.g. leading zeros on the
# components), but is significantly simpler.
_ipv4_re = r"\.".join([r"[0-9]{1,3}"] * 4)
_reg_name_re = "(?:[" + _unreserved_subdelims + "]|" + _pct_encoded + ")*"
_ip_literal_re = r"\[[0-9a-z:.]+\]"
host_re = (
    "(?P<host>" + "|".join([_ipv4_re, _reg_name_re, _ip_literal_re]) + ")")

port_re = r"(?P<port>[0-9]*)"

authority_re = (
    "(?P<authority>(?:" + userinfo_re + "@)?" + host_re +
    "(?::" + port_re + ")?)")

path_abempty_re = _slash_segments
path_noscheme_re = _pchar_nocolon + "+" + _slash_segments
path_rootless_re = _pchar + "+" + _slash_segments
path_absolute_re = "/(?:" + path_rootless_re + ")?"
path_empty_re = ""

hier_part_re = "(?P<hierpart>" + "|".join((
    "//" + authority_re + path_abempty_re,
    path_absolute_re,
    path_rootless_re,
    path_empty_re)) + ")"

relative_part_re = "(?P<relativepart>" + "|".join((
    "//" + authority_re + path_abempty_re,
    path_absolute_re,
    path_noscheme_re,
    path_empty_re)) + ")"

# Additionally we also permit square braces in the query portion to
# accommodate real-world URIs.
query_re = (
    "(?P<query>(?:[" + _unreserved_subdelims + r":@/?\[\]]|" +
    _pct_encoded + ")*)")
fragment_re = (
    "(?P<fragment>(?:[" + _unreserved_subdelims + ":@/?]|" +
    _pct_encoded + ")*)")

# Optional "?query" and "#fragment", then end of input.  Both complete
# patterns finish the same way.
_query_fragment_tail = (
    r"(?:\?" + query_re + ")?(?:#" + fragment_re + ")?$")

uri_re = scheme_re + ":" + hier_part_re + _query_fragment_tail

relative_ref_re = relative_part_re + _query_fragment_tail

# Matching is case-insensitive, so the lower-case character classes
# above also accept upper-case letters and hex digits.
uri_pat = re.compile(uri_re, re.IGNORECASE)
relative_ref_pat = re.compile(relative_ref_re, re.IGNORECASE)

131

132

133

def merge(basepath, relpath, has_authority):
    """Combine a base path and a relative path into one path.

    Implements the merge step of Section 5.2.3 of RFC 3986: the
    relative path replaces everything after the last slash of the base
    path.

    :param basepath: the path component of the base URI.
    :param relpath: the relative path to merge in.
    :param has_authority: whether the base URI has an authority
        section.  The RFC treats an empty base path differently in
        that case, which is why this flag is needed.
    :return: the merged path (dot segments are not removed here).
    """
    if basepath == '' and has_authority:
        # An authority with an empty path: the result is rooted.
        return '/' + relpath
    # Keep the base path up to and including its final slash.  When
    # there is no slash both pieces are empty and only relpath remains.
    directory, separator, _ = basepath.rpartition('/')
    return directory + separator + relpath

146

147

148

def remove_dot_segments(path):
    """Strip the '.' and '..' segments out of a URI path.

    Implements the algorithm of Section 5.2.4 of RFC 3986: the input is
    consumed from the left one prefix at a time while the segments to
    keep are collected in an output buffer.

    :param path: the path to clean up.
    :return: the path with all dot segments resolved.
    """
    kept = []
    remaining = path
    while remaining:
        # A leading "../" or "./" is simply dropped.
        if remaining.startswith('../'):
            remaining = remaining[3:]
            continue
        if remaining.startswith('./'):
            remaining = remaining[2:]
            continue
        # "/./" (or a trailing "/.") collapses to "/".
        if remaining == '/.' or remaining.startswith('/./'):
            remaining = '/' + remaining[3:]
            continue
        # "/../" (or a trailing "/..") collapses to "/" and also
        # discards the segment that was kept most recently, if any.
        if remaining == '/..' or remaining.startswith('/../'):
            remaining = '/' + remaining[4:]
            if kept:
                kept.pop()
            continue
        # A lone "." or ".." is all that is left: nothing more to keep.
        if remaining in ('.', '..'):
            break
        # Otherwise move the first segment, including its leading slash
        # if it has one, to the output.
        search_from = 1 if remaining.startswith('/') else 0
        end = remaining.find('/', search_from)
        if end < 0:
            end = len(remaining)
        kept.append(remaining[:end])
        remaining = remaining[end:]
    return ''.join(kept)

177

178

179

def normalise_unreserved(string):
    """Return a version of 'string' where no unreserved characters are encoded.

    Unreserved characters are defined in Section 2.3 of RFC 3986.

    Percent encoded sequences are normalised to upper case.

    A '%' that is not followed by exactly two hexadecimal digits is not
    a percent-encoded sequence; it is left in the result untouched.

    :param string: the URI component to normalise.
    :return: the normalised component.
    """
    hex_digits = '0123456789abcdefABCDEF'
    unreserved = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                  'abcdefghijklmnopqrstuvwxyz'
                  '0123456789-._~')
    pieces = string.split('%')
    # Whatever precedes the first '%' is copied through as is.
    result = [pieces[0]]
    for item in pieces[1:]:
        pair = item[:2]
        # Check the digits explicitly: int() would also accept a single
        # digit, a sign or surrounding whitespace.
        if (len(pair) != 2 or pair[0] not in hex_digits
                or pair[1] not in hex_digits):
            # Not a valid escape: put back the '%' that split() removed.
            result.append('%' + item)
            continue
        ch = int(pair, 16)
        if chr(ch) in unreserved:
            # Unreserved characters never need to be encoded.
            result.append(chr(ch) + item[2:])
        else:
            # Keep the escape, but with upper case hex digits.
            result.append('%%%02X%s' % (ch, item[2:]))
    return ''.join(result)

202

203

204

class InvalidURIError(Exception):
    """Raised when a string or a set of components is not a valid URI."""

206

207

208

class URI:
    """A class that represents a URI.

    This class can represent arbitrary URIs that conform to the
    generic syntax described in RFC 3986.

    The normalised components are available as the attributes scheme,
    userinfo, host, port, path, query and fragment.  Optional
    components that are absent are None.
    """

    # The text (as opposed to byte) string type: ``unicode`` on
    # Python 2, ``str`` on Python 3 where ``unicode`` does not exist.
    try:
        _text_type = unicode
    except NameError:
        _text_type = str

    def __init__(self, uri=None, scheme=None, userinfo=None, host=None,
                 port=None, path=None, query=None, fragment=None):
        """Create a URI instance.

        Can be called with either a string URI or the component parts
        of the URI as keyword arguments.

        In either case, all arguments are expected to be appropriately
        URI encoded.

        :raises InvalidURIError: if the string is not a valid URI or
            contains non-ASCII characters, if the components are
            inconsistent (no scheme, no path, userinfo or port without
            a host), or if the scheme requires a host name and none is
            given.
        """
        assert (uri is not None and scheme is None and userinfo is None and
                host is None and port is None and path is None and
                query is None and fragment is None) or uri is None, (
            "URI() must be called with a single string argument or "
            "with URI components given as keyword arguments.")

        if uri is not None:
            if isinstance(uri, self._text_type):
                try:
                    encoded = uri.encode('ASCII')
                except UnicodeEncodeError:
                    raise InvalidURIError(
                        'URIs must consist of ASCII characters')
                if isinstance(encoded, str):
                    # Python 2: carry on with the byte string.  On
                    # Python 3 the text string is kept, since the
                    # patterns are text patterns.
                    uri = encoded
            match = uri_pat.match(uri)
            if match is None:
                raise InvalidURIError('"%s" is not a valid URI' % uri)
            self.scheme = match.group('scheme')
            self.userinfo = match.group('userinfo')
            self.host = match.group('host')
            self.port = match.group('port')
            hierpart = match.group('hierpart')
            authority = match.group('authority')
            if authority is None:
                self.path = hierpart
            else:
                # Skip past the //authority part
                self.path = hierpart[2+len(authority):]
            self.query = match.group('query')
            self.fragment = match.group('fragment')
        else:
            if scheme is None:
                raise InvalidURIError('URIs must have a scheme')
            if host is None and (userinfo is not None or port is not None):
                raise InvalidURIError(
                    'host must be given if userinfo or port are')
            if path is None:
                raise InvalidURIError('URIs must have a path')
            self.scheme = scheme
            self.userinfo = userinfo
            self.host = host
            self.port = port
            self.path = path
            self.query = query
            self.fragment = fragment

        self._normalise()

        # These schemes make no sense without a host name.
        if (self.scheme in ['http', 'https', 'ftp', 'gopher', 'telnet',
                            'imap', 'mms', 'rtsp', 'svn', 'svn+ssh',
                            'bzr', 'bzr+http', 'bzr+ssh'] and
                not self.host):
            raise InvalidURIError('%s URIs must have a host name' %
                                  self.scheme)

    def _normalise(self):
        """Perform normalisation of URI components."""
        self.scheme = self.scheme.lower()

        if self.userinfo is not None:
            self.userinfo = normalise_unreserved(self.userinfo)
        if self.host is not None:
            self.host = normalise_unreserved(self.host.lower())
        # An empty port, or the default port of the scheme, is the same
        # as no port at all.
        if self.port == '':
            self.port = None
        elif self.port is not None:
            if self.port == _default_port.get(self.scheme):
                self.port = None
        # With an authority, an empty path is equivalent to "/".
        if self.host is not None and self.path == '':
            self.path = '/'
        self.path = normalise_unreserved(remove_dot_segments(self.path))

        if self.query is not None:
            self.query = normalise_unreserved(self.query)
        if self.fragment is not None:
            self.fragment = normalise_unreserved(self.fragment)

    @property
    def authority(self):
        """The authority part of the URI, or None if there is no host."""
        if self.host is None:
            return None
        authority = self.host
        if self.userinfo is not None:
            authority = '%s@%s' % (self.userinfo, authority)
        if self.port is not None:
            authority = '%s:%s' % (authority, self.port)
        return authority

    @property
    def hier_part(self):
        """The hierarchical part of the URI"""
        authority = self.authority
        if authority is None:
            return self.path
        else:
            return '//%s%s' % (authority, self.path)

    def __str__(self):
        uri = '%s:%s' % (self.scheme, self.hier_part)
        if self.query is not None:
            uri += '?%s' % self.query
        if self.fragment is not None:
            uri += '#%s' % self.fragment
        return uri

    def __repr__(self):
        return '%s(%r)' % (self.__class__.__name__, str(self))

    def __eq__(self, other):
        if isinstance(other, self.__class__):
            return (self.scheme == other.scheme and
                    self.authority == other.authority and
                    self.path == other.path and
                    self.query == other.query and
                    self.fragment == other.fragment)
        else:
            return NotImplemented

    def __ne__(self, other):
        equal = self.__eq__(other)
        if equal is NotImplemented:
            return NotImplemented
        else:
            return not equal

    def __hash__(self):
        # Hash the same components that __eq__ compares, so that URIs
        # that compare equal also hash equally.  The components are
        # plain attributes: do not change them while the URI is used as
        # a dictionary key or set member.
        return hash((self.scheme, self.authority, self.path,
                     self.query, self.fragment))

    def replace(self, **parts):
        """Replace one or more parts of the URI, returning the result.

        The keyword arguments are the component names accepted by the
        constructor.  With no arguments the URI itself is returned.
        """
        if not parts:
            return self
        baseparts = dict(
            scheme=self.scheme,
            userinfo=self.userinfo,
            host=self.host,
            port=self.port,
            path=self.path,
            query=self.query,
            fragment=self.fragment)
        baseparts.update(parts)
        return self.__class__(**baseparts)

    def resolve(self, reference):
        """Resolve the given URI reference relative to this URI.

        Uses the rules from Section 5.2 of RFC 3986 to resolve the new
        URI.

        :raises InvalidURIError: if the reference is neither a URI nor
            a valid relative reference.
        """
        # If the reference is a full URI, then return it as is.
        try:
            return self.__class__(reference)
        except InvalidURIError:
            pass

        match = relative_ref_pat.match(reference)
        if match is None:
            raise InvalidURIError("Invalid relative reference")

        parts = dict(scheme=self.scheme)
        authority = match.group('authority')
        if authority is not None:
            # The reference brings its own authority: only the scheme
            # is inherited from this URI.
            parts['userinfo'] = match.group('userinfo')
            parts['host'] = match.group('host')
            parts['port'] = match.group('port')
            # Skip over the //authority part
            parts['path'] = remove_dot_segments(
                match.group('relativepart')[2+len(authority):])
            parts['query'] = match.group('query')
        else:
            path = match.group('relativepart')
            query = match.group('query')
            if path == '':
                # No path in the reference: keep ours, and our query
                # too unless the reference has one.
                parts['path'] = self.path
                if query is not None:
                    parts['query'] = query
                else:
                    parts['query'] = self.query
            else:
                if path.startswith('/'):
                    parts['path'] = remove_dot_segments(path)
                else:
                    parts['path'] = merge(self.path, path,
                                          has_authority=self.host is not None)
                    parts['path'] = remove_dot_segments(parts['path'])
                parts['query'] = query
            parts['userinfo'] = self.userinfo
            parts['host'] = self.host
            parts['port'] = self.port
        parts['fragment'] = match.group('fragment')

        return self.__class__(**parts)

    def append(self, path):
        """Append the given path to this URI.

        The path must not start with a slash, but a slash is added to
        base URI (before appending the path), in case it doesn't end
        with a slash.
        """
        assert not path.startswith('/')
        return self.ensureSlash().resolve(path)

    def contains(self, other):
        """Returns True if the URI 'other' is contained by this one.

        That is the case when both have the same scheme and authority
        and the path of 'other' is equal to, or below, this URI's path.
        """
        if (self.scheme != other.scheme or
                self.authority != other.authority):
            return False
        if self.path == other.path:
            return True
        # Compare whole segments only: with a trailing slash on both
        # paths "/foo" does not contain "/foobar".
        basepath = self.path
        if not basepath.endswith('/'):
            basepath += '/'
        otherpath = other.path
        if not otherpath.endswith('/'):
            otherpath += '/'
        return otherpath.startswith(basepath)

    def underDomain(self, domain):
        """Return True if the given domain name is a parent of the URL's host.

        The empty domain is a parent of everything.  A URI without a
        host is not under any other domain.
        """
        if len(domain) == 0:
            return True
        if self.host is None:
            return False
        our_segments = self.host.split('.')
        domain_segments = domain.split('.')
        return our_segments[-len(domain_segments):] == domain_segments

    def ensureSlash(self):
        """Return a URI with the path normalised to end with a slash."""
        if self.path.endswith('/'):
            return self
        else:
            return self.replace(path=self.path + '/')

    def ensureNoSlash(self):
        """Return a URI with the path normalised to not end with a slash."""
        if self.path.endswith('/'):
            return self.replace(path=self.path.rstrip('/'))
        else:
            return self

461

462

463

# Regular expression for finding URIs in a body of text:

464

#

465

# From RFC 3986 ABNF for URIs:

466

#

467

# URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]

468

# hier-part = "//" authority path-abempty

469

# / path-absolute

470

# / path-rootless

471

# / path-empty

472

#

473

# authority = [ userinfo "@" ] host [ ":" port ]

474

# userinfo = *( unreserved / pct-encoded / sub-delims / ":" )

475

# host = IP-literal / IPv4address / reg-name

476

# reg-name = *( unreserved / pct-encoded / sub-delims )

477

# port = *DIGIT

478

#

479

# path-abempty = *( "/" segment )

480

# path-absolute = "/" [ segment-nz *( "/" segment ) ]

481

# path-rootless = segment-nz *( "/" segment )

482

# path-empty = 0<pchar>

483

#

484

# segment = *pchar

485

# segment-nz = 1*pchar

486

# pchar = unreserved / pct-encoded / sub-delims / ":" / "@"

487

#

488

# query = *( pchar / "/" / "?" )

489

# fragment = *( pchar / "/" / "?" )

490

#

491

# unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"

492

# pct-encoded = "%" HEXDIG HEXDIG

493

# sub-delims = "!" / "$" / "&" / "'" / "(" / ")"

494

# / "*" / "+" / "," / ";" / "="

495

#

496

# We only match a set of known scheme names. We don't handle

497

# IP-literal either.

498

#

499

# We will simplify "unreserved / pct-encoded / sub-delims" as the

500

# following regular expression:

501

# [-a-zA-Z0-9._~%!$&'()*+,;=]

502

#

503

# We also require that the path-rootless form not begin with a

504

# colon to avoid matching strings like "http::foo" (to avoid bug

505

# #40255).

506

#

507

# The path-empty pattern is not matched either, due to false

508

# positives.

509

#

510

# Some allowed URI punctuation characters will be trimmed if they

511

# appear at the end of the URI since they may be incidental in the

512

# flow of the text.

513

#

514

# apport has at one time produced query strings containing square

515

# braces (that are not percent-encoded). In RFC 3986 they seem to be

516

# allowed by section 2.2 "Reserved Characters", yet section 3.4

517

# "Query" appears to provide a strict definition of the query string

518

# that would forbid square braces. Either way, links with

519

# non-percent-encoded square braces are being used on Launchpad so

520

# it's probably best to accommodate them.

521

522

# Only a fixed set of scheme names is matched, so that arbitrary
# "word:" text is not picked up as a URI.
# NOTE(review): the scheme alternation below is a reconstruction --
# confirm the exact list of schemes against the upstream module.
possible_uri_re = r'''
\b
(?:about|gopher|http|https|sftp|news|ftp|mailto|file|irc|jabber)
:
(?:
  (?:
    # "//" authority path-abempty
    //
    (?: # userinfo
      [%(unreserved)s:]*
      @
    )?
    (?: # host
      \d+\.\d+\.\d+\.\d+ |
      [%(unreserved)s]*
    )
    (?: # port
      : \d*
    )?
    (?: / [%(unreserved)s:@]* )*
  ) | (?:
    # path-absolute
    /
    (?: [%(unreserved)s:@]+
        (?: / [%(unreserved)s:@]* )* )?
  ) | (?:
    # path-rootless
    [%(unreserved)s@]
    [%(unreserved)s:@]*
    (?: / [%(unreserved)s:@]* )*
  )
)
(?: # query
  \?
  [%(unreserved)s:@/\?\[\]]*
)?
(?: # fragment
  \#
  [%(unreserved)s:@/\?]*
)?
''' % {'unreserved': "-a-zA-Z0-9._~%!$&'()*+,;="}

# The pattern is written in verbose mode: whitespace and "#" comments
# inside it are not part of the expression.
possible_uri_pat = re.compile(possible_uri_re, re.IGNORECASE | re.VERBOSE)
# Punctuation that is allowed in a URI but, at the very end of a match,
# more likely belongs to the surrounding text.
uri_trailers_pat = re.compile(r'([,.?:);>]+)$')

566

567

def find_uris_in_text(text):
    """Scan a block of text for URIs, and yield the ones found.

    Each candidate matched by possible_uri_pat is stripped of trailing
    punctuation and parsed; candidates that are not valid URIs are
    skipped.

    :param text: the text to search.
    :return: a generator of URI objects, in order of appearance.
    """
    for found in possible_uri_pat.finditer(text):
        # Trailing punctuation is more likely part of the sentence
        # than part of the URI, so drop it before parsing.
        candidate = uri_trailers_pat.sub('', found.group())
        try:
            parsed = URI(candidate)
        except InvalidURIError:
            continue
        yield parsed