# Source branch: ~mterry/duplicity/require-2.6, revision 972

1

# -*- Mode:Python; indent-tabs-mode:nil; tab-width:4 -*-

"""Parse (absolute and relative) URLs.

See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding,
UC Irvine, June 1995.
"""

# Public names exported by a star-import of this module.
__all__ = [
    "urlparse",
    "urlunparse",
    "urljoin",
    "urldefrag",
    "urlsplit",
    "urlunsplit",
]

11

12

# Scheme classification tables.  The empty string '' stands for "no scheme
# given", so a table that lists '' applies its behaviour by default.
# They are plain lists, kept in their original order.

# Schemes for which urljoin() resolves a reference against the base URL.
uses_relative = [
    'ftp', 'ftps', 'http', 'gopher', 'nntp', 'wais', 'file', 'https',
    'shttp', 'mms', 'prospero', 'rtsp', 'rtspu', '', 'sftp', 'imap',
    'imaps',
]

# Schemes whose URLs may carry a "//netloc" part.
uses_netloc = [
    'ftp', 'ftps', 'http', 'gopher', 'nntp', 'telnet', 'wais', 'file',
    'mms', 'https', 'shttp', 'snews', 'prospero', 'rtsp', 'rtspu',
    'rsync', '', 'svn', 'svn+ssh', 'sftp', 'imap', 'imaps',
]

# Schemes without a hierarchical path.
non_hierarchical = [
    'gopher', 'hdl', 'mailto', 'news', 'telnet', 'wais', 'snews', 'sip',
    'sips', 'imap', 'imaps',
]

# Schemes whose path may end in ";params".
uses_params = [
    'ftp', 'ftps', 'hdl', 'prospero', 'http', 'https', 'shttp', 'rtsp',
    'rtspu', 'sip', 'sips', 'mms', '', 'sftp', 'imap', 'imaps',
]

# Schemes whose URLs may carry a "?query" part.
uses_query = [
    'http', 'wais', 'https', 'shttp', 'mms', 'gopher', 'rtsp', 'rtspu',
    'sip', 'sips', 'imap', 'imaps', '',
]

# Schemes whose URLs may carry a "#fragment" part.
uses_fragment = [
    'ftp', 'ftps', 'hdl', 'http', 'gopher', 'news', 'nntp', 'wais',
    'https', 'shttp', 'snews', 'file', 'prospero', '',
]

# Every character that may appear in a scheme name.
scheme_chars = ''.join((
    'abcdefghijklmnopqrstuvwxyz',
    'ABCDEFGHIJKLMNOPQRSTUVWXYZ',
    '0123456789',
    '+-.',
))

36

37

# Number of cached entries at which urlsplit() empties the cache.
MAX_CACHE_SIZE = 20

# Memoised urlsplit() results, keyed by (url, scheme, allow_fragments).
_parse_cache = {}


def clear_cache():
    """Forget every memoised parse result.

    The module-level cache name is rebound to a brand-new empty
    dictionary; the previous dictionary object itself is left untouched.
    """
    global _parse_cache
    _parse_cache = {}

44

45

import string

46

def _rsplit(str, delim, numsplit):
    """Split *str* on *delim*, counting delimiters from the right.

    Arguments:
    str      -- the string to split (the name shadows the builtin but is
                kept so existing callers are unaffected)
    delim    -- the separator string
    numsplit -- how many delimiters, counted from the right, to keep
                inside the right-hand piece boundary

    Returns a list.  If *str* contains at most *numsplit* delimiters the
    result is simply every delimiter-separated piece.  Otherwise the
    result always has exactly two elements: everything to the left of
    the *numsplit*-th delimiter from the right, and everything to its
    right (which may itself still contain delimiters when numsplit > 1).
    A *numsplit* of zero or less performs no split and returns [str].
    """
    if numsplit <= 0:
        # Guard: a "-0" slice bound would silently select nothing.
        return [str]
    parts = str.split(delim)
    if len(parts) <= numsplit + 1:
        return parts
    cut = len(parts) - numsplit
    return [delim.join(parts[:cut]), delim.join(parts[cut:])]

54

55

class BaseResult(tuple):
    """Base class for the parsed result objects.

    This provides the attributes shared by the two derived result
    objects as read-only properties.  The derived classes are
    responsible for checking the right number of arguments were
    supplied to the constructor.

    Positional layout relied on here: index 0 is the scheme, 1 the
    netloc and 2 the path; the query and fragment are always the last
    two elements, whatever the tuple length.
    """

    __slots__ = ()

    # Attributes that access the basic components of the URL:

    def get_scheme(self):
        """Return the scheme (element 0)."""
        return self[0]
    scheme = property(get_scheme)

    def get_netloc(self):
        """Return the network location (element 1)."""
        return self[1]
    netloc = property(get_netloc)

    def get_path(self):
        """Return the path (element 2)."""
        return self[2]
    path = property(get_path)

    def get_query(self):
        """Return the query (second-to-last element)."""
        return self[-2]
    query = property(get_query)

    def get_fragment(self):
        """Return the fragment (last element)."""
        return self[-1]
    fragment = property(get_fragment)

    # Additional attributes that provide access to parsed-out portions
    # of the netloc:

    def get_username(self):
        """Return the user name from the netloc, or None without '@'."""
        netloc = self.netloc
        if "@" in netloc:
            # Split on the LAST '@' so that a password containing '@'
            # stays inside the userinfo part.
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                userinfo = userinfo.split(":", 1)[0]
            return userinfo
        return None
    username = property(get_username)

    def get_password(self):
        """Return the password from the netloc, or None if absent."""
        netloc = self.netloc
        if "@" in netloc:
            userinfo = netloc.rsplit("@", 1)[0]
            if ":" in userinfo:
                return userinfo.split(":", 1)[1]
        return None
    password = property(get_password)

    def get_hostname(self):
        """Return the lower-cased host name, or None for an empty host.

        A bracketed host such as "[::1]" is returned without brackets.
        """
        netloc = self.netloc.split('@')[-1]
        if '[' in netloc and ']' in netloc:
            return netloc.split(']')[0][1:].lower()
        elif ':' in netloc:
            return netloc.split(':')[0].lower()
        elif netloc == '':
            return None
        else:
            return netloc.lower()
    hostname = property(get_hostname)

    def get_port(self):
        """Return the port as an int, or None when no port is given.

        A trailing ':' with nothing after it ("host:") counts as "no
        port" instead of failing.  A non-numeric port still raises
        ValueError.
        """
        # Drop userinfo, then any bracketed host, leaving ":port" or
        # "host:port".
        netloc = self.netloc.split('@')[-1].split(']')[-1]
        if ":" in netloc:
            port = netloc.split(":", 1)[1]
            if port:
                return int(port, 10)
        return None
    port = property(get_port)

130

131

132

class SplitResult(BaseResult):
    """Five-element result: (scheme, netloc, path, query, fragment)."""

    __slots__ = ()

    def __new__(cls, scheme, netloc, path, query, fragment):
        # The explicit signature enforces that exactly five fields are
        # supplied before the tuple is built.
        fields = (scheme, netloc, path, query, fragment)
        return BaseResult.__new__(cls, fields)

    def geturl(self):
        """Reassemble the five fields into a URL string."""
        return urlunsplit(self)

142

143

144

class ParseResult(BaseResult):
    """Six-element result:
    (scheme, netloc, path, params, query, fragment).
    """

    __slots__ = ()

    def __new__(cls, scheme, netloc, path, params, query, fragment):
        # The explicit signature enforces that exactly six fields are
        # supplied before the tuple is built.
        fields = (scheme, netloc, path, params, query, fragment)
        return BaseResult.__new__(cls, fields)

    def get_params(self):
        """Return the path parameters (element 3)."""
        return self[3]
    params = property(get_params)

    def geturl(self):
        """Reassemble the six fields into a URL string."""
        return urlunparse(self)

158

159

160

def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components.

    Returns a ParseResult, i.e. a 6-tuple
    (scheme, netloc, path, params, query, fragment).

    The components are not broken up any further (netloc, for example,
    stays a single string) and % escapes are not expanded.
    """
    split_result = urlsplit(url, scheme, allow_fragments)
    scheme, netloc, url, query, fragment = split_result
    params = ''
    # Only schemes registered as taking parameters get ";params" peeled
    # off the path.
    if scheme in uses_params and ';' in url:
        url, params = _splitparams(url)
    return ParseResult(scheme, netloc, url, params, query, fragment)

173

174

def _splitparams(url):
    """Split "path;params" into (path, params).

    When the path contains a '/', only a ';' at or after the last '/'
    starts the parameters; if there is none, (url, '') is returned.
    Without any '/', the first ';' is the separator.  The caller is
    expected to have checked that *url* contains a ';'.
    """
    last_slash = url.rfind('/')
    if last_slash < 0:
        semi = url.find(';')
    else:
        semi = url.find(';', last_slash)
        if semi < 0:
            return url, ''
    return url[:semi], url[semi + 1:]

182

183

def _splitnetloc(url, start=0):
    """Return (netloc, rest) for *url*, with the netloc beginning at *start*.

    The netloc ends at the first '/' if there is one, else at the first
    '?', else at the first '#', else at the end of the string.
    """
    end = len(url)
    # The order is important: each delimiter is only consulted when the
    # ones before it are absent.
    for marker in ('/', '?', '#'):
        pos = url.find(marker, start)
        if pos >= 0:
            end = pos
            break
    return url[start:end], url[end:]

191

192

def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components.

    Returns a SplitResult, i.e. a 5-tuple
    (scheme, netloc, path, query, fragment).

    The components are not broken up any further (netloc, for example,
    stays a single string) and % escapes are not expanded.  Results are
    memoised in the module-level parse cache.
    """
    allow_fragments = bool(allow_fragments)
    key = (url, scheme, allow_fragments)
    hit = _parse_cache.get(key)
    if hit:
        return hit
    if len(_parse_cache) >= MAX_CACHE_SIZE:
        # Avoid runaway growth: the whole cache is dropped at once.
        clear_cache()

    netloc = query = fragment = ''
    colon = url.find(':')
    if colon > 0:
        prefix = url[:colon]
        if prefix == 'http':
            # Optimize the common case: no scheme-table lookups at all.
            scheme = prefix.lower()
            url = url[colon + 1:]
            if url[:2] == '//':
                netloc, url = _splitnetloc(url, 2)
            if allow_fragments and '#' in url:
                url, fragment = url.split('#', 1)
            if '?' in url:
                url, query = url.split('?', 1)
            result = SplitResult(scheme, netloc, url, query, fragment)
            _parse_cache[key] = result
            return result
        # The text before ':' is a scheme only if every character is a
        # legal scheme character; otherwise the default scheme is kept
        # and the URL is left intact.
        if all(ch in scheme_chars for ch in prefix):
            scheme = prefix.lower()
            url = url[colon + 1:]

    if scheme in uses_netloc and url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
    if allow_fragments and scheme in uses_fragment and '#' in url:
        url, fragment = url.split('#', 1)
    if scheme in uses_query and '?' in url:
        url, query = url.split('?', 1)
    result = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = result
    return result

234

235

def urlunparse(data):
    """Put a parsed URL back together again.

    *data* is any 6-item sequence
    (scheme, netloc, url, params, query, fragment), for example the
    result of urlparse().

    This may result in a slightly different, but equivalent URL, if the
    URL that was parsed originally had redundant delimiters, e.g. a ?
    with an empty query (the draft states that these are equivalent).
    """
    # Unpack in the body rather than in the signature: tuple parameters
    # are not valid syntax on Python 3.
    scheme, netloc, url, params, query, fragment = data
    if params:
        url = "%s;%s" % (url, params)
    return urlunsplit((scheme, netloc, url, query, fragment))

243

244

def urlunsplit(data):
    """Combine the parts of a split URL into a single string.

    *data* is any 5-item sequence
    (scheme, netloc, url, query, fragment), for example the result of
    urlsplit().  Empty components are omitted together with their
    delimiter.
    """
    # Unpack in the body rather than in the signature: tuple parameters
    # are not valid syntax on Python 3.
    scheme, netloc, url, query, fragment = data
    if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
        # A path that follows a netloc must be absolute.
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url

255

256

def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    If either argument is empty the other one is returned unchanged.
    *url* is also returned unchanged when its scheme differs from the
    base's scheme or is not listed in uses_relative.  *allow_fragments*
    is passed through to urlparse() for both arguments.
    """
    if not base:
        return url
    if not url:
        return base
    # The base's fragment never takes part in the result.
    bscheme, bnetloc, bpath, bparams, bquery = \
        urlparse(base, '', allow_fragments)[:5]
    scheme, netloc, path, params, query, fragment = \
        urlparse(url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path:
        # RFC 1808, section 4, step 5: an empty path inherits the base
        # path.  The base params are inherited only if the reference has
        # none, and the base query only if the reference has neither
        # params nor query.  (Merging the paths here would turn "?y"
        # into ".../c/?y" instead of ".../c/d?y".)
        path = bpath
        if not params:
            params = bparams
            if not query:
                query = bquery
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    # Replace the last segment of the base path with the reference path.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Repeatedly cancel one "<name>/.." pair, rescanning from the start
    # after each removal, until no interior pair is left.
    while True:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                    and segments[i - 1] not in ('', '..')):
                del segments[i - 1:i + 1]
                break
            i = i + 1
        else:
            break
    # A trailing ".." is not reached by the scan above.
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))

301

302

def urldefrag(url):
    """Remove any existing fragment from a URL.

    Returns a 2-tuple (url_without_fragment, fragment).  When the URL
    contains no '#', it is returned untouched and the fragment is the
    empty string.
    """
    if '#' not in url:
        return url, ''
    scheme, netloc, path, params, query, fragment = urlparse(url)
    stripped = urlunparse((scheme, netloc, path, params, query, ''))
    return stripped, fragment

315

316

317

# Self-test data for test().  The first non-blank line is the base URL;
# every following line is "<reference> = <URL:expected absolute URL>".
test_input = """
      http://a/b/c/d

      g:h        = <URL:g:h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      g          = <URL:http://a/b/c/g>
      ./g        = <URL:http://a/b/c/g>
      g/         = <URL:http://a/b/c/g/>
      /g         = <URL:http://a/g>
      //g        = <URL:http://g>
      ?y         = <URL:http://a/b/c/d?y>
      g?y        = <URL:http://a/b/c/g?y>
      g?y/./x    = <URL:http://a/b/c/g?y/./x>
      .          = <URL:http://a/b/c/>
      ./         = <URL:http://a/b/c/>
      ..         = <URL:http://a/b/>
      ../        = <URL:http://a/b/>
      ../g       = <URL:http://a/b/g>
      ../..      = <URL:http://a/>
      ../../g    = <URL:http://a/g>
      ../../../g = <URL:http://a/../g>
      ./../g     = <URL:http://a/b/g>
      ./g/.      = <URL:http://a/b/c/g/>
      /./g       = <URL:http://a/./g>
      g/./h      = <URL:http://a/b/c/g/h>
      g/../h     = <URL:http://a/b/c/h>
      http:g     = <URL:http://a/b/c/g>
      http:      = <URL:http://a/b/c/d>
      http:?y         = <URL:http://a/b/c/d?y>
      http:g?y        = <URL:http://a/b/c/g?y>
      http:g?y/./x    = <URL:http://a/b/c/g?y/./x>
"""


def test():
    """Run the urljoin() self-test and print the results.

    Input is read from the file named by the first command-line
    argument ('-' means standard input), or from the built-in
    test_input when no argument is given.  The first URL read becomes
    the base; every later URL is parsed, joined to that base, and
    printed.  When a line has the form "<url> = <expected>", a mismatch
    is reported with an EXPECTED line.
    """
    import sys
    base = ''
    opened = None  # set only for a file this function opened itself
    if sys.argv[1:]:
        fn = sys.argv[1]
        if fn == '-':
            fp = sys.stdin
        else:
            fp = opened = open(fn)
    else:
        try:
            from cStringIO import StringIO
        except ImportError:
            try:
                from StringIO import StringIO
            except ImportError:
                # Neither Python 2 module exists on Python 3.
                from io import StringIO
        fp = StringIO(test_input)
    try:
        # readline() returns '' only at end of input.
        for line in iter(fp.readline, ''):
            words = line.split()
            if not words:
                continue
            url = words[0]
            parts = urlparse(url)
            # Single-argument print(...) emits the same text whether it
            # is parsed as a statement or as a function call.
            print('%-10s : %s' % (url, parts))
            joined = urljoin(base, url)
            if not base:
                base = joined
            wrapped = '<URL:%s>' % joined
            print('%-10s = %s' % (url, wrapped))
            if len(words) == 3 and words[1] == '=':
                if wrapped != words[2]:
                    print('EXPECTED %s !!!!!!!!!!' % words[2])
    finally:
        # Never close sys.stdin, only a file opened above.
        if opened is not None:
            opened.close()


if __name__ == '__main__':
    test()