~breunigs/duplicity/amazondrive : revision 1153

1

# -*- Mode:Python; indent-tabs-mode:nil; tab-width:4 -*-

2

#

3

4

# Based on the backend onedrivebackend.py

5

#

6

# This file is part of duplicity.

7

#

8

# Duplicity is free software; you can redistribute it and/or modify it

9

# under the terms of the GNU General Public License as published by the

10

# Free Software Foundation; either version 2 of the License, or (at your

11

# option) any later version.

12

#

13

# Duplicity is distributed in the hope that it will be useful, but

14

# WITHOUT ANY WARRANTY; without even the implied warranty of

15

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

16

# General Public License for more details.

17

#

18

# You should have received a copy of the GNU General Public License

19

# along with duplicity; if not, write to the Free Software Foundation,

20

# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

21

22

import os.path

23

import json

24

import sys

25

import time

26

import re

27

from io import DEFAULT_BUFFER_SIZE

28

29

import duplicity.backend

30

from duplicity.errors import BackendException

31

from duplicity import globals

32

from duplicity import log

33

34

35

class ADBackend(duplicity.backend.Backend):
    """
    Backend for Amazon Drive. It communicates directly with Amazon Drive using
    their RESTful API and does not rely on externally setup software (like
    acd_cli).

    The backend keeps a cache (``names_to_ids``) that maps file names inside
    the backup target folder to their Amazon Drive node ids. The cache is
    refreshed by ``_list`` and updated by ``_put`` and ``_delete``.
    """

    # Location of the persisted OAuth2 token (contains credentials).
    OAUTH_TOKEN_PATH = os.path.expanduser('~/.duplicity_ad_oauthtoken.json')

    OAUTH_AUTHORIZE_URL = 'https://www.amazon.com/ap/oa'
    OAUTH_TOKEN_URL = 'https://api.amazon.com/auth/o2/token'
    # NOTE: Amazon requires https, which is why I am using my domain/setup
    # instead of Duplicity's. Mail me at stefan-duplicity@breunig.xyz once it is
    # available through https and I will whitelist the new URL.
    OAUTH_REDIRECT_URL = 'https://breunig.xyz/duplicity/copy.html'
    OAUTH_SCOPE = ['clouddrive:read_other', 'clouddrive:write']

    CLIENT_ID = 'amzn1.application-oa2-client.791c9c2d78444e85a32eb66f92eb6bcc'
    CLIENT_SECRET = '5b322c6a37b25f16d848a6a556eddcc30314fc46ae65c87068ff1bc4588d715b'

    # Fixed boundary used for the multipart/form-data upload body.
    MULTIPART_BOUNDARY = 'DuplicityFormBoundaryd66364f7f8924f7e9d478e19cf4b871d114a1e00262542'

    def __init__(self, parsed_url):
        """
        Set up the backend for the folder named by ``parsed_url.path``.

        Validates --volsize, imports the optional HTTP/OAuth2 dependencies,
        establishes the OAuth2 session and resolves (creating if needed) the
        remote backup target folder.

        Raises BackendException when requests / requests_oauthlib are not
        installed.
        """
        duplicity.backend.Backend.__init__(self, parsed_url)

        # Default endpoints; replaced by the account specific ones in
        # initialize_oauth2_session().
        self.metadata_url = 'https://drive.amazonaws.com/drive/v1/'
        self.content_url = 'https://content-na.drive.amazonaws.com/cdproxy/'

        self.names_to_ids = {}
        self.backup_target_id = None
        self.backup_target = parsed_url.path.lstrip('/')

        if globals.volsize > (10 * 1024 * 1024 * 1024):
            # https://forums.developer.amazon.com/questions/22713/file-size-limits.html
            # https://forums.developer.amazon.com/questions/22038/support-for-chunked-transfer-encoding.html
            log.FatalError(
                'Your --volsize is bigger than 10 GiB, which is the maximum '
                'file size on Amazon Drive that does not require work arounds.')

        try:
            # Imported lazily so that duplicity works without these packages
            # as long as this backend is not used; made global so the other
            # methods can reference them.
            global requests
            global OAuth2Session
            import requests
            from requests_oauthlib import OAuth2Session
        except ImportError:
            raise BackendException(
                'Amazon Drive backend requires python-requests and '
                'python-requests-oauthlib to be installed.\n\n'
                'For Debian and derivates use:\n'
                ' apt-get install python-requests python-requests-oauthlib\n'
                'For Fedora and derivates use:\n'
                ' yum install python-requests python-requests-oauthlib')

        self.initialize_oauth2_session()
        self.resolve_backup_target()

    def initialize_oauth2_session(self):
        """Setup or refresh oauth2 session with Amazon Drive"""

        def token_updater(token):
            """Stores oauth2 token on disk"""
            try:
                # The token grants access to the user's drive, so create the
                # file readable/writable by the owner only.
                fd = os.open(self.OAUTH_TOKEN_PATH,
                             os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
                with os.fdopen(fd, 'w') as f:
                    json.dump(token, f)
            except Exception as err:
                # Best effort: failing to persist the token is not fatal.
                log.Error('Could not save the OAuth2 token to %s. This means '
                          'you may need to do the OAuth2 authorization '
                          'process again soon. Original error: %s' % (
                              self.OAUTH_TOKEN_PATH, err))

        token = None
        try:
            with open(self.OAUTH_TOKEN_PATH) as f:
                token = json.load(f)
        except (IOError, ValueError) as err:
            # ValueError: the file exists but does not contain valid JSON.
            # Treat it like a missing token instead of crashing.
            log.Notice('Could not load OAuth2 token. '
                       'Trying to create a new one. (original error: %s)' % err)

        self.http_client = OAuth2Session(
            self.CLIENT_ID,
            scope=self.OAUTH_SCOPE,
            redirect_uri=self.OAUTH_REDIRECT_URL,
            token=token,
            auto_refresh_kwargs={
                'client_id': self.CLIENT_ID,
                'client_secret': self.CLIENT_SECRET,
            },
            auto_refresh_url=self.OAUTH_TOKEN_URL,
            token_updater=token_updater)

        if token is not None:
            self.http_client.refresh_token(self.OAUTH_TOKEN_URL)

            # Use the endpoint lookup as a check that the stored token works.
            endpoints_response = self.http_client.get(self.metadata_url +
                                                      'account/endpoint')
            if endpoints_response.status_code != requests.codes.ok:
                token = None

        if token is None:
            if not sys.stdout.isatty() or not sys.stdin.isatty():
                log.FatalError('The OAuth2 token could not be loaded from %s '
                               'and you are not running duplicity '
                               'interactively, so duplicity cannot possibly '
                               'access Amazon Drive.' % self.OAUTH_TOKEN_PATH)
            authorization_url, _ = self.http_client.authorization_url(
                self.OAUTH_AUTHORIZE_URL)

            # Single-argument print(...) behaves the same as the Python 2
            # print statement.
            print('')
            print('In order to allow duplicity to access Amazon Drive, please '
                  'open the following URL in a browser and copy the URL of the '
                  'page you see after authorization here:')
            print(authorization_url)
            print('')

            redirected_to = (raw_input('URL of the resulting page: ')
                             .replace('http://', 'https://', 1))

            token = self.http_client.fetch_token(
                self.OAUTH_TOKEN_URL,
                client_secret=self.CLIENT_SECRET,
                authorization_response=redirected_to)

            endpoints_response = self.http_client.get(self.metadata_url +
                                                      'account/endpoint')
            endpoints_response.raise_for_status()
            token_updater(token)

        urls = endpoints_response.json()
        if 'metadataUrl' not in urls or 'contentUrl' not in urls:
            log.FatalError('Could not retrieve endpoint URLs for this account')
        self.metadata_url = urls['metadataUrl']
        self.content_url = urls['contentUrl']

    def resolve_backup_target(self):
        """Resolve node id for remote backup target folder

        Walks ``self.backup_target`` component by component starting at the
        root folder, creating folders that do not exist yet, and stores the id
        of the final folder in ``self.backup_target_id``.
        """

        response = self.http_client.get(
            self.metadata_url + 'nodes?filters=kind:FOLDER AND isRoot:true')
        # Fail with the HTTP error instead of an opaque KeyError/IndexError
        # when the root lookup was rejected.
        response.raise_for_status()
        parent_node_id = response.json()['data'][0]['id']

        for component in [x for x in self.backup_target.split('/') if x]:
            # There doesn't seem to be escaping support, so cut off filter
            # after first unsupported character
            query = re.search('^[A-Za-z0-9_-]*', component).group(0)
            if component != query:
                query = query + '*'

            matches = self.read_all_pages(
                self.metadata_url + 'nodes?filters=kind:FOLDER AND name:%s '
                                    'AND parents:%s' % (query, parent_node_id))
            # The (possibly truncated) filter may match more than wanted, so
            # compare the full name locally.
            candidates = [f for f in matches if f.get('name') == component]

            if len(candidates) >= 2:
                log.FatalError('There are multiple folders with the same name '
                               'below one parent.\nParentID: %s\nFolderName: '
                               '%s' % (parent_node_id, component))
            elif len(candidates) == 1:
                parent_node_id = candidates[0]['id']
            else:
                log.Debug('Folder %s does not exist yet. Creating.' % component)
                parent_node_id = self.mkdir(parent_node_id, component)

        log.Debug("Backup target folder has id: %s" % parent_node_id)
        self.backup_target_id = parent_node_id

    def get_file_id(self, remote_filename):
        """Find id of remote file in backup target folder

        Returns None when the file is unknown even after refreshing the cache.
        """

        if remote_filename not in self.names_to_ids:
            self._list()

        return self.names_to_ids.get(remote_filename)

    def mkdir(self, parent_node_id, folder_name):
        """Create a new folder as a child of a parent node

        Returns the id of the newly created folder.
        """

        data = {'name': folder_name, 'parents': [parent_node_id], 'kind': 'FOLDER'}
        response = self.http_client.post(
            self.metadata_url + 'nodes',
            data=json.dumps(data))
        response.raise_for_status()
        return response.json()['id']

    def multipart_stream(self, metadata, source_path):
        """Generator for multipart/form-data file upload from source file

        Yields the ``metadata`` part (JSON encoded), then the ``content`` part
        read in chunks from ``source_path.open()``, then the closing
        delimiter.
        """

        boundary = self.MULTIPART_BOUNDARY

        yield str.encode('--%s\r\nContent-Disposition: form-data; '
                         'name="metadata"\r\n\r\n' % boundary +
                         '%s\r\n' % json.dumps(metadata) +
                         '--%s\r\n' % boundary)
        yield b'Content-Disposition: form-data; name="content"; filename="i_love_backups"\r\n'
        yield b'Content-Type: application/octet-stream\r\n\r\n'

        with source_path.open() as stream:
            while True:
                f = stream.read(DEFAULT_BUFFER_SIZE)
                if f:
                    yield f
                else:
                    break

        # Closing delimiter ends the body. The Content-Type value that used to
        # be appended here was stray data after the end of the message.
        yield str.encode('\r\n--%s--\r\n' % boundary)

    def read_all_pages(self, url):
        """Iterates over nodes API URL until all pages were read

        Returns the concatenated ``data`` entries of all pages. Raises
        BackendException when a page request does not return status 200.
        """

        result = []
        next_token = ''
        token_param = '&startToken=' if '?' in url else '?startToken='

        while True:
            paginated_url = url + token_param + next_token
            response = self.http_client.get(paginated_url)
            if response.status_code != 200:
                raise BackendException("Pagination failed with status=%s on "
                                       "URL=%s" % (response.status_code,
                                                   paginated_url))

            parsed = response.json()
            if 'data' in parsed and len(parsed['data']) > 0:
                result.extend(parsed['data'])
            else:
                break

            # Do not make another HTTP request if everything is here already
            count = parsed.get('count')
            if count is not None and len(result) >= count:
                break

            if 'nextToken' not in parsed:
                break
            next_token = parsed['nextToken']

        return result

    def raise_for_existing_file(self, remote_filename):
        """Report error when file already existed in location and delete it"""

        self._delete(remote_filename)
        raise BackendException('Upload failed, because there was a file with '
                               'the same name as %s already present. The file was '
                               'deleted, and duplicity will retry the upload unless '
                               'the retry limit has been reached.' % remote_filename)

    def _put(self, source_path, remote_filename):
        """Upload a local file to Amazon Drive

        Raises BackendException when there is not enough space, when a
        conflicting file exists remotely, or when the upload timed out and the
        file did not show up afterwards.
        """

        quota = self.http_client.get(self.metadata_url + 'account/quota')
        quota.raise_for_status()
        available = quota.json()['available']

        source_size = os.path.getsize(source_path.name)

        if source_size > available:
            raise BackendException(
                'Out of space: trying to store "%s" (%d bytes), but only '
                '%d bytes available on Amazon Drive.' % (
                    source_path.name, source_size, available))

        # Just check the cached list, to avoid _list for every new file being
        # uploaded
        if remote_filename in self.names_to_ids:
            log.Debug('File %s seems to already exist on Amazon Drive. Deleting '
                      'before attempting to upload it again.' % remote_filename)
            self._delete(remote_filename)

        metadata = {'name': remote_filename, 'kind': 'FILE',
                    'parents': [self.backup_target_id]}
        headers = {'Content-Type': 'multipart/form-data; boundary=%s'
                                   % self.MULTIPART_BOUNDARY}
        data = self.multipart_stream(metadata, source_path)

        response = self.http_client.post(
            self.content_url + 'nodes?suppress=deduplication',
            data=data,
            headers=headers)

        if response.status_code == 409:  # "409 : Duplicate file exists."
            self.raise_for_existing_file(remote_filename)
        elif response.status_code == 201:
            log.Debug('%s uploaded successfully' % remote_filename)
        elif response.status_code == 408 or response.status_code == 504:
            log.Info('%s upload failed with timeout status code=%d. Speculatively '
                     'waiting for %d seconds to see if Amazon Drive finished the '
                     'upload anyway' % (remote_filename, response.status_code,
                                        globals.timeout))
            # Poll every 15 seconds; explicit floor division keeps the try
            # counter integral.
            tries = globals.timeout // 15
            while tries >= 0:
                tries -= 1
                time.sleep(15)

                remote_size = self._query(remote_filename)['size']
                if source_size == remote_size:
                    log.Debug('Upload turned out to be successful after all.')
                    return
                elif remote_size == -1:
                    log.Debug('Uploaded file is not yet there, %d tries left.'
                              % (tries + 1))
                    continue
                else:
                    # A file of a different size is there: remove it and fail
                    # so the upload gets retried.
                    self.raise_for_existing_file(remote_filename)
            raise BackendException('%s upload failed and file did not show up '
                                   'within time limit.' % remote_filename)
        else:
            log.Debug('%s upload returned an undesirable status code %s'
                      % (remote_filename, response.status_code))
            response.raise_for_status()

        parsed = response.json()
        if 'id' not in parsed:
            raise BackendException('%s was uploaded, but returned JSON does not '
                                   'contain ID of new file. Retrying.\nJSON:\n\n%s'
                                   % (remote_filename, parsed))

        # XXX: The upload may be considered finished before the file shows up
        # in the file listing. As such, the following is required to avoid race
        # conditions when duplicity calls _query or _list.
        self.names_to_ids[parsed['name']] = parsed['id']

    def _get(self, remote_filename, local_path):
        """Download file from Amazon Drive

        Raises BackendException when the remote file does not exist.
        """

        # Resolve the file and start the request before touching the local
        # file, so a failure does not leave an empty/truncated file behind.
        file_id = self.get_file_id(remote_filename)
        if file_id is None:
            raise BackendException(
                'File "%s" cannot be downloaded: it does not exist' %
                remote_filename)

        # Strip a trailing slash so the URL has exactly one separator,
        # whichever form the content URL has.
        response = self.http_client.get(
            self.content_url.rstrip('/') + '/nodes/' + file_id + '/content',
            stream=True)
        response.raise_for_status()

        with local_path.open('wb') as local_file:
            for chunk in response.iter_content(chunk_size=DEFAULT_BUFFER_SIZE):
                if chunk:
                    local_file.write(chunk)
            local_file.flush()

    def _query(self, remote_filename):
        """Retrieve file size info from Amazon Drive

        Returns a dict with the key 'size'; the size is -1 when the file does
        not exist.
        """

        file_id = self.get_file_id(remote_filename)
        if file_id is None:
            return {'size': -1}
        response = self.http_client.get(self.metadata_url + 'nodes/' + file_id)
        response.raise_for_status()

        return {'size': response.json()['contentProperties']['size']}

    def _list(self):
        """List files in Amazon Drive backup folder

        Also rebuilds the name to node id cache.
        """

        files = self.read_all_pages(
            self.metadata_url + 'nodes/' + self.backup_target_id +
            '/children?filters=kind:FILE')

        self.names_to_ids = {f['name']: f['id'] for f in files}

        return list(self.names_to_ids)

    def _delete(self, remote_filename):
        """Delete file from Amazon Drive

        Raises BackendException when the remote file does not exist.
        """

        file_id = self.get_file_id(remote_filename)
        if file_id is None:
            raise BackendException(
                'File "%s" cannot be deleted: it does not exist' % (
                    remote_filename))
        response = self.http_client.put(self.metadata_url + 'trash/' + file_id)
        response.raise_for_status()
        # pop() instead of del: the remote delete already succeeded, a missing
        # cache entry must not turn that into an error.
        self.names_to_ids.pop(remote_filename, None)

# Register this backend for the 'ad://' URL scheme. The class defined in this
# file is ADBackend; the previously referenced name AmazonDriveBackend does
# not exist and raised a NameError on import.
duplicity.backend.register_backend('ad', ADBackend)