3
#tooltool is a lookaside cache implemented in Python
4
#Copyright (C) 2011 John H. Ford <john@johnford.info>
6
#This program is free software; you can redistribute it and/or
7
#modify it under the terms of the GNU General Public License
8
#as published by the Free Software Foundation version 2
10
#This program is distributed in the hope that it will be useful,
11
#but WITHOUT ANY WARRANTY; without even the implied warranty of
12
#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
#GNU General Public License for more details.
15
#You should have received a copy of the GNU General Public License
16
#along with this program; if not, write to the Free Software
17
#Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19
# A manifest file specifies files in that directory that are stored
20
# elsewhere. This file should only contain files in the directory
21
# which the manifest file resides in and it should be called 'manifest.manifest'
32
import simplejson as json # I hear simplejson is faster
36
log = logging.getLogger(__name__)
38
class FileRecordJSONEncoderException(Exception):
    """Raised by FileRecordJSONEncoder when asked to encode something
    that is not a FileRecord."""
39
class InvalidManifest(Exception):
    """Raised when a manifest file does not exist or cannot be read as a
    manifest."""
40
class ExceptionWithFilename(Exception):
    """Base class for errors that concern one specific file.

    The path is stored on the ``filename`` attribute and is also passed to
    ``Exception.__init__`` so that ``str(exc)`` and tracebacks name the
    file instead of showing an empty message.
    """

    def __init__(self, filename):
        # Forward the filename as the exception argument; without it the
        # exception renders as an empty string when logged or printed.
        Exception.__init__(self, filename)
        self.filename = filename
45
class DigestMismatchException(ExceptionWithFilename):
    """File-specific error; going by its name, intended for a file whose
    digest does not match the expected one (no raise site is visible in
    this part of the file)."""
46
class MissingFileException(ExceptionWithFilename):
    """Raised when a FileRecord is asked to validate a file that is not
    present on disk."""
48
class FileRecord(object):
49
def __init__(self, filename, size, digest, algorithm):
51
self.filename = filename
54
self.algorithm = algorithm
55
log.debug("creating %s 0x%x" % (self.__class__.__name__, id(self)))
57
def __eq__(self, other):
60
if self.filename == other.filename and \
61
self.size == other.size and \
62
self.digest == other.digest and \
63
self.algorithm == other.algorithm:
68
def __ne__(self, other):
69
return not self.__eq__(other)
75
return "%s.%s(filename='%s', size='%s', digest='%s', algorithm='%s')" % (__name__,
76
self.__class__.__name__,
77
self.filename, self.size, self.digest, self.algorithm)
80
# Doesn't check validity
81
return os.path.exists(self.filename)
83
def validate_size(self):
85
return self.size == os.path.getsize(self.filename)
87
log.debug("trying to validate size on a missing file, %s", self.filename)
88
raise MissingFileException(filename=self.filename)
90
def validate_digest(self):
    """Check the file's content against the recorded digest.

    Returns True when hashing ``self.filename`` with ``self.algorithm``
    (through ``digest_file``) yields ``self.digest``, False otherwise.

    Raises MissingFileException when the file is not present.
    """
    # NOTE(review): the guard line of this method is not visible in this
    # listing; the ``self.present()`` check is assumed from the
    # "missing file" branch below -- confirm against the real file.
    if self.present():
        with open(self.filename, 'rb') as f:
            return self.digest == digest_file(f, self.algorithm)
    # The path must be a separate logging argument, not part of the format
    # string, otherwise the %s placeholder is never substituted.
    log.debug("trying to validate digest on a missing file, %s", self.filename)
    raise MissingFileException(filename=self.filename)
99
if self.validate_size():
100
if self.validate_digest():
105
if self.present() and self.validate():
106
return "'%s' is present and valid" % self.filename
108
return "'%s' is present and invalid" % self.filename
110
return "'%s' is absent" % self.filename
113
def create_file_record(filename, algorithm):
    """Build a FileRecord describing the file at ``filename``.

    The record stores only the final path component of ``filename``, the
    file's size from ``os.path.getsize`` and its digest as computed by
    ``digest_file`` with ``algorithm``.

    Returns the new FileRecord (callers such as add_files use the result
    as a FileRecord).
    """
    # Hash inside a ``with`` block so the handle is closed even when
    # reading or hashing raises.
    with open(filename, 'rb') as fo:
        digest = digest_file(fo, algorithm)
    stored_filename = os.path.split(filename)[1]
    return FileRecord(stored_filename, os.path.getsize(filename), digest, algorithm)
121
class FileRecordJSONEncoder(json.JSONEncoder):
122
def encode_file_record(self, obj):
123
if not issubclass(type(obj), FileRecord):
124
err = "FileRecordJSONEncoder is only for FileRecord and lists of FileRecords, not %s" % obj.__class__.__name__
126
raise FileRecordJSONEncoderException(err)
128
return {'filename': obj.filename, 'size': obj.size, 'algorithm': obj.algorithm, 'digest': obj.digest}
130
def default(self, f):
131
if issubclass(type(f), list):
134
record_list.append(self.encode_file_record(i))
137
return self.encode_file_record(f)
140
class FileRecordJSONDecoder(json.JSONDecoder):
141
"""I help the json module materialize a FileRecord from
142
a JSON file. I understand FileRecords and lists of
143
FileRecords. I ignore things that I don't expect for now"""
144
# TODO: make this more explicit in what it's looking for
145
# and error out on unexpected things
146
def process_file_records(self, obj):
147
if isinstance(obj, list):
150
record = self.process_file_records(i)
151
if issubclass(type(record), FileRecord):
152
record_list.append(record)
154
if isinstance(obj, dict) and \
155
len(obj.keys()) == 4 and \
156
obj.has_key('filename') and \
157
obj.has_key('size') and \
158
obj.has_key('algorithm') and \
159
obj.has_key('digest'):
160
rv = FileRecord(obj['filename'], obj['size'], obj['digest'], obj['algorithm'])
161
log.debug("materialized %s" % rv)
166
decoded = json.JSONDecoder.decode(self, s)
167
rv = self.process_file_records(decoded)
171
class Manifest(object):
173
valid_formats = ('json',)
175
def __init__(self, file_records=[]):
176
self.file_records = file_records
178
def __eq__(self, other):
181
if len(self.file_records) != len(other.file_records):
182
log.debug('Manifests differ in number of files')
184
#TODO: Lists in a different order should be equal
185
for record in range(0,len(self.file_records)):
186
if self.file_records[record] != other.file_records[record]:
187
log.debug('FileRecords differ, %s vs %s' % (self.file_records[record],
188
other.file_records[record]))
192
def __deepcopy__(self, memo):
193
# This is required for a deep copy
194
return Manifest(self.file_records[:])
197
return Manifest(self.file_records)
200
return Manifest(self.file_records[:])
203
return all(i.present() for i in self.file_records)
205
def validate_sizes(self):
206
return all(i.validate_size() for i in self.file_records)
208
def validate_digests(self):
209
return all(i.validate_digest() for i in self.file_records)
212
return all(i.validate() for i in self.file_records)
216
self.file_records.sort(key=lambda x: x.size)
218
def load(self, data_file, fmt='json'):
219
assert fmt in self.valid_formats
222
self.file_records.extend(json.load(data_file, cls=FileRecordJSONDecoder))
225
raise InvalidManifest("trying to read invalid manifest file")
227
def loads(self, data_string, fmt='json'):
228
assert fmt in self.valid_formats
231
self.file_records.extend(json.loads(data_string, cls=FileRecordJSONDecoder))
234
raise InvalidManifest("trying to read invalid manifest file")
236
def dump(self, output_file, fmt='json'):
237
assert fmt in self.valid_formats
240
rv = json.dump(self.file_records, output_file, indent=0, cls=FileRecordJSONEncoder)
241
print >> output_file, ''
244
def dumps(self, fmt='json'):
245
assert fmt in self.valid_formats
248
return json.dumps(self.file_records, cls=FileRecordJSONEncoder)
251
def digest_file(f, a):
252
"""I take a file like object 'f' and return a hex-string containing
253
of the result of the algorithm 'a' applied to 'f'."""
256
data = f.read(chunk_size)
259
data = f.read(chunk_size)
260
if hasattr(f, 'name'):
261
log.debug('hashed %s with %s to be %s', f.name, a, h.hexdigest())
263
log.debug('hashed a file with %s to be %s', a, h.hexdigest())
266
# TODO: write tests for this function
267
def open_manifest(manifest_file):
268
"""I know how to take a filename and load it into a Manifest object"""
269
if os.path.exists(manifest_file):
270
manifest = Manifest()
271
with open(manifest_file) as f:
273
log.debug("loaded manifest from file '%s'" % manifest_file)
276
log.debug("tried to load absent file '%s' as manifest" % manifest_file)
277
raise InvalidManifest("manifest file '%s' does not exist" % manifest_file)
279
# TODO: write tests for this function
280
def list_manifest(manifest_file):
281
"""I know how print all the files in a location"""
283
manifest = open_manifest(manifest_file)
284
except InvalidManifest:
285
log.error("failed to load manifest file at '%s'" % manifest_file)
287
for f in manifest.file_records:
288
print "%s\t%s\t%s" % ("P" if f.present() else "-",
289
"V" if f.present() and f.validate() else "-",
293
def validate_manifest(manifest_file):
294
"""I validate that all files in a manifest are present and valid but
295
don't fetch or delete them if they aren't"""
297
manifest = open_manifest(manifest_file)
298
except InvalidManifest:
299
log.error("failed to load manifest file at '%s'" % manifest_file)
303
for f in manifest.file_records:
305
absent_files.append(f)
308
invalid_files.append(f)
309
if len(invalid_files + absent_files) == 0:
314
# TODO: write tests for this function
315
def add_files(manifest_file, algorithm, filenames):
316
# returns True if all files successfully added, False if not
317
# and doesn't catch library Exceptions. If any files are already
318
# tracked in the manifest, return will be False because they weren't
320
all_files_added = True
321
# Create an old_manifest object to add to
322
if os.path.exists(manifest_file):
323
old_manifest = open_manifest(manifest_file)
325
old_manifest = Manifest()
326
log.debug("creating a new manifest file")
327
new_manifest = Manifest() # use a different manifest for the output
328
for filename in filenames:
329
log.debug("adding %s" % filename)
330
path, name = os.path.split(filename)
331
new_fr = create_file_record(filename, algorithm)
332
log.debug("appending a new file record to manifest file")
334
for fr in old_manifest.file_records:
335
log.debug("manifest file has '%s'" % "', ".join([x.filename for x in old_manifest.file_records]))
336
if new_fr == fr and new_fr.validate():
337
# TODO: Decide if this case should really cause a False return
338
log.info("file already in old_manifest file and matches")
340
elif new_fr == fr and not new_fr.validate():
341
log.error("file already in old_manifest file but is invalid")
343
if filename == fr.filename:
344
log.error("manifest already contains file named %s" % filename)
347
new_manifest.file_records.append(new_fr)
348
log.debug("added '%s' to manifest" % filename)
350
all_files_added = False
351
with open(manifest_file, 'wb') as output:
352
new_manifest.dump(output, fmt='json')
353
return all_files_added
356
# TODO: write tests for this function
357
def fetch_file(base_url, file_record, overwrite=False, grabchunk=1024*4):
358
# A file which is requested to be fetched that exists locally will be hashed.
359
# If the hash matches the requested file's hash, nothing will be done and the
360
# function will return. If the function is told to overwrite and there is a
361
# digest mismatch, the existing file will be overwritten
362
if file_record.present():
363
if file_record.validate():
364
log.info("existing '%s' is valid, not fetching" % file_record.filename)
367
log.info("overwriting '%s' as requested" % file_record.filename)
369
# All of the following is for a useful error message
370
with open(file_record.filename, 'rb') as f:
371
d = digest_file(f, file_record.algorithm)
372
log.error("digest mismatch between manifest(%s...) and local file(%s...)" % \
373
(file_record.digest[:8], d[:8]))
374
log.debug("full digests: manifest (%s) local file (%s)" % (file_record.digest, d))
378
# Generate the URL for the file on the server side
379
url = "%s/%s/%s" % (base_url, file_record.algorithm, file_record.digest)
381
log.debug("fetching from '%s'" % url)
383
# TODO: This should be abstracted to make generic retrieval protocol handling easy
384
# Well, the file doesn't exist locally. Lets fetch it.
386
f = urllib2.urlopen(url)
387
log.debug("opened %s for reading" % url)
388
with open(file_record.filename, 'wb') as out:
392
# TODO: print statistics as file transfers happen both for info and to stop
394
indata = f.read(grabchunk)
399
if size != file_record.size:
400
log.error("transfer from %s to %s failed due to a difference of %d bytes" % (url,
401
file_record.filename, file_record.size - size))
403
log.info("fetched %s" % file_record.filename)
404
except (urllib2.URLError, urllib2.HTTPError) as e:
405
log.error("failed to fetch '%s': %s" % (file_record.filename, e),
409
log.error("failed to write to '%s'" % file_record.filename,
415
# TODO: write tests for this function
416
def fetch_files(manifest_file, base_url, overwrite, filenames=[]):
417
# Lets load the manifest file
419
manifest = open_manifest(manifest_file)
420
except InvalidManifest:
421
log.error("failed to load manifest file at '%s'" % manifest_file)
423
# We want to track files that fail to be fetched as well as
424
# files that are fetched
427
# Lets go through the manifest and fetch the files that we want
429
for f in manifest.file_records:
430
if f.filename in filenames or len(filenames) == 0:
431
log.debug("fetching %s" % f.filename)
432
if fetch_file(base_url, f, overwrite):
433
fetched_files.append(f)
435
failed_files.append(f.filename)
437
log.debug("skipping %s" % f.filename)
439
# Even if we get the file, lets ensure that it matches what the
441
for localfile in fetched_files:
442
if not localfile.validate():
443
log.error("'%s'" % localfile.describe())
445
# If we failed to fetch or validate a file, we need to fail
446
if len(failed_files) > 0:
447
log.error("The following files failed: '%s'" % "', ".join(failed_files))
452
# TODO: write tests for this function
453
def process_command(options, args):
454
""" I know how to take a list of program arguments and
455
start doing the right thing with them"""
458
log.debug("processing '%s' command with args '%s'" % (cmd, '", "'.join(cmd_args)))
459
log.debug("using options: %s" % options)
461
return list_manifest(options['manifest'])
462
if cmd == 'validate':
463
return validate_manifest(options['manifest'])
465
return add_files(options['manifest'], options['algorithm'], cmd_args)
467
if not options.has_key('base_url') or options.get('base_url') is None:
468
log.critical('fetch command requires url option')
470
return fetch_files(options['manifest'], options['base_url'], options['overwrite'], cmd_args)
472
log.critical('command "%s" is not implemented' % cmd)
476
# http://hostname/algorithm/hash
477
# example: http://people.mozilla.org/sha1/1234567890abcedf
478
# This will make it possible to have the server allow clients to
479
# use different algorithms than what was uploaded to the server
481
# TODO: Implement the following features:
482
# -optimization: do small files first, justification is that they are faster
483
# and cause a faster failure if they are invalid
485
# -local renames i.e. call the file one thing on the server and
486
# something different locally
487
# -deal with the cases:
488
# -local data matches file requested with different filename
489
# -two different files with same name, different hash
490
# -?only ever locally to digest as filename, symlink to real name
491
# -?maybe deal with files as a dir of the filename with all files in that dir as the versions of that file
492
# - e.g. ./python-2.6.7.dmg/0123456789abcdef and ./python-2.6.7.dmg/abcdef0123456789
495
# Set up logging, for now just to the console
496
ch = logging.StreamHandler()
497
cf = logging.Formatter("%(levelname)s - %(message)s")
500
# Set up option parsing
501
parser = optparse.OptionParser()
502
# I wish there was a way to say "only allow args to be
503
# sequential and at the end of the argv.
504
# OH! i could step through sys.argv and check for things starting without -/-- before things starting with them
505
parser.add_option('-q', '--quiet', default=False,
506
dest='quiet', action='store_true')
507
parser.add_option('-v', '--verbose', default=False,
508
dest='verbose', action='store_true')
509
parser.add_option('-m', '--manifest', default='manifest.tt',
510
dest='manifest', action='store',
511
help='specify the manifest file to be operated on')
512
parser.add_option('-d', '--algorithm', default='sha512',
513
dest='algorithm', action='store',
514
help='openssl hashing algorithm to use')
515
parser.add_option('-o', '--overwrite', default=False,
516
dest='overwrite', action='store_true',
517
help='if fetching, remote copy will overwrite a local copy that is different. ')
518
parser.add_option('--url', dest='base_url', action='store',
519
help='base url for fetching files')
520
parser.add_option('--ignore-config-files', action='store_true', default=False,
521
dest='ignore_cfg_files')
522
(options_obj, args) = parser.parse_args()
523
# Dictionaries are easier to work with
524
options = vars(options_obj)
527
# Use some of the option parser to figure out application
529
if options.get('verbose'):
530
ch.setLevel(logging.DEBUG)
531
elif options.get('quiet'):
532
ch.setLevel(logging.ERROR)
534
ch.setLevel(logging.INFO)
537
cfg_file = ConfigParser.SafeConfigParser()
538
if not options.get("ignore_cfg_files"):
539
read_files = cfg_file.read(['/etc/tooltool', os.path.expanduser('~/.tooltool'),
540
os.path.join(os.getcwd(), '.tooltool')])
541
log.debug("read in the config files '%s'" % '", '.join(read_files))
543
log.debug("skipping config files")
545
for option in ('base_url', 'algorithm'):
546
if not options.get(option):
548
options[option] = cfg_file.get('general', option)
549
log.debug("read '%s' as '%s' from cfg_file" % (option, options[option]))
550
except (ConfigParser.NoSectionError, ConfigParser.NoOptionError) as e:
551
log.debug("%s in config file" % e, exc_info=True)
553
if not options.has_key('manifest'):
554
parser.error("no manifest file specified")
557
parser.error('You must specify a command')
558
exit(0 if process_command(options, args) else 1)
560
if __name__ == "__main__":
563
log.addHandler(logging.NullHandler())
564
#log.addHandler(logging.StreamHandler())