# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
import sys, os, subprocess, struct, re
# ZIP local file header layout (PKWARE APPNOTE section 4.3.7).
# Each entry is (field name, type): either a fixed-width integer type
# ("uint16"/"uint32") or the name of another field that holds this
# variable-length field's size.
local_file_header = [
    ("signature", "uint32"),
    ("min_version", "uint16"),
    ("general_flag", "uint16"),
    ("compression", "uint16"),
    ("lastmod_time", "uint16"),
    ("lastmod_date", "uint16"),
    ("crc32", "uint32"),
    ("compressed_size", "uint32"),
    ("uncompressed_size", "uint32"),
    ("filename_size", "uint16"),
    ("extra_field_size", "uint16"),
    ("filename", "filename_size"),
    ("extra_field", "extra_field_size"),
    ("data", "compressed_size")
]
# ZIP central directory file header layout (PKWARE APPNOTE section 4.3.12).
# The "offset" field locates the entry's local file header in the archive.
cdir_entry = [
    ("signature", "uint32"),
    ("creator_version", "uint16"),
    ("min_version", "uint16"),
    ("general_flag", "uint16"),
    ("compression", "uint16"),
    ("lastmod_time", "uint16"),
    ("lastmod_date", "uint16"),
    ("crc32", "uint32"),
    ("compressed_size", "uint32"),
    ("uncompressed_size", "uint32"),
    ("filename_size", "uint16"),
    ("extrafield_size", "uint16"),
    ("filecomment_size", "uint16"),
    ("disknum", "uint16"),
    ("internal_attr", "uint16"),
    ("external_attr", "uint32"),
    ("offset", "uint32"),
    ("filename", "filename_size"),
    ("extrafield", "extrafield_size"),
    ("filecomment", "filecomment_size")
]
# ZIP end-of-central-directory record layout (PKWARE APPNOTE section 4.3.16).
cdir_end = [
    ("signature", "uint32"),
    ("disk_num", "uint16"),
    ("cdir_disk", "uint16"),
    ("disk_entries", "uint16"),
    ("cdir_entries", "uint16"),
    ("cdir_size", "uint32"),
    ("cdir_offset", "uint32"),
    ("comment_size", "uint16")
]
# struct-module format codes for the fixed-width field types used above.
type_mapping = {"uint32": "I", "uint16": "H"}

def format_struct(format):
    """Build a little-endian struct format string for *format*.

    Returns (fmt, string_fields): fmt covers only the fixed-width fields,
    while string_fields maps each variable-length field name to the name
    of the field that stores its length.
    """
    string_fields = {}
    fmt = "<"
    for (name, value) in iter(format):
        if value in type_mapping:
            fmt += type_mapping[value][0]
        else:
            # Not a primitive type: the value names the sizing field.
            string_fields[name] = value
    return (fmt, string_fields)
def size_of(format):
    """Return the byte size of the fixed-width portion of *format*."""
    return struct.calcsize(format_struct(format)[0])
class MyStruct:
    """Mutable object view over one parsed struct.

    Field values live in self.__dict__["struct_members"]; attribute access
    is routed there by __getattr__/__setattr__, which is why __init__ must
    write through __dict__ directly.
    """

    def __init__(self, format, string_fields):
        self.__dict__["struct_members"] = {}
        self.__dict__["format"] = format
        self.__dict__["string_fields"] = string_fields

    def addMember(self, name, value):
        # Direct insertion, bypassing the existing-member check in __setattr__.
        self.__dict__["struct_members"][name] = value

    def __getattr__(self, item):
        try:
            return self.__dict__["struct_members"][item]
        except KeyError:
            pass
        # Debug aid before failing: show which members do exist.
        print(self.__dict__["struct_members"])
        raise AttributeError(item)

    def __setattr__(self, item, value):
        if item in self.__dict__["struct_members"]:
            self.__dict__["struct_members"][item] = value
        else:
            # Only members created via addMember may be assigned.
            raise AttributeError(item)

    def pack(self):
        """Serialize back to raw data: fixed-width fields via struct.pack,
        variable-length (string) fields appended afterwards in field order."""
        extra_data = ""
        values = []
        string_fields = self.__dict__["string_fields"]
        struct_members = self.__dict__["struct_members"]
        format = self.__dict__["format"]
        for (name, _) in format:
            if name in string_fields:
                extra_data = extra_data + struct_members[name]
            else:
                values.append(struct_members[name])
        return struct.pack(format_struct(format)[0], *values) + extra_data
# Signature of the ZIP end-of-central-directory record (APPNOTE 4.3.16).
# NOTE(review): this constant was dropped by the mangled extraction; the
# directory-end signature check in optimizejar depends on it.
ENDSIG = 0x06054b50

def assert_true(cond, msg):
    """Print *msg* and abort the whole script unless *cond* holds."""
    if not cond:
        print(msg)
        exit(1)
class BinaryBlob:
    """Random-access reader over a whole file held in memory.

    Tracks a current offset so successive reads can continue just past
    where the previous read ended.
    """

    def __init__(self, f):
        self.data = open(f, "rb").read()
        self.offset = 0
        self.length = len(self.data)

    def readAt(self, pos, length):
        # Reads are absolute; the cursor advances to just past the read.
        self.offset = pos + length
        return self.data[pos:self.offset]

    def read_struct(self, format, offset=None):
        """Parse one struct described by *format* at *offset* (default:
        the current cursor) and return it as a MyStruct.

        Variable-length fields are read immediately after the fixed part,
        sized by the fields that describe them.  The result is re-packed
        and compared against the raw bytes as a serialization sanity check.
        """
        if offset is None:
            offset = self.offset
        (fstr, string_fields) = format_struct(format)
        size = struct.calcsize(fstr)
        data = self.readAt(offset, size)
        ret = struct.unpack(fstr, data)
        retstruct = MyStruct(format, string_fields)
        i = 0
        for (name, _) in iter(format):
            if not name in string_fields:
                retstruct.addMember(name, ret[i])
                i = i + 1
            else:
                # zip has data fields which are described by other struct
                # fields, this does additional reads to fill em in
                member_desc = string_fields[name]
                member_data = self.readAt(self.offset, retstruct.__getattr__(member_desc))
                retstruct.addMember(name, member_data)
        # sanity check serialization code
        data = self.readAt(offset, self.offset - offset)
        out_data = retstruct.pack()
        assert_true(out_data == data,
                    "Serialization fail %d !=%d" % (len(out_data), len(data)))
        return retstruct
def optimizejar(jar, outjar, inlog=None):
    """Rewrite *jar* into *outjar*.

    With *inlog* (a text file of entry names, one per line) the jar is
    "optimized": entries are reordered to match the log, the central
    directory is moved to the front of the file (offset 4), and the first
    4 bytes record how far the startup data reaches (readahead hint).
    Without a log the jar is "deoptimized" back to a normal layout, and
    the list of entries that fell inside the recorded readahead window is
    returned so a log can be written.  Directory entries and extra-field
    data are stripped in both modes.
    """
    if inlog is not None:
        inlog = open(inlog).read().rstrip()
        # in the case of an empty log still move the index forward
        if len(inlog) == 0:
            inlog = []
        else:
            inlog = inlog.split("\n")
    outlog = []
    jarblob = BinaryBlob(jar)
    dirend = jarblob.read_struct(cdir_end, jarblob.length - size_of(cdir_end))
    assert_true(dirend.signature == ENDSIG, "no signature in the end")
    cdir_offset = dirend.cdir_offset
    readahead = 0
    if inlog is None and cdir_offset == 4:
        # Optimized jars stash the readahead byte count in the first word.
        readahead = struct.unpack("<I", jarblob.readAt(0, 4))[0]
        print("%s: startup data ends at byte %d" % (outjar, readahead))

    total_stripped = 0
    jarblob.offset = cdir_offset
    central_directory = []
    for i in range(0, dirend.cdir_entries):
        entry = jarblob.read_struct(cdir_entry)
        if entry.filename[-1:] == "/":
            # Directory entries are dropped entirely.
            total_stripped += len(entry.pack())
        else:
            total_stripped += entry.extrafield_size
        central_directory.append(entry)

    reordered_count = 0
    if inlog is not None:
        dup_guard = set()
        for ordered_name in inlog:
            if ordered_name in dup_guard:
                continue
            dup_guard.add(ordered_name)
            found = False
            for i in range(reordered_count, len(central_directory)):
                if central_directory[i].filename == ordered_name:
                    # swap the cdir entries
                    tmp = central_directory[i]
                    central_directory[i] = central_directory[reordered_count]
                    central_directory[reordered_count] = tmp
                    reordered_count = reordered_count + 1
                    found = True
                    break
            if not found:
                print("Can't find '%s' in %s" % (ordered_name, jar))

    outfd = open(outjar, "wb")
    out_offset = 0
    if inlog is not None:
        # have to put central directory at offset 4 cos 0 confuses some tools.
        # This also lets us specify how many entries should be preread
        dirend.cdir_offset = 4
        # make room for central dir + end of dir + 4 extra bytes at front
        out_offset = dirend.cdir_offset + dirend.cdir_size + size_of(cdir_end) - total_stripped
        outfd.seek(out_offset)

    cdir_data = ""
    written_count = 0
    crc_mapping = {}
    dups_found = 0
    dupe_bytes = 0
    # store number of bytes suggested for readahead
    for entry in central_directory:
        # read in the header twice..first for comparison, second time for
        # convenience when writing out
        jarfile = jarblob.read_struct(local_file_header, entry.offset)
        assert_true(jarfile.filename == entry.filename, "Directory/Localheader mismatch")
        # drop directory entries
        if entry.filename[-1:] == "/":
            total_stripped += len(jarfile.pack())
            dirend.cdir_entries -= 1
            continue
        # drop extra field data
        else:
            total_stripped += jarfile.extra_field_size
        entry.extrafield = jarfile.extra_field = ""
        entry.extrafield_size = jarfile.extra_field_size = 0
        # Normalize timestamps to 2010-01-01 00:00 for reproducible output.
        entry.lastmod_date = jarfile.lastmod_date = ((2010 - 1980) << 9) | (1 << 5) | 1
        entry.lastmod_time = jarfile.lastmod_time = 0
        data = jarfile.pack()
        outfd.write(data)
        old_entry_offset = entry.offset
        entry.offset = out_offset
        out_offset = out_offset + len(data)
        entry_data = entry.pack()
        cdir_data += entry_data
        expected_len = entry.filename_size + entry.extrafield_size + entry.filecomment_size
        # A packed cdir entry is its fixed-width header plus the variable
        # fields; the original "!=" comparison was vacuously true.
        assert_true(len(entry_data) == size_of(cdir_entry) + expected_len,
                    "%s entry size - expected:%d got:%d" % (entry.filename, len(entry_data), expected_len))
        written_count += 1

        if entry.crc32 in crc_mapping:
            dups_found += 1
            dupe_bytes += entry.compressed_size + len(data) + len(entry_data)
            print("%s\n\tis a duplicate of\n%s\n---" % (entry.filename, crc_mapping[entry.crc32]))
        else:
            crc_mapping[entry.crc32] = entry.filename

        if inlog is not None:
            if written_count == reordered_count:
                readahead = out_offset
                print("%s: startup data ends at byte %d" % (outjar, readahead))
            elif written_count < reordered_count:
                pass
                #print("%s @ %d" % (entry.filename, out_offset))
        elif readahead >= old_entry_offset + len(data):
            # Entry lay inside the recorded readahead window: log it.
            outlog.append(entry.filename)
            reordered_count += 1

    if inlog is None:
        dirend.cdir_offset = out_offset

    if dups_found > 0:
        print("WARNING: Found %d duplicate files taking %d bytes" % (dups_found, dupe_bytes))

    dirend.cdir_size = len(cdir_data)
    dirend.disk_entries = dirend.cdir_entries
    dirend_data = dirend.pack()
    assert_true(size_of(cdir_end) == len(dirend_data),
                "Failed to serialize directory end correctly. Serialized size;%d, expected:%d" % (len(dirend_data), size_of(cdir_end)))

    outfd.seek(dirend.cdir_offset)
    outfd.write(cdir_data)
    outfd.write(dirend_data)

    # for ordered jars the central directory is written in the begining of
    # the file, so a second central-directory entry has to be written in
    # the end of the file
    if inlog is not None:
        outfd.seek(0)
        outfd.write(struct.pack("<I", readahead))
        outfd.seek(out_offset)
        outfd.write(dirend_data)

    outfd.close()
    print("Stripped %d bytes" % total_stripped)
    print("%s %d/%d in %s" % (("Ordered" if inlog is not None else "Deoptimized"),
                              reordered_count, len(central_directory), outjar))
    return outlog
# Command-line shape check: mode flag plus three directory arguments.
if len(sys.argv) != 5:
    print("Usage: %s --optimize|--deoptimize JAR_LOG_DIR IN_JAR_DIR OUT_JAR_DIR" % sys.argv[0])
    exit(1)

# Files to process: names ending in ".jar" (the trailing 'r' is optional).
jar_regex = re.compile("\\.jar?$")
def optimize(JAR_LOG_DIR, IN_JAR_DIR, OUT_JAR_DIR):
    """Optimize every jar in IN_JAR_DIR into OUT_JAR_DIR, using the
    matching "<name>.log" from JAR_LOG_DIR when one exists."""
    ls = os.listdir(IN_JAR_DIR)
    for jarfile in ls:
        if not re.search(jar_regex, jarfile):
            continue
        injarfile = os.path.join(IN_JAR_DIR, jarfile)
        outjarfile = os.path.join(OUT_JAR_DIR, jarfile)
        logfile = os.path.join(JAR_LOG_DIR, jarfile + ".log")
        if not os.path.isfile(logfile):
            # No startup log recorded for this jar: copy without reordering.
            logfile = None
        optimizejar(injarfile, outjarfile, logfile)
def deoptimize(JAR_LOG_DIR, IN_JAR_DIR, OUT_JAR_DIR):
    """Deoptimize every jar in IN_JAR_DIR into OUT_JAR_DIR, writing the
    recovered readahead entry list to JAR_LOG_DIR/<name>.log."""
    if not os.path.exists(JAR_LOG_DIR):
        os.makedirs(JAR_LOG_DIR)

    ls = os.listdir(IN_JAR_DIR)
    for jarfile in ls:
        if not re.search(jar_regex, jarfile):
            continue
        injarfile = os.path.join(IN_JAR_DIR, jarfile)
        outjarfile = os.path.join(OUT_JAR_DIR, jarfile)
        logfile = os.path.join(JAR_LOG_DIR, jarfile + ".log")
        log = optimizejar(injarfile, outjarfile, None)
        open(logfile, "wb").write("\n".join(log))
def main():
    """Dispatch --optimize/--deoptimize across the three directory args."""
    MODE = sys.argv[1]
    JAR_LOG_DIR = sys.argv[2]
    IN_JAR_DIR = sys.argv[3]
    OUT_JAR_DIR = sys.argv[4]
    if MODE == "--optimize":
        optimize(JAR_LOG_DIR, IN_JAR_DIR, OUT_JAR_DIR)
    elif MODE == "--deoptimize":
        deoptimize(JAR_LOG_DIR, IN_JAR_DIR, OUT_JAR_DIR)
    else:
        print("Unknown mode %s" % MODE)
        exit(1)

if __name__ == '__main__':
    main()