#!/usr/local/bin/python
# -*- coding: latin-1 -*-
"""
OleFileIO_PL:
    Module to read Microsoft OLE2 files (also called Structured Storage or
    Microsoft Compound Document File Format), such as Microsoft Office
    documents, Image Composer and FlashPix files, Outlook messages, ...

version 0.21 2010-01-22 Philippe Lagadec - http://www.decalage.info

Project website: http://www.decalage.info/python/olefileio

Improved version of the OleFileIO module from PIL library v1.1.6
See: http://www.pythonware.com/products/pil/index.htm

The Python Imaging Library (PIL) is
    Copyright (c) 1997-2005 by Secret Labs AB
    Copyright (c) 1995-2005 by Fredrik Lundh
OleFileIO_PL changes are Copyright (c) 2005-2010 by Philippe Lagadec

See source code and LICENSE.txt for information on usage and redistribution.

WARNING: THIS IS (STILL) WORK IN PROGRESS.
"""
26
__author__ = "Fredrik Lundh (Secret Labs AB), Philippe Lagadec"
27
__date__ = "2010-01-22"
30
#--- LICENSE ------------------------------------------------------------------
32
# OleFileIO_PL is an improved version of the OleFileIO module from the
33
# Python Imaging Library (PIL).
35
# OleFileIO_PL changes are Copyright (c) 2005-2010 by Philippe Lagadec
37
# The Python Imaging Library (PIL) is
38
# Copyright (c) 1997-2005 by Secret Labs AB
39
# Copyright (c) 1995-2005 by Fredrik Lundh
41
# By obtaining, using, and/or copying this software and/or its associated
42
# documentation, you agree that you have read, understood, and will comply with
43
# the following terms and conditions:
45
# Permission to use, copy, modify, and distribute this software and its
46
# associated documentation for any purpose and without fee is hereby granted,
47
# provided that the above copyright notice appears in all copies, and that both
48
# that copyright notice and this permission notice appear in supporting
49
# documentation, and that the name of Secret Labs AB or the author(s) not be used
50
# in advertising or publicity pertaining to distribution of the software
51
# without specific, written prior permission.
53
# SECRET LABS AB AND THE AUTHORS DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS
54
# SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
55
# IN NO EVENT SHALL SECRET LABS AB OR THE AUTHORS BE LIABLE FOR ANY SPECIAL,
56
# INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
57
# LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
58
# OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
59
# PERFORMANCE OF THIS SOFTWARE.
61
#-----------------------------------------------------------------------------
62
# CHANGELOG: (only OleFileIO_PL changes compared to PIL 1.1.6)
63
# 2005-05-11 v0.10 PL: - a few fixes for Python 2.4 compatibility
64
# (all changes flagged with [PL])
65
# 2006-02-22 v0.11 PL: - a few fixes for some Office 2003 documents which raise
66
# exceptions in _OleStream.__init__()
67
# 2006-06-09 v0.12 PL: - fixes for files above 6.8MB (DIFAT in loadfat)
68
# - added some constants
69
# - added header values checks
70
# - added some docstrings
71
# - getsect: bugfix in case sectors >512 bytes
72
# - getsect: added conformity checks
73
# - DEBUG_MODE constant to activate debug display
74
# 2007-09-04 v0.13 PL: - improved/translated (lots of) comments
76
# - converted tabs to 4 spaces
77
# 2007-11-19 v0.14 PL: - added OleFileIO._raise_defect() to adapt sensitivity
78
# - improved _unicode() to use Python 2.x unicode support
79
# - fixed bug in _OleDirectoryEntry
80
# 2007-11-25 v0.15 PL: - added safety checks to detect FAT loops
81
# - fixed _OleStream which didn't check stream size
82
# - added/improved many docstrings and comments
83
# - moved helper functions _unicode and _clsid out of OleFileIO class
85
# - improved OleFileIO._find() to add Unix path syntax
86
# - OleFileIO._find() is now case-insensitive
87
# - added get_type() and get_rootentry_name()
88
# - rewritten loaddirectory and _OleDirectoryEntry
89
# 2007-11-27 v0.16 PL: - added _OleDirectoryEntry.kids_dict
90
# - added detection of duplicate filenames in storages
91
# - added detection of duplicate references to streams
92
# - added get_size() and exists() to _OleDirectoryEntry
93
# - added isOleFile to check header before parsing
94
# - added __all__ list to control public keywords in pydoc
95
# 2007-12-04 v0.17 PL: - added _load_direntry to fix a bug in loaddirectory
96
# - improved _unicode(), added workarounds for Python <2.3
97
# - added set_debug_mode and -d option to set debug mode
98
# - fixed bugs in OleFileIO.open and _OleDirectoryEntry
99
# - added safety check in main for large or binary properties
101
# - allow size>0 for storages for some implementations
102
# 2007-12-05 v0.18 PL: - fixed several bugs in handling of FAT, MiniFAT and streams
104
# - added option '-c' in main to check all streams
105
# 2009-12-10 v0.19 PL: - bugfix for 32 bit arrays on 64 bits platforms
106
# (thanks to Ben G. and Martijn for reporting the bug)
107
# 2009-12-11 v0.20 PL: - bugfix in OleFileIO.open when filename is not plain str
108
# 2010-01-22 v0.21 PL: - added support for big-endian CPUs such as PowerPC Macs
110
#-----------------------------------------------------------------------------
111
# TODO (for version 1.0):
112
# - TESTS with Linux, MacOSX, Python 1.5.2, various files, PIL, ...
113
# - add underscore to each private method, to avoid their display in
114
# pydoc/epydoc documentation
115
# - replace all raised exceptions with _raise_defect (at least in OleFileIO)
116
# - merge code from _OleStream and OleFileIO.getsect to read sectors
117
# (maybe add a class for FAT and MiniFAT ?)
118
# - add method to check all streams (follow sectors chains without storing all
119
# stream in memory, and report anomalies)
120
# - use _OleDirectoryEntry.kids_dict to improve _find and _list ?
121
# - fix Unicode names handling (find some way to stay compatible with Py1.5.2)
122
# => if possible avoid converting names to Latin-1
123
# - review DIFAT code: fix handling of DIFSECT blocks in FAT (not stop)
124
# - rewrite OleFileIO.getproperties
125
# - improve docstrings to show more sample uses
126
# - see also original notes and FIXME below
127
# - remove all obsolete FIXMEs
130
# - allow _raise_defect to raise different exceptions, not only IOError
131
# - provide a class with named attributes to get well-known properties of
132
# MS Office documents (title, author, ...) ?
133
# - in OleFileIO._open and _OleStream, use size=None instead of 0x7FFFFFFF for
134
# streams with unknown size
135
# - use arrays of int instead of long integers for FAT/MiniFAT, to improve
136
# performance and reduce memory usage ? (possible issue with values >2^31)
137
# - provide tests with unittest (may need write support to create samples)
138
# - move all debug code (and maybe dump methods) to a separate module, with
139
# a class which inherits OleFileIO ?
140
# - fix docstrings to follow epydoc format
141
# - add support for 4K sectors ?
142
# - add support for big endian byte order ?
143
# - create a simple OLE explorer with wxPython
145
# FUTURE EVOLUTIONS to add write support:
146
# 1) add ability to write a stream back on disk from StringIO (same size, no
147
# change in FAT/MiniFAT).
148
# 2) rename a stream/storage if it doesn't change the RB tree
149
# 3) use rbtree module to update the red-black tree + any rename
150
# 4) remove a stream/storage: free sectors in FAT/MiniFAT
151
# 5) allocate new sectors in FAT/MiniFAT
152
# 6) create new storage/stream
153
#-----------------------------------------------------------------------------
156
# THIS IS WORK IN PROGRESS
158
# The Python Imaging Library
159
# $Id: OleFileIO.py 2339 2005-03-25 08:02:17Z fredrik $
161
# stuff to deal with OLE2 Structured Storage files. this module is
162
# used by PIL to read Image Composer and FlashPix files, but can also
163
# be used to read other files of this type.
166
# 1997-01-20 fl Created
167
# 1997-01-22 fl Fixed 64-bit portability quirk
168
# 2003-09-09 fl Fixed typo in OleFileIO.loadfat (noted by Daniel Haertle)
169
# 2004-02-29 fl Changed long hex constants to signed integers
172
# FIXME: sort out sign problem (eliminate long hex constants)
173
# FIXME: change filename to use "a/b/c" instead of ["a", "b", "c"]
174
# FIXME: provide a glob mechanism function (using fnmatchcase)
178
# "FlashPix Format Specification, Appendix A", Kodak and Microsoft,
183
# "If this document and functionality of the Software conflict,
184
# the actual functionality of the Software represents the correct
185
# functionality" -- Microsoft, in the OLE format specification
187
# Copyright (c) Secret Labs AB 1997.
188
# Copyright (c) Fredrik Lundh 1997.
190
# See the README file for information on usage and redistribution.
193
#------------------------------------------------------------------------------
195
import string, StringIO, struct, array, os.path, sys
197
#[PL] Define explicitly the public API to avoid private objects in pydoc:
198
__all__ = ['OleFileIO', 'isOleFile']
#[PL] workaround to fix an issue with array item size on 64 bits systems:
# UINT32 is the array typecode whose items are exactly 4 bytes wide on the
# platform running this module.
if array.array('L').itemsize == 4:
    # on 32 bits platforms, long integers in an array are 32 bits:
    UINT32 = 'L'
elif array.array('I').itemsize == 4:
    # on 64 bits platforms, integers in an array are 32 bits:
    UINT32 = 'I'
else:
    # neither typecode is 4 bytes wide: 32-bit tables cannot be read here
    raise ValueError('Need to fix a bug with 32 bit arrays, please contact author...')
#[PL] These workarounds were inspired from the Path module
# (see http://www.jorendorff.com/articles/python/path/)
#TODO: test with old Python versions

# Pre-2.3 workaround for booleans
try:
    True, False
except NameError:
    # The names are bound through globals() instead of a plain assignment:
    # the effect is the same on the old interpreters that need it, and the
    # module still compiles where True/False are reserved words.
    globals()['True'] = 1
    globals()['False'] = 0

# Pre-2.3 workaround for basestring.
try:
    basestring
except NameError:
    try:
        # is Unicode supported (Python >2.0 or >1.6 ?)
        basestring = (str, unicode)
    except NameError:
        # no unicode type either: plain str is the only string type
        basestring = str

#[PL] Experimental setting: if True, OLE filenames will be kept in Unicode
# if False (default PIL behaviour), all filenames are converted to Latin-1.
KEEP_UNICODE_NAMES = False
#[PL] DEBUG display mode: False by default, use set_debug_mode() or "-d" on
# command line to change it.
DEBUG_MODE = False

def debug_print(msg):
    "Display a debugging message on standard output."
    # print(msg) with a single argument behaves the same as 'print msg'
    print(msg)

def debug_pass(msg):
    "Discard a debugging message (used while debug mode is off)."
    pass

# 'debug' is the name called by the rest of the module; it is rebound by
# set_debug_mode() to either debug_print or debug_pass.
debug = debug_pass

def set_debug_mode(debug_mode):
    """
    Set debug mode on or off, to control display of debugging messages.

    debug_mode: True to display debugging messages, False to discard them
    """
    global DEBUG_MODE, debug
    DEBUG_MODE = debug_mode
    if debug_mode:
        debug = debug_print
    else:
        debug = debug_pass
#TODO: convert this to hex
# Signature expected in the first 8 bytes of an OLE2 file.
MAGIC = '\320\317\021\340\241\261\032\341'

# NOTE: the hexadecimal constants below are written without the 'L' suffix.
# Since Python 2.4 a hex literal above sys.maxint is a positive long, so the
# values are unchanged there, and the literals stay valid on interpreters
# that no longer accept the suffix.

#[PL]: added constants for Sector IDs (from AAF specifications)
MAXREGSECT = 0xFFFFFFFA # maximum SECT
DIFSECT    = 0xFFFFFFFC # (-4) denotes a DIFAT sector in a FAT
FATSECT    = 0xFFFFFFFD # (-3) denotes a FAT sector in a FAT
ENDOFCHAIN = 0xFFFFFFFE # (-2) end of a virtual stream chain
FREESECT   = 0xFFFFFFFF # (-1) unallocated sector

#[PL]: added constants for Directory Entry IDs (from AAF specifications)
MAXREGSID  = 0xFFFFFFFA # maximum directory entry ID
NOSTREAM   = 0xFFFFFFFF # (-1) unallocated directory entry

#[PL] object types in storage (from AAF specifications)
STGTY_EMPTY     = 0 # empty directory entry (according to OpenOffice.org doc)
STGTY_STORAGE   = 1 # element is a storage object
STGTY_STREAM    = 2 # element is a stream object
STGTY_LOCKBYTES = 3 # element is an ILockBytes object
STGTY_PROPERTY  = 4 # element is an IPropertyStorage object
STGTY_ROOT      = 5 # element is a root storage
# --------------------------------------------------------------------
# property types

VT_EMPTY=0; VT_NULL=1; VT_I2=2; VT_I4=3; VT_R4=4; VT_R8=5; VT_CY=6
VT_DATE=7; VT_BSTR=8; VT_DISPATCH=9; VT_ERROR=10; VT_BOOL=11
VT_VARIANT=12; VT_UNKNOWN=13; VT_DECIMAL=14; VT_I1=16; VT_UI1=17
VT_UI2=18; VT_UI4=19; VT_I8=20; VT_UI8=21; VT_INT=22; VT_UINT=23
VT_VOID=24; VT_HRESULT=25; VT_PTR=26; VT_SAFEARRAY=27; VT_CARRAY=28
VT_USERDEFINED=29; VT_LPSTR=30; VT_LPWSTR=31; VT_FILETIME=64
VT_BLOB=65; VT_STREAM=66; VT_STORAGE=67; VT_STREAMED_OBJECT=68
VT_STORED_OBJECT=69; VT_BLOB_OBJECT=70; VT_CF=71; VT_CLSID=72
# NOTE(review): the line holding this constant is missing from the damaged
# listing; restored from the PIL original - confirm against a clean copy.
VT_VECTOR=0x1000

# map property id to name (for debugging purposes)

VT = {}
# A snapshot of the namespace is iterated: the loop itself creates the
# module-level names 'keyword' and 'var', which would otherwise resize the
# dictionary being walked.
for keyword, var in list(vars().items()):
    if keyword[:3] == "VT_":
        VT[var] = keyword
# --------------------------------------------------------------------
# Some common document types (root.clsid fields)

WORD_CLSID = "00020900-0000-0000-C000-000000000046"
#TODO: check Excel, PPT, ...

#[PL]: Defect levels to classify parsing errors - see OleFileIO._raise_defect()
# (levels are ordered: a higher value is a more serious defect)
DEFECT_UNSURE =    10    # a case which looks weird, but not sure it's a defect
DEFECT_POTENTIAL = 20    # a potential defect
DEFECT_INCORRECT = 30    # an error according to specifications, but parsing
                         # can go on
DEFECT_FATAL =     40    # an error which cannot be ignored, parsing is
                         # impossible
#[PL] add useful constants to __all__:
# (a snapshot of the names is iterated, because the loop variable 'key' is
# itself added to the module namespace on the first pass)
for key in list(vars()):
    if key.startswith('STGTY_') or key.startswith('DEFECT_'):
        __all__.append(key)
321
#--- FUNCTIONS ----------------------------------------------------------------
def isOleFile (filename):
    """
    Test if file is an OLE container (according to its header).

    filename: file name or path (str, unicode)
    return: True if OLE, False otherwise.

    Only the first len(MAGIC) bytes are read. Errors raised by open() or
    read() are not caught here.
    """
    f = open(filename, 'rb')
    try:
        header = f.read(len(MAGIC))
    finally:
        # the handle is always released, even if the read fails
        f.close()
    return header == MAGIC
#TODO: replace i16 and i32 with more readable struct.unpack equivalent
def i16(c, o = 0):
    """
    Converts a 2-bytes (16 bits) string to an integer.

    The byte at offset o is the low-order byte (little-endian).

    c: string containing bytes to convert
    o: offset of bytes to convert in string
    return: integer between 0 and 65535
    raises IndexError if c holds fewer than o+2 characters
    """
    return ord(c[o])+(ord(c[o+1])<<8)
def i32(c, o = 0):
    """
    Converts a 4-bytes (32 bits) string to an integer.

    The byte at offset o is the low-order byte (little-endian).

    c: string containing bytes to convert
    o: offset of bytes to convert in string
    return: unsigned integer value of the 4 bytes
    raises IndexError if c holds fewer than o+4 characters
    """
    return int(ord(c[o])+(ord(c[o+1])<<8)+(ord(c[o+2])<<16)+(ord(c[o+3])<<24))
    # [PL]: added int() because "<<" gives long int since Python 2.4
def _clsid(clsid):
    """
    Converts a CLSID to a human-readable string.

    clsid: string of length 16.
    return: "" if all 16 bytes are null, else the CLSID formatted in
            uppercase hexadecimal as XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX
    """
    assert len(clsid) == 16
    if clsid == "\0" * len(clsid):
        # a null CLSID is reported as an empty string
        return ""
    # the first three fields are little-endian integers (read with i32/i16),
    # the last 8 bytes are formatted one by one in storage order:
    return (("%08X-%04X-%04X-%02X%02X-" + "%02X" * 6) %
            ((i32(clsid, 0), i16(clsid, 4), i16(clsid, 6)) +
            tuple(map(ord, clsid[8:16]))))
# UNICODE support for Old Python versions:
# (necessary to handle storages/streams names which use Unicode)

try:
    # is Unicode supported ?
    unicode

    def _unicode(s, errors='replace'):
        """
        Map unicode string to Latin 1. (Python with Unicode support)

        s: UTF-16LE unicode string to convert to Latin-1
        errors: 'replace', 'ignore' or 'strict'. See Python doc for unicode()
        return: Latin-1 encoded string, or the decoded unicode string when
                KEEP_UNICODE_NAMES is true
        raises IOError if the name cannot be decoded or encoded
        """
        #TODO: test if it OleFileIO works with Unicode strings, instead of
        #      converting to Latin-1.
        try:
            # First the string is converted to plain Unicode:
            # (assuming it is encoded as UTF-16 little-endian)
            u = s.decode('UTF-16LE', errors)
            if KEEP_UNICODE_NAMES:
                return u
            else:
                # Second the unicode string is converted to Latin-1
                return u.encode('latin_1', errors)
        except Exception:
            # there was an error during Unicode to Latin-1 conversion:
            raise IOError('incorrect Unicode name')

except NameError:
    def _unicode(s, errors='replace'):
        """
        Map unicode string to Latin 1. (Python without native Unicode support)

        s: UTF-16LE unicode string to convert to Latin-1
        errors: 'replace', 'ignore' or 'strict'. (ignored in this version)
        """
        # If the unicode function does not exist, we assume this is an old
        # Python version without Unicode support.
        # Null bytes are simply removed (this only works with usual Latin-1
        # strings which do not contain unicode characters>256):
        return filter(ord, s)
419
#=== CLASSES ==================================================================
421
#--- _OleStream ---------------------------------------------------------------
class _OleStream(StringIO.StringIO):
    """
    OLE2 Stream

    Returns a read-only file object which can be used to read
    the contents of a OLE stream (instance of the StringIO class).
    To open a stream, use the openstream method in the OleFile class.

    This function can be used with either ordinary streams,
    or ministreams, depending on the offset, sectorsize, and
    fat table arguments.

    Attributes:
        - size: actual size of data stream, after it was opened.
    """

    # FIXME: should store the list of sects obtained by following
    # the fat chain, and load new sectors on demand instead of
    # loading it all in one go.

    def __init__(self, fp, sect, size, offset, sectorsize, fat):
        """
        Constructor for _OleStream class.

        fp        : file object, the OLE container or the MiniFAT stream
        sect      : sector index of first sector in the stream
        size      : total size of the stream, or 0x7FFFFFFF if unknown
        offset    : offset in bytes for the first FAT or MiniFAT sector
        sectorsize: size of one sector
        fat       : array/list of sector indexes (FAT or MiniFAT)
        return    : a StringIO instance containing the OLE stream

        Raises IOError when the sector chain is inconsistent with the FAT or
        with the declared size (chain too long, too short, index out of
        range, truncated sector).
        """
        debug('_OleStream.__init__:')
        debug('  sect=%d (%X), size=%d, offset=%d, sectorsize=%d, len(fat)=%d, fp=%s'
            %(sect,sect,size,offset,sectorsize,len(fat), repr(fp)))
        # for debugging messages, size of file where stream is read:
        if isinstance(fp, StringIO.StringIO):
            filesize = len(fp.getvalue())   # file in MiniFAT
        else:
            filesize = os.path.getsize(fp.name) # file on disk
        #[PL] To detect malformed documents with FAT loops, we compute the
        # expected number of sectors in the stream:
        unknown_size = False
        if size == 0x7FFFFFFF:
            # this is the case when called from OleFileIO._open(), and stream
            # size is not known in advance (for example when reading the
            # Directory stream). Then we can only guess maximum size:
            size = len(fat)*sectorsize
            # and we keep a record that size was unknown:
            unknown_size = True
            debug('  stream with UNKNOWN SIZE')
        # explicit floor division: the result is used as a sector count
        nb_sectors = (size + (sectorsize-1)) // sectorsize
        debug('nb_sectors = %d' % nb_sectors)
        # This number should (at least) be less than the total number of
        # sectors in the given FAT:
        if nb_sectors > len(fat):
            raise IOError('malformed OLE document, stream too large')
        # optimization(?): data is first a list of strings, and join() is called
        # at the end to concatenate all in one string.
        # (this may not be really useful with recent Python versions)
        data = []
        # if size is zero, then first sector index should be ENDOFCHAIN:
        if size == 0 and sect != ENDOFCHAIN:
            debug('size == 0 and sect != ENDOFCHAIN:')
            raise IOError('incorrect OLE sector index for empty stream')
        #[PL] A fixed-length for loop is used instead of an undefined while
        # loop to avoid DoS attacks:
        for i in xrange(nb_sectors):
            # Sector index may be ENDOFCHAIN, but only if size was unknown
            if sect == ENDOFCHAIN:
                if unknown_size:
                    break
                else:
                    # else this means that the stream is smaller than declared:
                    debug('sect=ENDOFCHAIN before expected size')
                    raise IOError('incomplete OLE stream')
            # sector index should be within FAT:
            if sect<0 or sect>=len(fat):
                debug('sect=%d (%X) / len(fat)=%d' % (sect, sect, len(fat)))
                debug('i=%d / nb_sectors=%d' %(i, nb_sectors))
                raise IOError('incorrect OLE FAT, sector index out of range')
            #TODO: merge this code with OleFileIO.getsect() ?
            #TODO: check if this works with 4K sectors:
            try:
                fp.seek(offset + sectorsize * sect)
            except Exception:
                debug('sect=%d, seek=%d, filesize=%d' %
                    (sect, offset+sectorsize*sect, filesize))
                raise IOError('OLE sector index out of range')
            sector_data = fp.read(sectorsize)
            # [PL] check if there was enough data:
            # Note: if sector is the last of the file, sometimes it is not a
            # complete sector (of 512 or 4K), so we may read less than
            # sectorsize.
            if len(sector_data)!=sectorsize and sect!=(len(fat)-1):
                debug('sect=%d / len(fat)=%d, seek=%d / filesize=%d, len read=%d' %
                    (sect, len(fat), offset+sectorsize*sect, filesize, len(sector_data)))
                debug('seek+len(read)=%d' % (offset+sectorsize*sect+len(sector_data)))
                raise IOError('incomplete OLE sector')
            data.append(sector_data)
            # jump to next sector in the FAT:
            try:
                sect = fat[sect]
            except IndexError:
                # [PL] if pointer is out of the FAT an exception is raised
                raise IOError('incorrect OLE FAT, sector index out of range')
        #[PL] Last sector should be a "end of chain" marker:
        if sect != ENDOFCHAIN:
            raise IOError('incorrect last sector index in OLE stream')
        data = "".join(data)
        # Data is truncated to the actual stream size:
        if len(data) >= size:
            data = data[:size]
            # actual stream size is stored for future use:
            self.size = size
        elif unknown_size:
            # actual stream size was not known, now we know the size of read
            # data:
            self.size = len(data)
        else:
            # read data is less than expected:
            debug('len(data)=%d, size=%d' % (len(data), size))
            raise IOError('OLE stream size is less than declared')
        # when all data is read in memory, StringIO constructor is called
        StringIO.StringIO.__init__(self, data)
        # Then the _OleStream object can be used as a read-only file object.
556
#--- _OleDirectoryEntry -------------------------------------------------------
class _OleDirectoryEntry:

    """
    OLE2 Directory Entry

    One instance is built from each 128-byte record of the OLE directory
    stream. Entries of a storage are linked in a red-black tree through
    sid_left / sid_right; sid_child is the root of a storage's own tree.
    """
    #[PL] parsing code moved from OleFileIO.loaddirectory

    # struct to parse directory entries:
    # <: little-endian byte order
    # 64s: string containing entry name in unicode (max 31 chars) + null char
    # H: uint16, number of bytes used in name buffer, including null = (len+1)*2
    # B: uint8, dir entry type (between 0 and 5)
    # B: uint8, color: 0=black, 1=red
    # I: uint32, index of left child node in the red-black tree, NOSTREAM if none
    # I: uint32, index of right child node in the red-black tree, NOSTREAM if none
    # I: uint32, index of child root node if it is a storage, else NOSTREAM
    # 16s: CLSID, unique identifier (only used if it is a storage)
    # I: uint32, user flags
    # 8s: uint64, creation timestamp or zero
    # 8s: uint64, modification timestamp or zero
    # I: uint32, SID of first sector if stream or ministream, SID of 1st sector
    #    of stream containing ministreams if root entry, 0 otherwise
    # I: uint32, total stream size in bytes if stream (low 32 bits), 0 otherwise
    # I: uint32, total stream size in bytes if stream (high 32 bits), 0 otherwise
    STRUCT_DIRENTRY = '<64sHBBIII16sI8s8sIII'
    # size of a directory entry: 128 bytes
    DIRENTRY_SIZE = 128
    assert struct.calcsize(STRUCT_DIRENTRY) == DIRENTRY_SIZE


    def __init__(self, entry, sid, olefile):
        """
        Constructor for an _OleDirectoryEntry object.
        Parses a 128-bytes entry from the OLE Directory stream.

        entry  : string (must be 128 bytes long)
        sid    : index of this directory entry in the OLE file directory
        olefile: OleFileIO containing this directory entry

        Defects found while parsing are reported through
        olefile._raise_defect(), which may raise IOError.
        """
        self.sid = sid
        # ref to olefile is stored for future use
        self.olefile = olefile
        # kids is a list of children entries, if this entry is a storage:
        # (list of _OleDirectoryEntry objects)
        self.kids = []
        # kids_dict is a dictionary of children entries, indexed by their
        # name in lowercase: used to quickly find an entry, and to detect
        # duplicates
        self.kids_dict = {}
        # flag used to detect if the entry is referenced more than once in
        # directory:
        self.used = False
        # decode DirEntry
        # NOTE(review): the unpack target list is missing from the damaged
        # listing; color/dwUserFlags/createTime/modifyTime are restored from
        # the upstream module - confirm against a clean copy.
        (
            name,
            namelength,
            self.entry_type,
            self.color,
            self.sid_left,
            self.sid_right,
            self.sid_child,
            clsid,
            self.dwUserFlags,
            self.createTime,
            self.modifyTime,
            self.isectStart,
            sizeLow,
            sizeHigh
        ) = struct.unpack(_OleDirectoryEntry.STRUCT_DIRENTRY, entry)
        if self.entry_type not in [STGTY_ROOT, STGTY_STORAGE, STGTY_STREAM, STGTY_EMPTY]:
            olefile._raise_defect(DEFECT_INCORRECT, 'unhandled OLE storage type')
        # only first directory entry can (and should) be root:
        if self.entry_type == STGTY_ROOT and sid != 0:
            olefile._raise_defect(DEFECT_INCORRECT, 'duplicate OLE root entry')
        if sid == 0 and self.entry_type != STGTY_ROOT:
            olefile._raise_defect(DEFECT_INCORRECT, 'incorrect OLE root entry')
        #debug (struct.unpack(fmt_entry, entry[:len_entry]))
        # name should be at most 31 unicode characters + null character,
        # so 64 bytes in total (31*2 + 2):
        if namelength>64:
            olefile._raise_defect(DEFECT_INCORRECT, 'incorrect DirEntry name length')
            # if exception not raised, namelength is set to the maximum value:
            namelength = 64
        # only characters without ending null char are kept:
        name = name[:(namelength-2)]
        # name is converted from unicode to Latin-1:
        self.name = _unicode(name)

        debug('DirEntry SID=%d: %s' % (self.sid, repr(self.name)))
        debug(' - type: %d' % self.entry_type)
        debug(' - sect: %d' % self.isectStart)
        debug(' - SID left: %d, right: %d, child: %d' % (self.sid_left,
            self.sid_right, self.sid_child))

        # sizeHigh is only used for 4K sectors, it should be zero for 512 bytes
        # sectors, BUT apparently some implementations set it as 0xFFFFFFFF, 1
        # or some other value so it cannot be raised as a defect in general:
        if olefile.sectorsize == 512:
            if sizeHigh != 0 and sizeHigh != 0xFFFFFFFF:
                debug('sectorsize=%d, sizeLow=%d, sizeHigh=%d (%X)' %
                    (olefile.sectorsize, sizeLow, sizeHigh, sizeHigh))
                olefile._raise_defect(DEFECT_UNSURE, 'incorrect OLE stream size')
            self.size = sizeLow
        else:
            # 64-bit size; the shift promotes to a long integer when needed
            # (Python 2.4 and later)
            self.size = sizeLow + (sizeHigh << 32)
        debug(' - size: %d (sizeLow=%d, sizeHigh=%d)' % (self.size, sizeLow, sizeHigh))

        self.clsid = _clsid(clsid)
        # a storage should have a null size, BUT some implementations such as
        # Word 8 for Mac seem to allow non-null values => Potential defect:
        if self.entry_type == STGTY_STORAGE and self.size != 0:
            olefile._raise_defect(DEFECT_POTENTIAL, 'OLE storage with size>0')
        # check if stream is not already referenced elsewhere:
        if self.entry_type in (STGTY_ROOT, STGTY_STREAM) and self.size>0:
            if self.size < olefile.minisectorcutoff \
            and self.entry_type==STGTY_STREAM: # only streams can be in MiniFAT
                # ministream object
                minifat = True
            else:
                minifat = False
            olefile._check_duplicate_stream(self.isectStart, minifat)


    def build_storage_tree(self):
        """
        Read and build the red-black tree attached to this _OleDirectoryEntry
        object, if it is a storage.
        Note that this method builds a tree of all subentries, so it should
        only be called for the root object once.
        """
        debug('build_storage_tree: SID=%d - %s - sid_child=%d'
            % (self.sid, repr(self.name), self.sid_child))
        if self.sid_child != NOSTREAM:
            # if child SID is not NOSTREAM, then this entry is a storage.
            # Let's walk through the tree of children to fill the kids list:
            self.append_kids(self.sid_child)

            # Note from OpenOffice documentation: the safest way is to
            # recreate the tree because some implementations may store broken
            # red-black trees...

            # in the OLE file, entries are sorted on (length, name).
            # for convenience, we sort them on name instead:
            # (see __cmp__ / __lt__ methods in this class)
            self.kids.sort()


    def append_kids(self, child_sid):
        """
        Walk through red-black tree of children of this directory entry to add
        all of them to the kids list. (recursive method)

        child_sid : index of child directory entry to use, or None when called
                    first time for the root. (only used during recursion)
        """
        #[PL] this method was added to use simple recursion instead of a complex
        # algorithm.
        # if this is not a storage or a leaf of the tree, nothing to do:
        if child_sid == NOSTREAM:
            return
        # check if child SID is in the proper range:
        if child_sid<0 or child_sid>=len(self.olefile.direntries):
            self.olefile._raise_defect(DEFECT_FATAL, 'OLE DirEntry index out of range')
        # get child direntry:
        child = self.olefile._load_direntry(child_sid) #direntries[child_sid]
        debug('append_kids: child_sid=%d - %s - sid_left=%d, sid_right=%d, sid_child=%d'
            % (child.sid, repr(child.name), child.sid_left, child.sid_right, child.sid_child))
        # the directory entries are organized as a red-black tree.
        # (cf. Wikipedia for details)
        # First walk through left side of the tree:
        self.append_kids(child.sid_left)
        # Check if its name is not already used (case-insensitive):
        name_lower = child.name.lower()
        if name_lower in self.kids_dict:
            self.olefile._raise_defect(DEFECT_INCORRECT,
                "Duplicate filename in OLE storage")
        # Then the child_sid _OleDirectoryEntry object is appended to the
        # kids list and dictionary:
        self.kids.append(child)
        self.kids_dict[name_lower] = child
        # Check if kid was not already referenced in a storage:
        if child.used:
            self.olefile._raise_defect(DEFECT_INCORRECT,
                'OLE Entry referenced more than once')
        child.used = True
        # Finally walk through right side of the tree:
        self.append_kids(child.sid_right)
        # Afterwards build kid's own tree if it's also a storage:
        child.build_storage_tree()


    def __cmp__(self, other):
        "Compare entries by name"
        # same result as cmp(self.name, other.name), without the builtin
        return (self.name > other.name) - (self.name < other.name)
        #TODO: replace by the same function as MS implementation ?
        # (order by name length first, then case-insensitive order)


    def __lt__(self, other):
        "Order entries by name (same ordering as __cmp__, used by sort())"
        return self.name < other.name


    def dump(self, tab = 0):
        "Dump this entry, and all its subentries (for debug purposes only)"
        TYPES = ["(invalid)", "(storage)", "(stream)", "(lockbytes)",
                 "(property)", "(root)"]
        # the line is built first and written once: same text as the former
        # sequence of print statements with trailing commas
        line = " "*tab + repr(self.name) + " " + TYPES[self.entry_type]
        if self.entry_type in (STGTY_STREAM, STGTY_ROOT):
            line = line + " " + str(self.size) + " bytes"
        sys.stdout.write(line + "\n")
        if self.entry_type in (STGTY_STORAGE, STGTY_ROOT) and self.clsid:
            sys.stdout.write(" "*tab + "{%s}" % self.clsid + "\n")

        for kid in self.kids:
            kid.dump(tab + 2)
772
#--- OleFileIO ----------------------------------------------------------------
778
This class encapsulates the interface to an OLE 2 structured
779
storage file. Use the {@link listdir} and {@link openstream} methods to
780
access the contents of this file.
782
Object names are given as a list of strings, one for each subentry
783
level. The root entry should be omitted. For example, the following
784
code extracts all image streams from a Microsoft Image Composer file:
786
ole = OleFileIO("fan.mic")
788
for entry in ole.listdir():
789
if entry[1:2] == "Image":
790
fin = ole.openstream(entry)
791
fout = open(entry[0:1], "wb")
798
You can use the viewer application provided with the Python Imaging
799
Library to view the resulting files (which happens to be standard
def __init__(self, filename = None, raise_defects=DEFECT_FATAL):
    """
    Constructor for OleFileIO class.

    filename: file to open.
    raise_defects: minimal level for defects to be raised as exceptions.
    (use DEFECT_FATAL for a typical application, DEFECT_INCORRECT for a
    security-oriented application, see source code for details)
    """
    self._raise_defects_level = raise_defects
    # NOTE(review): the two lines below are missing from the damaged
    # listing and restored from the upstream module - confirm.
    if filename:
        self.open(filename)
def _raise_defect(self, defect_level, message):
    """
    This method should be called for any defect found during file parsing.
    It may raise an IOError exception according to the minimal level chosen
    for the OleFileIO object.

    defect_level: defect level, possible values are:
        DEFECT_UNSURE    : a case which looks weird, but not sure it's a defect
        DEFECT_POTENTIAL : a potential defect
        DEFECT_INCORRECT : an error according to specifications, but parsing can go on
        DEFECT_FATAL     : an error which cannot be ignored, parsing is impossible
    message: string describing the defect, used with raised exception.

    Returns None when the defect is below the level set on the object.
    """
    # added by [PL]
    if defect_level >= self._raise_defects_level:
        raise IOError(message)
835
def open(self, filename):
838
Reads the header, FAT and directory.
840
filename: string-like or file-like object
842
#[PL] check if filename is a string-like or file-like object:
843
# (it is better to check for a read() method)
844
if hasattr(filename, 'read'):
849
self.fp = open(filename, "rb")
850
# old code fails if filename is not a plain string:
851
#if type(filename) == type(""):
852
# self.fp = open(filename, "rb")
856
# lists of streams in FAT and MiniFAT, to detect duplicate references
857
# (list of indexes of first sectors of each stream)
858
self._used_streams_fat = []
859
self._used_streams_minifat = []
861
header = self.fp.read(512)
863
if len(header) != 512 or header[:8] != MAGIC:
864
self._raise_defect(DEFECT_FATAL, "not an OLE2 structured storage file")
866
# [PL] header structure according to AAF specifications:
868
##struct StructuredStorageHeader { // [offset from start (bytes), length (bytes)]
869
##BYTE _abSig[8]; // [00H,08] {0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1,
870
## // 0x1a, 0xe1} for current version
871
##CLSID _clsid; // [08H,16] reserved must be zero (WriteClassStg/
872
## // GetClassFile uses root directory class id)
873
##USHORT _uMinorVersion; // [18H,02] minor version of the format: 33 is
874
## // written by reference implementation
875
##USHORT _uDllVersion; // [1AH,02] major version of the dll/format: 3 for
876
## // 512-byte sectors, 4 for 4 KB sectors
877
##USHORT _uByteOrder; // [1CH,02] 0xFFFE: indicates Intel byte-ordering
878
##USHORT _uSectorShift; // [1EH,02] size of sectors in power-of-two;
879
## // typically 9 indicating 512-byte sectors
880
##USHORT _uMiniSectorShift; // [20H,02] size of mini-sectors in power-of-two;
881
## // typically 6 indicating 64-byte mini-sectors
882
##USHORT _usReserved; // [22H,02] reserved, must be zero
883
##ULONG _ulReserved1; // [24H,04] reserved, must be zero
884
##FSINDEX _csectDir; // [28H,04] must be zero for 512-byte sectors,
885
## // number of SECTs in directory chain for 4 KB
887
##FSINDEX _csectFat; // [2CH,04] number of SECTs in the FAT chain
888
##SECT _sectDirStart; // [30H,04] first SECT in the directory chain
889
##DFSIGNATURE _signature; // [34H,04] signature used for transactions; must
890
## // be zero. The reference implementation
891
## // does not support transactions
892
##ULONG _ulMiniSectorCutoff; // [38H,04] maximum size for a mini stream;
893
## // typically 4096 bytes
894
##SECT _sectMiniFatStart; // [3CH,04] first SECT in the MiniFAT chain
895
##FSINDEX _csectMiniFat; // [40H,04] number of SECTs in the MiniFAT chain
896
##SECT _sectDifStart; // [44H,04] first SECT in the DIFAT chain
897
##FSINDEX _csectDif; // [48H,04] number of SECTs in the DIFAT chain
898
##SECT _sectFat[109]; // [4CH,436] the SECTs of first 109 FAT sectors
901
# [PL] header decoding:
902
# '<' indicates little-endian byte ordering for Intel (cf. struct module help)
903
fmt_header = '<8s16sHHHHHHLLLLLLLLLL'
904
header_size = struct.calcsize(fmt_header)
905
debug( "fmt_header size = %d, +FAT = %d" % (header_size, header_size + 109*4) )
906
header1 = header[:header_size]
914
self.MiniSectorShift,
915
self.Reserved, self.Reserved1,
920
self.MiniSectorCutoff,
925
) = struct.unpack(fmt_header, header1)
926
debug( struct.unpack(fmt_header, header1))
928
if self.Sig != '\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1':
929
# OLE signature should always be present
930
self._raise_defect(DEFECT_FATAL, "incorrect OLE signature")
931
if self.clsid != '\x00'*16:
932
# according to AAF specs, CLSID should always be zero
933
self._raise_defect(DEFECT_INCORRECT, "incorrect CLSID in OLE header")
934
debug( "MinorVersion = %d" % self.MinorVersion )
935
debug( "DllVersion = %d" % self.DllVersion )
936
if self.DllVersion not in [3, 4]:
937
# version 3: usual format, 512 bytes per sector
938
# version 4: large format, 4K per sector
939
self._raise_defect(DEFECT_INCORRECT, "incorrect DllVersion in OLE header")
940
debug( "ByteOrder = %X" % self.ByteOrder )
941
if self.ByteOrder != 0xFFFE:
942
# For now only common little-endian documents are handled correctly
943
self._raise_defect(DEFECT_FATAL, "incorrect ByteOrder in OLE header")
944
# TODO: add big-endian support for documents created on Mac ?
945
self.SectorSize = 2**self.SectorShift
946
debug( "SectorSize = %d" % self.SectorSize )
947
if self.SectorSize not in [512, 4096]:
948
self._raise_defect(DEFECT_INCORRECT, "incorrect SectorSize in OLE header")
949
if (self.DllVersion==3 and self.SectorSize!=512) \
950
or (self.DllVersion==4 and self.SectorSize!=4096):
951
self._raise_defect(DEFECT_INCORRECT, "SectorSize does not match DllVersion in OLE header")
952
self.MiniSectorSize = 2**self.MiniSectorShift
953
debug( "MiniSectorSize = %d" % self.MiniSectorSize )
954
if self.MiniSectorSize not in [64]:
955
self._raise_defect(DEFECT_INCORRECT, "incorrect MiniSectorSize in OLE header")
956
if self.Reserved != 0 or self.Reserved1 != 0:
957
self._raise_defect(DEFECT_INCORRECT, "incorrect OLE header (non-null reserved bytes)")
958
debug( "csectDir = %d" % self.csectDir )
959
if self.SectorSize==512 and self.csectDir!=0:
960
self._raise_defect(DEFECT_INCORRECT, "incorrect csectDir in OLE header")
961
debug( "csectFat = %d" % self.csectFat )
962
debug( "sectDirStart = %X" % self.sectDirStart )
963
debug( "signature = %d" % self.signature )
964
# Signature should be zero, BUT some implementations do not follow this
965
# rule => only a potential defect:
966
if self.signature != 0:
967
self._raise_defect(DEFECT_POTENTIAL, "incorrect OLE header (signature>0)")
968
debug( "MiniSectorCutoff = %d" % self.MiniSectorCutoff )
969
debug( "MiniFatStart = %X" % self.MiniFatStart )
970
debug( "csectMiniFat = %d" % self.csectMiniFat )
971
debug( "sectDifStart = %X" % self.sectDifStart )
972
debug( "csectDif = %d" % self.csectDif )
974
# calculate the number of sectors in the file
975
# (-1 because header doesn't count)
976
filesize = os.path.getsize(filename)
977
self.nb_sect = ( (filesize + self.SectorSize-1) / self.SectorSize) - 1
978
debug( "Number of sectors in the file: %d" % self.nb_sect )
980
# file clsid (probably never used, so we don't store it)
981
clsid = _clsid(header[8:24])
982
self.sectorsize = self.SectorSize #1 << i16(header, 30)
983
self.minisectorsize = self.MiniSectorSize #1 << i16(header, 32)
984
self.minisectorcutoff = self.MiniSectorCutoff # i32(header, 56)
986
# check known streams for duplicate references (these are always in FAT,
988
self._check_duplicate_stream(self.sectDirStart)
989
# check MiniFAT only if it is not empty:
990
if self.csectMiniFat:
991
self._check_duplicate_stream(self.MiniFatStart)
992
# check DIFAT only if it is not empty:
994
self._check_duplicate_stream(self.sectDifStart)
996
# Load file allocation tables
998
# Load directory. This sets both the direntries list (ordered by sid)
999
# and the root (ordered by hierarchy) members.
1000
self.loaddirectory(self.sectDirStart)#i32(header, 48))
1001
self.ministream = None
1002
self.minifatsect = self.MiniFatStart #i32(header, 60)
1005
def _check_duplicate_stream(self, first_sect, minifat=False):
    """
    Checks if a stream has not been already referenced elsewhere.
    This method should only be called once for each known stream, and only
    if stream size is not null.

    first_sect: index of first sector of the stream in FAT
    minifat: if True, stream is located in the MiniFAT, else in the FAT
    raise: IOError (through _raise_defect, level DEFECT_INCORRECT) when the
        same first sector was already recorded for that table.
    """
    # Pick the bookkeeping list that matches the table the stream lives in.
    if minifat:
        debug('_check_duplicate_stream: sect=%d in MiniFAT' % first_sect)
        used_streams = self._used_streams_minifat
    else:
        debug('_check_duplicate_stream: sect=%d in FAT' % first_sect)
        # some values can be safely ignored (not a real stream):
        if first_sect in (DIFSECT,FATSECT,ENDOFCHAIN,FREESECT):
            return
        used_streams = self._used_streams_fat
    #TODO: would it be more efficient using a dict or hash values, instead
    #      of a list of long ?
    if first_sect in used_streams:
        self._raise_defect(DEFECT_INCORRECT, 'Stream referenced twice')
    else:
        # first reference: remember it so a later duplicate is detected
        used_streams.append(first_sect)
1030
def dumpfat(self, fat, firstindex=0):
1031
"Displays a part of FAT in human-readable form for debugging purpose"
1032
# [PL] added only for debug
1035
# dictionary to convert special FAT values in human-readable strings
1036
VPL=8 # valeurs par ligne (8+1 * 8+1 = 81)
1038
FREESECT: "..free..",
1039
ENDOFCHAIN: "[ END. ]",
1040
FATSECT: "FATSECT ",
1044
nlines = (nbsect+VPL-1)/VPL
1046
for i in range(VPL):
1049
for l in range(nlines):
1051
print ("%8X:" % (firstindex+index)),
1052
for i in range(index, index+VPL):
1056
if sect in fatnames:
1057
nom = fatnames[sect]
1067
def dumpsect(self, sector, firstindex=0):
1068
"Displays a sector in a human-readable form, for debugging purpose."
1071
VPL=8 # number of values per line (8+1 * 8+1 = 81)
1072
tab = array.array(UINT32, sector)
1074
nlines = (nbsect+VPL-1)/VPL
1076
for i in range(VPL):
1079
for l in range(nlines):
1081
print ("%8X:" % (firstindex+index)),
1082
for i in range(index, index+VPL):
1090
def sect2array(self, sect):
    """
    convert a sector to an array of 32 bits unsigned integers,
    swapping bytes on big endian CPUs such as PowerPC (old Macs)

    sect: raw sector data (string of bytes)
    return: array.array of type UINT32 holding the sector values
    """
    a = array.array(UINT32, sect)
    # The data is parsed in native byte order; if CPU is big endian,
    # swap bytes so that values read the same as on little endian CPUs:
    if sys.byteorder == 'big':
        a.byteswap()
    return a
1102
def loadfat_sect(self, sect):
1104
Adds the indexes of the given sector to the FAT
1105
sect: string containing the first FAT sector, or array of long integers
1106
return: index of last FAT sector.
1108
# a FAT sector is an array of ulong integers.
1109
if isinstance(sect, array.array):
1110
# if sect is already an array it is directly used
1113
# if it's a raw sector, it is parsed in an array
1114
fat1 = self.sect2array(sect)
1116
# The FAT is a sector chain starting at the first index of itself.
1118
#print "isect = %X" % isect
1119
if isect == ENDOFCHAIN or isect == FREESECT:
1120
# the end of the sector chain has been reached
1122
# read the FAT sector
1123
s = self.getsect(isect)
1124
# parse it as an array of 32 bits integers, and add it to the
1126
nextfat = self.sect2array(s)
1127
self.fat = self.fat + nextfat
1131
def loadfat(self, header):
    """
    Load the FAT table into self.fat.

    header: the 512 header bytes of the file; bytes 76 to 511 are read as
        the indexes of the first 109 FAT sectors.
    raise: IOError if the DIFAT is inconsistent, either directly or through
        _raise_defect depending on the defect level of this object.
    """
    # The header contains the sector numbers
    # for the first 109 FAT sectors. Additional sectors are
    # described by DIF blocks
    sect = header[76:512]
    debug( "len(sect)=%d, so %d integers" % (len(sect), len(sect)//4) )

    # [PL] FAT is an array of 32 bits unsigned ints, it's more effective
    # to use an array than a list in Python.
    # It's initialized as empty first:
    self.fat = array.array(UINT32)
    self.loadfat_sect(sect)
    if self.csectDif != 0:
        # [PL] There's a DIFAT because file is larger than 6.8MB
        # some checks just in case:
        if self.csectFat <= 109:
            # there must be at least 109 blocks in header and the rest in
            # DIFAT, so number of sectors must be >109.
            self._raise_defect(DEFECT_INCORRECT, 'incorrect DIFAT, not enough sectors')
        if self.sectDifStart >= self.nb_sect:
            # initial DIFAT block index must be valid
            self._raise_defect(DEFECT_FATAL, 'incorrect DIFAT, first index out of range')
        debug( "DIFAT analysis..." )
        # We compute the necessary number of DIFAT sectors.
        # A DIFAT sector is an array of 32 bits pointers whose last item is
        # the index of the next DIFAT sector, so the number of FAT pointers
        # it holds depends on the sector size
        # (512 bytes: 127 pointers + 1 towards next DIFAT sector).
        difat_pointers = self.SectorSize // 4 - 1
        nb_difat = (self.csectFat - 109 + difat_pointers - 1) // difat_pointers
        debug( "nb_difat = %d" % nb_difat )
        if self.csectDif != nb_difat:
            raise IOError('incorrect DIFAT')
        isect_difat = self.sectDifStart
        for i in range(nb_difat):
            debug( "DIFAT block %d, sector %X" % (i, isect_difat) )
            #TODO: check if corresponding FAT SID = DIFSECT
            sector_difat = self.getsect(isect_difat)
            difat = self.sect2array(sector_difat)
            self.dumpsect(sector_difat)
            self.loadfat_sect(difat[:difat_pointers])
            # last DIFAT pointer is next DIFAT sector:
            isect_difat = difat[difat_pointers]
            debug( "next DIFAT sector: %X" % isect_difat )
        if isect_difat not in [ENDOFCHAIN, FREESECT]:
            # last DIFAT pointer value must be ENDOFCHAIN or FREESECT
            raise IOError('incorrect end of DIFAT')
    # since FAT is read from fixed-size sectors, it may contain more values
    # than the actual number of sectors in the file.
    # Keep only the relevant sector indexes:
    if len(self.fat) > self.nb_sect:
        debug('len(fat)=%d, shrunk to nb_sect=%d' % (len(self.fat), self.nb_sect))
        self.fat = self.fat[:self.nb_sect]
    self.dumpfat(self.fat)
1202
def loadminifat(self):
    """
    Load the MiniFAT table into self.minifat.

    raise: IOError (through _raise_defect, level DEFECT_INCORRECT) when the
        MiniStream needs more entries than the MiniFAT stream can hold.
    """
    # MiniFAT is stored in a standard sub-stream whose first sector is
    # self.minifatsect.
    # NOTE: there are two sizes to take into account for this stream:
    # 1) Stream size is calculated according to the number of sectors
    #    declared in the OLE header. This allocated stream may be more than
    #    needed to store the actual sector indexes.
    # (self.csectMiniFat is the number of sectors of size self.SectorSize)
    stream_size = self.csectMiniFat * self.SectorSize
    # 2) Actually used size is calculated by dividing the MiniStream size
    #    (given by root entry size) by the size of mini sectors, *4 for
    #    one 32 bits index per mini sector.
    # Floor division keeps the count an integer, which the slice below needs.
    nb_minisectors = (self.root.size + self.MiniSectorSize-1) // self.MiniSectorSize
    used_size = nb_minisectors * 4
    debug('loadminifat(): minifatsect=%d, nb FAT sectors=%d, used_size=%d, stream_size=%d, nb MiniSectors=%d' %
        (self.minifatsect, self.csectMiniFat, used_size, stream_size, nb_minisectors))
    if used_size > stream_size:
        # This is not really a problem, but may indicate a wrong implementation:
        self._raise_defect(DEFECT_INCORRECT, 'OLE MiniStream is larger than MiniFAT')
    # In any case, first read stream_size:
    s = self._open(self.minifatsect, stream_size, force_FAT=True).read()
    self.minifat = self.sect2array(s)
    # Then shrink the array to used size, to avoid indexes out of MiniStream:
    debug('MiniFAT shrunk from %d to %d sectors' % (len(self.minifat), nb_minisectors))
    self.minifat = self.minifat[:nb_minisectors]
    debug('loadminifat(): len=%d' % len(self.minifat))
    self.dumpfat(self.minifat)
1236
def getsect(self, sect):
    """
    Read given sector from file on disk.

    sect: sector index
    returns a string containing the sector data.
    raise: IOError (through _raise_defect, level DEFECT_FATAL) when the
        sector cannot be reached or is shorter than a full sector.
    """
    # [PL] the original code (512 + sectorsize * sect) was wrong when
    # sectors are 4KB instead of 512 bytes: the offset of a sector is
    # computed from the sector size only.
    #[PL]: added safety checks:
    try:
        self.fp.seek(self.sectorsize * (sect+1))
    except (IOError, OSError, ValueError, OverflowError):
        # only a failed seek is a defect; a successful one falls through
        debug('getsect(): sect=%X, seek=%d, filesize=%d' %
            (sect, self.sectorsize*(sect+1), os.path.getsize(self.fp.name)))
        self._raise_defect(DEFECT_FATAL, 'OLE sector index out of range')
    sector = self.fp.read(self.sectorsize)
    if len(sector) != self.sectorsize:
        debug('getsect(): sect=%X, read=%d, sectorsize=%d' %
            (sect, len(sector), self.sectorsize))
        self._raise_defect(DEFECT_FATAL, 'incomplete OLE sector')
    return sector
1261
def loaddirectory(self, sect):
    """
    Load the directory.

    sect: sector index of directory stream.

    Sets self.directory_fp (the open directory stream), self.direntries
    (one slot per possible entry, filled when entries are loaded) and
    self.root (entry 0).
    """
    # The directory is stored in a standard
    # substream, independent of its size.

    # open directory stream as a read-only file:
    # (stream size is not known in advance)
    self.directory_fp = self._open(sect)

    #[PL] to detect malformed documents and avoid DoS attacks, the maximum
    # number of directory entries can be calculated
    # (128 bytes are read per entry; floor division keeps it an integer):
    max_entries = self.directory_fp.size // 128
    debug('loaddirectory: size=%d, max_entries=%d' %
        (self.directory_fp.size, max_entries))

    # Create list of directory entries
    # We start with a list of "None" object
    self.direntries = [None] * max_entries
    # Loading entry 0 stores it in self.direntries[0]:
    self._load_direntry(0)
    # Root entry is the first entry:
    self.root = self.direntries[0]
    # read and build all storage trees, starting from the root:
    self.root.build_storage_tree()
1296
def _load_direntry (self, sid):
    """
    Read one 128-byte directory entry and register it in self.direntries.
    Meant to be called a single time per storage/stream while the
    directory is being loaded.

    sid: index of storage/stream in the directory.
    return: a _OleDirectoryEntry object
    raise: IOError (through _raise_defect) if sid is out of range, or if
        the entry has already been referenced.
    """
    # reject indexes that fall outside the directory:
    if not (0 <= sid < len(self.direntries)):
        self._raise_defect(DEFECT_FATAL, "OLE directory index out of range")
    # an occupied slot means the entry was referenced before:
    if self.direntries[sid] is not None:
        self._raise_defect(DEFECT_INCORRECT,
            "double reference for OLE stream/storage")
        # the defect was tolerated: hand back the entry already loaded
        return self.direntries[sid]
    # first reference: read the raw entry and wrap it
    directory = self.directory_fp
    directory.seek(sid * 128)
    raw_entry = directory.read(128)
    loaded = _OleDirectoryEntry(raw_entry, sid, self)
    self.direntries[sid] = loaded
    return self.direntries[sid]
1320
def dumpdirectory(self):
1322
Dump directory (for debugging only)
1327
def _open(self, start, size = 0x7FFFFFFF, force_FAT=False):
    """
    Open a stream, either in FAT or MiniFAT according to its size.

    start: index of first sector
    size: size of stream (or nothing if size is unknown)
    force_FAT: if False (default), stream will be opened in FAT or MiniFAT
        according to size. If True, it will always be opened in FAT.
    return: a _OleStream object
    """
    debug('OleFileIO.open(): sect=%d, size=%d, force_FAT=%s' %
        (start, size, str(force_FAT)))
    # stream size is compared to the MiniSectorCutoff threshold:
    if size < self.minisectorcutoff and not force_FAT:
        if not self.ministream:
            # load MiniFAT if it wasn't already done:
            # (self.minifat is used below and is only set by loadminifat)
            self.loadminifat()
            # The first sector index and the size of the MiniStream are
            # taken from the root directory entry:
            size_ministream = self.root.size
            debug('Opening MiniStream: sect=%d, size=%d' %
                (self.root.isectStart, size_ministream))
            self.ministream = self._open(self.root.isectStart,
                size_ministream, force_FAT=True)
        return _OleStream(self.ministream, start, size, 0,
                          self.minisectorsize, self.minifat)
    # Standard stream, read from the file itself. The data starts after the
    # header, which takes one full sector: the offset is the sector size,
    # as in getsect(), rather than a fixed 512 bytes.
    # NOTE(review): assumes _OleStream adds this offset to
    # sectorsize * sector index -- confirm against _OleStream.
    return _OleStream(self.fp, start, size, self.sectorsize,
                      self.sectorsize, self.fat)
1360
def _list(self, files, prefix, node):
1363
files: list of files to fill in
1364
prefix: current location in storage tree (list of names)
1365
node: current node (_OleDirectoryEntry object)
1367
prefix = prefix + [node.name]
1368
for entry in node.kids:
1370
self._list(files, prefix, entry)
1372
files.append(prefix[1:] + [entry.name])
1377
Return a list of streams stored in this file
1380
self._list(files, [], self.root)
1384
def _find(self, filename):
1386
Returns directory entry of given filename. (openstream helper)
1387
Note: this method is case-insensitive.
1389
filename: path of stream in storage tree (except root entry), either:
1390
- a string using Unix path syntax, for example:
1391
'storage_1/storage_1.2/stream'
1392
- a list of storage filenames, path to the desired stream/storage.
1393
Example: ['storage_1', 'storage_1.2', 'stream']
1394
return: sid of requested filename
1395
raise IOError if file not found
1398
# if filename is a string instead of a list, split it on slashes to
1399
# convert to a list:
1400
if isinstance(filename, basestring):
1401
filename = filename.split('/')
1402
# walk across storage tree, following given path:
1404
for name in filename:
1405
for kid in node.kids:
1406
if kid.name.lower() == name.lower():
1409
raise IOError, "file not found"
1414
def openstream(self, filename):
    """
    Open a stream as a read-only file object (StringIO).

    filename: path of stream in storage tree (except root entry), either:
        - a string using Unix path syntax, for example:
          'storage_1/storage_1.2/stream'
        - a list of storage filenames, path to the desired stream/storage.
          Example: ['storage_1', 'storage_1.2', 'stream']
    return: file object (read-only)
    raise IOError if filename not found, or if this is not a stream.
    """
    sid = self._find(filename)
    entry = self.direntries[sid]
    if entry.entry_type != STGTY_STREAM:
        # call form of raise: accepted by both Python 2 and Python 3
        raise IOError("this file is not a stream")
    return self._open(entry.isectStart, entry.size)
1433
def get_type(self, filename):
    """
    Test if given filename exists as a stream or a storage in the OLE
    container, and return its type.

    filename: path of stream in storage tree. (see openstream for syntax)
    return: False if object does not exist, its entry type (>0) otherwise:
        - STGTY_STREAM: a stream
        - STGTY_STORAGE: a storage
        - STGTY_ROOT: the root entry
    """
    try:
        sid = self._find(filename)
    except IOError:
        # _find() reports a missing path with IOError
        return False
    entry = self.direntries[sid]
    return entry.entry_type
1452
def exists(self, filename):
    """
    Test if given filename exists as a stream or a storage in the OLE
    container.

    filename: path of stream in storage tree. (see openstream for syntax)
    return: True if object exist, else False.
    """
    try:
        self._find(filename)
    except IOError:
        # _find() reports a missing path with IOError
        return False
    return True
1467
def get_size(self, filename):
    """
    Return size of a stream in the OLE container, in bytes.

    filename: path of stream in storage tree (see openstream for syntax)
    return: size in bytes (long integer)
    raise: IOError if file not found, TypeError if this is not a stream.
    """
    sid = self._find(filename)
    entry = self.direntries[sid]
    if entry.entry_type != STGTY_STREAM:
        #TODO: Should it return zero instead of raising an exception ?
        raise TypeError('object is not an OLE stream')
    # same size attribute that openstream() passes to _open()
    return entry.size
1483
def get_rootentry_name(self):
    """
    Give the name stored in the root directory entry, which is usually
    'Root Entry' or 'R'.

    return: the name attribute of self.root
    """
    root_entry = self.root
    return root_entry.name
1491
def getproperties(self, filename):
1493
Return properties described in substream.
1495
filename: path of stream in storage tree (see openstream for syntax)
1496
return: a dictionary of values indexed by id (integer)
1498
fp = self.openstream(filename)
1504
clsid = _clsid(s[8:24])
1508
fmtid = _clsid(s[:16])
1512
s = "****" + fp.read(i32(fp.read(4))-4)
1514
for i in range(i32(s, 4)):
1517
offset = i32(s, 12+i*8)
1518
type = i32(s, offset)
1520
debug ('property id=%d: type=%d offset=%X' % (id, type, offset))
1522
# test for common types first (should perhaps use
1523
# a dictionary instead?)
1526
value = i16(s, offset+4)
1528
value = value - 65536
1529
elif type == VT_UI2:
1530
value = i16(s, offset+4)
1531
elif type in (VT_I4, VT_ERROR):
1532
value = i32(s, offset+4)
1533
elif type == VT_UI4:
1534
value = i32(s, offset+4) # FIXME
1535
elif type in (VT_BSTR, VT_LPSTR):
1536
count = i32(s, offset+4)
1537
value = s[offset+8:offset+8+count-1]
1538
elif type == VT_BLOB:
1539
count = i32(s, offset+4)
1540
value = s[offset+8:offset+8+count]
1541
elif type == VT_LPWSTR:
1542
count = i32(s, offset+4)
1543
value = self._unicode(s[offset+8:offset+8+count*2])
1544
elif type == VT_FILETIME:
1545
value = long(i32(s, offset+4)) + (long(i32(s, offset+8))<<32)
1546
# FIXME: this is a 64-bit int: "number of 100ns periods
1547
# since Jan 1,1601". Should map this to Python time
1548
value = value / 10000000L # seconds
1549
elif type == VT_UI1:
1550
value = ord(s[offset+4])
1551
elif type == VT_CLSID:
1552
value = _clsid(s[offset+4:offset+20])
1554
count = i32(s, offset+4)
1555
value = s[offset+8:offset+8+count]
1557
value = None # everything else yields "None"
1559
# FIXME: add support for VT_VECTOR
1561
#print "%08x" % id, repr(value),
1562
#print "(%s)" % VT[i32(s, offset) & 0xFFF]
1569
# --------------------------------------------------------------------
1570
# This script can be used to dump the directory of any OLE2 structured
1573
if __name__ == "__main__":
1577
# [PL] display quick usage info if launched from command-line
1578
if len(sys.argv) <= 1:
1581
Launched from command line, this script parses OLE files and prints info.
1583
Usage: OleFileIO_PL.py [-d] [-c] <file> [file2 ...]
1586
-d : debug mode (display a lot of debug information, for developers only)
1587
-c : check all streams (for debugging purposes)
1591
check_streams = False
1592
for filename in sys.argv[1:]:
1595
if filename == '-d':
1596
# option to switch debug mode on:
1597
set_debug_mode(True)
1599
if filename == '-c':
1600
# option to switch check streams mode on:
1601
check_streams = True
1604
ole = OleFileIO(filename, raise_defects=DEFECT_INCORRECT)
1609
for streamname in ole.listdir():
1610
if streamname[-1][0] == "\005":
1611
print streamname, ": properties"
1612
props = ole.getproperties(streamname)
1613
props = props.items()
1616
#[PL]: avoid to display too large or binary values:
1617
if isinstance(v, basestring):
1620
# quick and dirty binary check:
1621
for c in (1,2,3,4,5,6,7,11,12,14,15,16,17,18,19,20,
1622
21,22,23,24,25,26,27,28,29,30,31):
1629
# Read all streams to check if there are errors:
1630
print '\nChecking streams...'
1631
for streamname in ole.listdir():
1632
# print name using repr() to convert binary chars to \xNN:
1633
print '-', repr('/'.join(streamname)),'-',
1634
st_type = ole.get_type(streamname)
1635
if st_type == STGTY_STREAM:
1636
print 'size %d' % ole.get_size(streamname)
1637
# just try to read stream in memory:
1638
ole.openstream(streamname)
1640
print 'NOT a stream : type=%d' % st_type
1643
#[PL] Test a few new methods:
1644
root = ole.get_rootentry_name()
1645
print 'Root entry name: "%s"' % root
1646
if ole.exists('worddocument'):
1647
print "This is a Word document."
1648
print "type of stream 'WordDocument':", ole.get_type('worddocument')
1649
print "size :", ole.get_size('worddocument')
1650
if ole.exists('macros/vba'):
1651
print "This document may contain VBA macros."
1652
## except IOError, v:
1653
## print "***", "cannot read", file, "-", v