30
__version__ = "$Revision: 1.10.2.1 $"
31
# default version for IndexARRAY objects
32
obversion = "1.0" # initial version
34
import types, warnings, sys
35
from EArray import EArray
36
from VLArray import Atom, StringAtom
39
33
import numarray.strings as strings
40
34
import numarray.records as records
42
def calcChunksize(expectedrows, testmode=0):
36
import tables.hdf5Extension as hdf5Extension
37
from tables.Atom import Atom, StringAtom
38
from tables.EArray import EArray
41
__version__ = "$Revision: 1496 $"
43
# default version for IndexARRAY objects
44
obversion = "1.0" # initial version
46
# The minimum row number in a column that can be indexed in tests
50
def calcChunksize(expectedrows, testmode=False):
43
51
"""Calculate the HDF5 chunk size for index and sorted arrays.
45
53
The logic to do that is based purely on experiments playing with
53
expKrows = expectedrows / 1000000. # Multiples of one million
56
if expKrows < 0.0001: # expected rows < 1 hundred
57
nelemslice = 10 # > 1/100th
59
elif expKrows < 0.001: # expected rows < 1 thousand
60
nelemslice = 100 # > 1/10th
62
if expectedrows < minRowIndex*10:
63
nelemslice = minRowIndex
64
chunksize = minRowIndex
65
elif expectedrows < minRowIndex*100:
62
elif expKrows <= 0.01: # expected rows < 10 thousand
63
nelemslice = 1000 # > 1/100th
68
elif expectedrows <= minRowIndex*1000:
66
72
raise ValueError, \
67
"expected rows cannot be larger than 10000 in test mode"
73
"expected rows cannot be larger than %s in test mode" % minRowIndex*1000
74
#print "nelemslice, chunksize:", (nelemslice, chunksize)
68
75
return (nelemslice, chunksize)
77
expKrows = expectedrows / 1000000. # Multiples of one million
70
79
# expKrows < 0.01 is too few for indexing to represent a significant gain
71
80
# (that has been checked experimentally)
72
81
# if expKrows < 0.01: # expected rows < 10 thousand
143
154
extdim -- The enlargeable dimension (always the first, or 0).
144
155
nrows -- The number of slices in index.
145
156
nelemslice -- The number of elements per slice.
146
chunksize -- The HDF5 chunksize for the slice dimension (the 1).
157
chunksize -- The HDF5 chunksize for the slice dimension (the second).
151
def __init__(self, parent = None, atom = None, title = "",
152
filters = None, expectedrows = 1000000,
161
_c_classId = "INDEXARRAY"
164
def __init__(self, parentNode, name,
154
169
"""Create an IndexArray instance.
156
171
Keyword arguments:
158
parent -- The Index class from which this object will hang off
160
173
atom -- An Atom object representing the shape, type and flavor
161
174
of the atomic objects to be saved. Only scalar atoms are
164
177
title -- Sets a TITLE attribute on the array entity.
166
179
filters -- An instance of the Filters class that provides
168
181
during the life of this object.
170
183
expectedrows -- Represents a user estimate of the number
171
of elements to index. If not provided, the default
172
value is 1000000 slices.
184
of elements to index.
175
self._v_parent = parent
176
self._v_new_title = title
177
self._v_new_filters = filters
178
self._v_expectedrows = expectedrows
179
188
self.testmode = testmode
180
self.flavor = "NumArray" # Needed by Array methods
181
# Check if we have to create a new object or read its contents
189
"""Enables test mode for index chunk size calculation."""
190
self.nelemslice = None
191
"""The number of elements per slice."""
192
self.chunksize = None
193
"""The HDF5 chunksize for the slice dimension (the second)."""
195
# Compute the optimum number of slices and chunk sizes
196
# for newly created index arrays.
183
197
if atom is not None:
190
"""Save a fresh array (i.e., not present on HDF5 file)."""
193
assert isinstance(self.atom, Atom), "The object passed to the IndexArray constructor must be a descendent of the Atom class."
194
assert self.atom.shape == 1, "Only scalar columns can be indexed."
195
# Version, type, shape, flavor, byteorder
196
self._v_version = obversion
197
self.type = self.atom.type
198
if self.type == "CharType" or isinstance(self.type, records.Char):
199
self.byteorder = "non-relevant"
201
# Only support for creating objects in system byteorder
202
self.byteorder = sys.byteorder
203
# Compute the optimal chunksize
204
(self.nelemslice, self.chunksize) = \
205
calcChunksize(self._v_expectedrows,
206
testmode=self.testmode)
207
# The next is needed by hdf5Extension.Array._createEArray
208
self._v_chunksize = (1, self.chunksize)
209
self.nrows = 0 # No rows initially
210
self.itemsize = self.atom.itemsize
211
self.rowsize = self.atom.atomsize() * self.nelemslice
198
(self.nelemslice, self.chunksize) = (
199
calcChunksize(expectedrows, testmode))
201
# Index creation is never logged.
202
super(IndexArray, self).__init__(
203
parentNode, name, atom, title, filters, expectedrows, log=False)
207
assert self.atom.shape == (0, 1), "only scalar columns can be indexed"
208
objectId = super(IndexArray, self)._g_create()
209
assert self.extdim == 0, "computed extendable dimension is wrong"
210
assert self.shape == (0, self.nelemslice), "invalid shape"
211
assert self._v_chunksize == (1, self.chunksize), "invalid chunk size"
215
def _calcTuplesAndChunks(self, atom, extdim, expectedrows, compress):
    """Return the fixed ``(_v_maxTuples, _v_chunksize)`` pair.

    The `atom`, `extdim`, `expectedrows` and `compress` arguments are
    accepted but not used: the result depends only on ``self.chunksize``.
    The first element is always 0 and the chunk shape is always
    ``(1, self.chunksize)``.
    """
    return (0, (1, self.chunksize))  # (_v_maxTuples, _v_chunksize)
219
def _createEArray(self, title):
220
# The shape of the index array needs to be fixed before creating it.
212
221
self.shape = (0, self.nelemslice)
216
# Compute the optimal maxTuples
217
# Ten chunks for each buffer would be enough for IndexArray objects
218
# Is this really necessary?
219
self._v_maxTuples = 10
220
# Create the IndexArray
221
self._createEArray("INDEXARRAY", self._v_new_title)
224
"""Get the metadata info for an array in file."""
225
(self.type, self.shape, self.itemsize,
226
self.byteorder, chunksizes) = self._openArray()
227
self.chunksize = chunksizes[1] # Get the second dim
229
assert self.extdim == 0, "extdim != 0: this should never happen!"
230
self.nelemslice = self.shape[1]
231
# Create the atom instance. Not for strings yet!
232
if str(self.type) == "CharType":
233
self.atom = StringAtom(shape=1, length=self.itemsize)
235
self.atom = Atom(dtype=self.type, shape=1)
236
# Compute the rowsize for each element
237
self.rowsize = self.atom.atomsize() * self.nelemslice
238
# nrows in this instance
239
self.nrows = self.shape[0]
240
# Compute the optimal maxTuples
241
# Ten chunks for each buffer would be enough for IndexArray objects
242
# Is this really necessary?
243
self._v_maxTuples = 10
222
super(IndexArray, self)._createEArray(title)
225
def _g_postInitHook(self):
226
# Set ``nelemslice`` and ``chunksize`` when opening an existing node;
227
# otherwise, they are already set.
229
self.nelemslice = self.shape[1]
230
self.chunksize = self._v_chunksize[1]
231
super(IndexArray, self)._g_postInitHook()
245
234
def append(self, arr):
    """Append the object to this (enlargeable) object.

    `arr` is reshaped *in place* to ``(1, arr.shape[0])`` before being
    handed to ``self._append``, so the caller's array keeps the new
    two-dimensional shape after this call returns.
    """
    # Reshape in place to a single row of arr.shape[0] elements.
    arr.shape = (1, arr.shape[0])
    self._append(arr)
250
240
# This is coded in pyrex as well, but the improvement in speed is very
251
241
# little. So, it's better to let _searchBin live here.
252
242
def _searchBin(self, nrow, item):
253
243
nelemslice = self.shape[1]
255
245
item1, item2 = item
256
246
item1done = 0; item2done = 0
257
247
chunksize = self.chunksize # Number of elements/chunksize
329
317
niter = niter + iter
330
318
return (result1, result2, niter)
333
"""Close this object and exit"""
334
# First, flush the buffers:
336
# Delete back references
342
self.__dict__.clear()
344
321
def __str__(self):
345
322
"A compact representation of this class"
346
return "IndexArray(path=%s)" % \
347
(self._v_parent._g_join(self.name))
323
return "IndexArray(path=%s)" % self._v_pathname
349
326
def __repr__(self):
350
327
"""A verbose representation of this class"""
359
byteorder = %r""" % (self, self.type, self.shape, self.itemsize, self.nrows,
360
self.nelemslice, self.chunksize, self.byteorder)
334
byteorder = %r""" % (self, self.atom, self.nrows, self.nelemslice,
335
self.chunksize, self.byteorder)