12
# Module-level NaN held as a C double.
# Uses the canonical lowercase ``np.nan``: the ``np.NaN`` alias was removed in
# NumPy 2.0, whereas ``np.nan`` is available in every NumPy release.
cdef double NaN = <double> np.nan
14
from datetime import datetime as pydatetime
16
from python_dict cimport *
17
from numpy cimport ndarray, npy_float64, npy_int32, npy_int8, npy_float128
21
cdef inline object trycall(object func, object arg):
26
raise Exception('Error calling func on index %s' % arg)
29
cdef inline int int_max(int a, int b):
    # Return the larger of two C ints (``a`` when they are equal).
    if a < b:
        return b
    return a
30
cdef inline int int_min(int a, int b):
    # Return the smaller of two C ints (``a`` when they are equal).
    # The previous body was a verbatim copy of int_max (``a if a >= b else b``)
    # and therefore returned the maximum; the comparison has to be ``<=``.
    return a if a <= b else b
32
def map_indices(ndarray index):
34
Produce a dict mapping the values of the input array to their respective
38
array(['hi', 'there']) --> {'hi' : 0 , 'there' : 1}
40
Better to do this with Cython because of the enormous speed boost.
49
iter = PyArray_IterNew(index)
51
length = PyArray_SIZE(index)
53
for i from 0 <= i < length:
54
idx = PyArray_GETITEM(index, <void *> iter.dataptr)
56
PyArray_ITER_NEXT(iter)
60
def match(ndarray A, ndarray B):
64
Close equivalent of R's match function.
66
For given input index A, find matching locations for values of A in B.
70
array([[ 0. , 0.26929312],
81
array([1, 3, 6, 8, 4, 5, 7, 0, 2, 9])
83
# Now with match we can realign b based on a
85
>>> b[match(a, b[:,0]),:]
86
array([[ 1. , 0.49540359],
102
cdef double *result_data
107
nan = <double> np.NaN
109
bmap = map_indices(B)
111
itera = PyArray_IterNew(A)
112
length = PyArray_SIZE(A)
114
result = <ndarray> np.empty(length, np.float64)
116
result_data = <double *> result.data
118
for i from 0 <= i < length:
119
idx = PyArray_GETITEM(A, <void *> itera.dataptr)
121
result_data[i] = <double> bmap[idx]
125
PyArray_ITER_NEXT(itera)
127
return result.astype(int)
129
def reindex(ndarray index, ndarray arr, dict idxMap):
131
Using the provided new index, a given array, and a mapping of index-value
132
correspondences in the value array, return a new ndarray conforming to
135
This is significantly faster than doing it in pure Python.
138
cdef double *result_data
140
cdef flatiter itera, iteridx
144
nan = <double> np.NaN
146
length = PyArray_SIZE(index)
148
result = <ndarray> np.empty(length, np.float64)
150
result_data = <double *> result.data
152
itera = PyArray_IterNew(arr)
153
iteridx = PyArray_IterNew(index)
155
for i from 0 <= i < length:
156
idx = PyArray_GETITEM(index, <void *> iteridx.dataptr)
157
PyArray_ITER_NEXT(iteridx)
158
if idx not in idxMap:
161
PyArray_ITER_GOTO1D(itera, idxMap[idx])
162
result_data[i] = (<double *>(itera.dataptr))[0]
166
def reindexObj(ndarray index, ndarray arr, dict idxMap):
168
Using the provided new index, a given array, and a mapping of index-value
169
correspondences in the value array, return a new ndarray conforming to
172
This is significantly faster than doing it in pure Python.
176
cdef flatiter itera, iteridx, iterresult
177
cdef object idx, nan, obj
180
length = PyArray_SIZE(index)
182
result = <ndarray> np.empty(length, dtype=np.object_)
184
itera = PyArray_IterNew(arr)
185
iteridx = PyArray_IterNew(index)
186
iterresult = PyArray_IterNew(result)
190
for i from 0 <= i < length:
191
idx = PyArray_GETITEM(index, <void *> iteridx.dataptr)
192
PyArray_ITER_NEXT(iteridx)
194
if idx not in idxMap:
195
PyArray_SETITEM(result, <void *> iterresult.dataptr, nan)
196
PyArray_ITER_NEXT(iterresult)
199
PyArray_ITER_GOTO1D(itera, idxMap[idx])
200
obj = PyArray_GETITEM(arr, <void *> itera.dataptr)
202
res = PyArray_SETITEM(result, <void *> iterresult.dataptr, obj)
203
PyArray_ITER_NEXT(iterresult)
207
@cython.boundscheck(False)
208
def reindexObject(ndarray[object, ndim=1] index,
209
ndarray[object, ndim=1] arr,
212
Using the provided new index, a given array, and a mapping of index-value
213
correspondences in the value array, return a new ndarray conforming to
216
cdef int j, loc, length
217
cdef object idx, value
218
cdef object nan = np.NaN
220
length = index.shape[0]
221
cdef ndarray[object, ndim = 1] result = np.empty(length, dtype=object)
225
for i from 0 <= i < length:
227
if not PyDict_Contains(idxMap, idx):
230
value = arr[idxMap[idx]]
234
cdef tuple _nofill(ndarray oldIndex, ndarray newIndex, dict oldMap, dict newMap):
237
cdef int i, j, length, newLength
239
cdef flatiter iterold
244
fillVec = <ndarray> np.empty(len(newIndex), dtype = np.int32)
245
maskVec = <ndarray> np.zeros(len(newIndex), dtype = np.int8)
247
fillLocs = <int *> fillVec.data
248
mask = <char *> maskVec.data
250
newLength = PyArray_SIZE(fillVec)
252
length = PyArray_SIZE(oldIndex)
253
iterold = PyArray_IterNew(oldIndex)
255
for i from 0 <= i < length:
256
idx = PyArray_GETITEM(oldIndex, <void *> iterold.dataptr)
258
PyArray_ITER_NEXT(iterold)
264
for i from 0 <= i < newLength:
268
return fillVec, maskVec
270
cdef tuple _backfill(ndarray oldIndex, ndarray newIndex, dict oldMap, dict newMap):
272
Backfilling logic for generating fill vector
274
Diagram of what's going on
276
Old New Fill vector Mask
295
cdef int i, j, oldLength, newLength, curLoc
299
fillVec = <ndarray> np.empty(len(newIndex), dtype = np.int32)
300
maskVec = <ndarray> np.zeros(len(newIndex), dtype = np.int8)
305
fillLocs = <int *> fillVec.data
306
mask = <char *> maskVec.data
308
# Create the iterators
309
cdef flatiter iterold, iternew
310
iterold = PyArray_IterNew(oldIndex)
311
iternew = PyArray_IterNew(newIndex)
314
oldLength = PyArray_SIZE(oldIndex)
315
newLength = PyArray_SIZE(newIndex)
318
cdef int newPos, oldPos
319
oldPos = oldLength - 1
320
newPos = newLength - 1
322
# References holding indices
323
cdef object prevOld, curOld
326
# Move to the current position
327
PyArray_ITER_GOTO1D(iternew, newPos)
328
PyArray_ITER_GOTO1D(iterold, oldPos)
330
# Get the current index
331
curOld = PyArray_GETITEM(oldIndex, <void *> iterold.dataptr)
333
# Until we reach a point where we are before the curOld point
334
while PyArray_GETITEM(newIndex, <void *> iternew.dataptr) > curOld:
338
PyArray_ITER_GOTO1D(iternew, newPos)
340
# Get the location in the old index
341
curLoc = oldMap[curOld]
343
# At the beginning of the old index
346
# Make sure we are before the curOld index
347
if PyArray_GETITEM(newIndex, <void *> iternew.dataptr) <= curOld:
348
fillVec[:newPos + 1] = curLoc
349
maskVec[:newPos + 1] = 1
355
# Move one position back
356
PyArray_ITER_GOTO1D(iterold, oldPos - 1)
358
# Get the index there
359
prevOld = PyArray_GETITEM(oldIndex, <void *> iterold.dataptr)
361
# Until we reach the previous index
362
while PyArray_GETITEM(newIndex, <void *> iternew.dataptr) > prevOld:
364
# Set the current fill location
365
fillLocs[newPos] = curLoc
372
# Move the iterator back
373
PyArray_ITER_GOTO1D(iternew, newPos)
375
# Move one period back
378
for i from 0 <= i < newLength:
380
# Fill from some generic location
383
return (fillVec, maskVec)
385
cdef tuple _pad(ndarray oldIndex, ndarray newIndex, dict oldMap, dict newMap):
387
Padding logic for generating fill vector
389
Diagram of what's going on
391
Old New Fill vector Mask
413
cdef int i, j, oldLength, newLength, curLoc, newPos, oldPos
414
cdef flatiter iterold, iternew
415
cdef object nextOld, curOld
418
# Make empty fill vector and mask vector, cast to ndarray
419
fillVec = <ndarray> np.empty(len(newIndex), dtype = np.int32)
420
maskVec = <ndarray> np.zeros(len(newIndex), dtype = np.int8)
422
# Get reference to the arrays inside
423
fillLocs = <int *> fillVec.data
424
mask = <char *> maskVec.data
426
# Create simple ndarray iterators using C API
427
iterold = PyArray_IterNew(oldIndex)
428
iternew = PyArray_IterNew(newIndex)
430
# Length of each index
431
oldLength = PyArray_SIZE(oldIndex)
432
newLength = PyArray_SIZE(newIndex)
436
while newPos < newLength:
437
curOld = PyArray_GETITEM(oldIndex, <void *> iterold.dataptr)
439
# At beginning, keep going until we exceed the
440
# first OLD index in the NEW index
441
while PyArray_GETITEM(newIndex, <void *> iternew.dataptr) < curOld:
443
if newPos > newLength - 1:
445
PyArray_ITER_NEXT(iternew)
447
# We got there, get the current location in the old index
448
curLoc = oldMap[curOld]
450
# We're at the end of the road, need to propagate this value to the end
451
if oldPos == oldLength - 1:
452
if PyArray_GETITEM(newIndex, <void *> iternew.dataptr) >= curOld:
453
fillVec[newPos:] = curLoc
457
# Not at the end, need to go about filling
459
# Get the next index so we know when to stop propagating this value
460
PyArray_ITER_NEXT(iterold)
461
nextOld = PyArray_GETITEM(oldIndex, <void *> iterold.dataptr)
465
# Until we reach the next OLD value in the NEW index
466
while PyArray_GETITEM(newIndex, <void *> iternew.dataptr) < nextOld:
468
# Use this location to fill
469
fillLocs[newPos] = curLoc
471
# Set mask to be 1 so will not be NaN'd
475
# We got to the end of the new index
476
if newPos > newLength - 1:
480
# Advance the pointer
481
PyArray_ITER_NEXT(iternew)
483
# We got to the end of the new index
487
# We already advanced the iterold pointer to the next value,
491
# Places where the mask is 0, fill with an arbitrary value
493
for i from 0 <= i < newLength:
497
return fillVec, maskVec
499
def getFillVec(ndarray oldIndex, ndarray newIndex, dict oldMap, dict newMap,
503
fillVec, maskVec = _nofill(oldIndex, newIndex, oldMap, newMap)
505
fillVec, maskVec = _pad(oldIndex, newIndex, oldMap, newMap)
506
elif kind == 'BACKFILL':
507
fillVec, maskVec = _backfill(oldIndex, newIndex, oldMap, newMap)
509
return fillVec, maskVec.astype(np.bool)
511
def getMergeVec(ndarray values, dict indexMap):
514
cdef int i, j, length
516
cdef flatiter itervals
521
cdef int newLength = len(values)
523
fillVec = <ndarray> np.empty(newLength, dtype = np.int32)
524
maskVec = <ndarray> np.zeros(newLength, dtype = np.int8)
526
fillLocs = <int *> fillVec.data
527
mask = <char *> maskVec.data
529
length = PyArray_SIZE(values)
530
itervals = PyArray_IterNew(values)
532
for i from 0 <= i < length:
533
val = PyArray_GETITEM(values, <void *> itervals.dataptr)
539
PyArray_ITER_NEXT(itervals)
541
for i from 0 <= i < newLength:
545
return fillVec, maskVec.astype(np.bool)
547
# Positive infinity as a C double; _checknull treats a value equal to it as null.
cdef double INF = <double> np.inf
548
# Negative infinity, derived from INF; _checknull also treats it as null.
cdef double NEGINF = -INF
550
cdef inline _checknull(object val):
    """
    Return a truthy value when `val` counts as null: it is None, it compares
    unequal to itself (as NaN does), or it equals positive or negative
    infinity.
    """
    if val is None:
        return True
    return val != val or val == INF or val == NEGINF
553
cdef ndarray _isnullobj(input):
556
cdef ndarray[npy_int8, ndim=1] result
559
length = PyArray_SIZE(input)
561
result = <ndarray> np.zeros(length, dtype=np.int8)
563
iter= PyArray_IterNew(input)
565
for i from 0 <= i < length:
566
val = PyArray_GETITEM(input, <void *> iter.dataptr)
571
PyArray_ITER_NEXT(iter)
577
Replacement for numpy.isnan / -numpy.isfinite which is suitable
578
for use on object arrays.
582
arr: ndarray or object value
586
boolean ndarray or boolean
588
cdef ndarray[npy_int8, ndim=1] result
590
if isinstance(input, np.ndarray):
591
if input.dtype.kind in ('O', 'S'):
592
result = _isnullobj(input)
594
return result.astype(np.bool)
596
return -np.isfinite(input)
598
return _checknull(input)
602
Replacement for numpy.isfinite / -numpy.isnan which is suitable
603
for use on object arrays.
607
arr: ndarray or object value
611
boolean ndarray or boolean
613
if isinstance(input, np.ndarray):
614
return -isnull(input)
616
return not bool(_checknull(input))
618
def reindexNew(ndarray index, ndarray arr, dict idxMap):
620
Using the provided new index, a given array, and a mapping of index-value
621
correspondences in the value array, return a new ndarray conforming to
624
This is significantly faster than doing it in pure Python.
627
cdef double *result_data
629
cdef flatiter itera, iteridx
633
nan = <double> np.NaN
635
length = PyArray_SIZE(index)
637
result = <ndarray> np.empty(length, np.float64)
639
result_data = <double *> result.data
641
itera = PyArray_IterNew(arr)
642
iteridx = PyArray_IterNew(index)
644
for i from 0 <= i < length:
645
idx = PyArray_GETITEM(index, <void *> iteridx.dataptr)
646
PyArray_ITER_NEXT(iteridx)
647
if idx not in idxMap:
650
PyArray_ITER_GOTO1D(itera, idxMap[idx])
651
result_data[i] = (<double *>(itera.dataptr))[0]
655
cdef double __add(double a, double b):
657
cdef double __sub(double a, double b):
659
cdef double __div(double a, double b):
661
cdef double __mul(double a, double b):
663
cdef double __eq(double a, double b):
665
cdef double __ne(double a, double b):
667
cdef double __lt(double a, double b):
669
cdef double __gt(double a, double b):
671
cdef double __pow(double a, double b):
674
# Function-pointer type for a C function that takes two doubles and returns a
# double; _applyFunc accepts one of these as its `func` argument.
ctypedef double (* double_func)(double a, double b)
676
cdef ndarray _applyFunc(double_func func, ndarray index, object ao,
677
object bo, dict aMap, dict bMap):
679
C function taking a function pointer for quickly adding two Series objects.
681
cdef ndarray A, B, result
682
cdef double *result_data
684
cdef flatiter itera, iterb, iteridx
688
# This is EXTREMELY important, otherwise you will get very
690
A = PyArray_ContiguousFromAny(ao, NPY_DOUBLE, 1, 1)
691
B = PyArray_ContiguousFromAny(bo, NPY_DOUBLE, 1, 1)
693
nan = <double> np.NaN
694
length = PyArray_SIZE(index)
696
result = <ndarray> np.empty(length, np.float64)
697
result_data = <double *>result.data
699
itera = <flatiter> PyArray_IterNew(A)
700
iterb = <flatiter> PyArray_IterNew(B)
701
iteridx = PyArray_IterNew(index)
703
for i from 0 <= i < length:
704
idx = PyArray_GETITEM(index, <void *> iteridx.dataptr)
705
PyArray_ITER_NEXT(iteridx)
707
if idx not in aMap or idx not in bMap:
711
result_data[i] = func((<double *>A.data)[aMap[idx]],
712
(<double *>B.data)[bMap[idx]])
716
def combineFunc(object name, ndarray index, object ao,
717
object bo, dict aMap, dict bMap):
719
Combine two series (values and index maps for each passed in) using the
722
if name == "__add__":
723
return _applyFunc(__add, index, ao, bo, aMap, bMap)
724
elif name == "__sub__":
725
return _applyFunc(__sub, index, ao, bo, aMap, bMap)
726
elif name == "__div__":
727
return _applyFunc(__div, index, ao, bo, aMap, bMap)
728
elif name == "__mul__":
729
return _applyFunc(__mul, index, ao, bo, aMap, bMap)
730
elif name == "__eq__":
731
return _applyFunc(__eq, index, ao, bo, aMap, bMap)
732
elif name == "__ne__":
733
return _applyFunc(__ne, index, ao, bo, aMap, bMap)
734
elif name == "__lt__":
735
return _applyFunc(__lt, index, ao, bo, aMap, bMap)
736
elif name == "__gt__":
737
return _applyFunc(__gt, index, ao, bo, aMap, bMap)
738
elif name == "__pow__":
739
return _applyFunc(__pow, index, ao, bo, aMap, bMap)
741
raise Exception('bad funcname requested of Cython code')
743
#-------------------------------------------------------------------------------
744
# Groupby-related functions
746
@cython.boundscheck(False)
747
def arrmap(ndarray[object, ndim=1] index, object func):
748
cdef int length = index.shape[0]
751
cdef ndarray[object, ndim=1] result = np.empty(length, dtype=np.object_)
753
for i from 0 <= i < length:
754
result[i] = func(index[i])
758
@cython.boundscheck(False)
759
def groupby_withnull_old(ndarray[object, ndim = 1] index, object keyfunc):
761
cdef int length = index.shape[0]
763
cdef object curKey, key
766
groups = PyDict_New()
768
if length != index.shape[0]:
769
raise Exception('Dates and values were not the same length!')
771
cdef ndarray[object, ndim=1] mapped_index = arrmap(index, keyfunc)
773
cdef ndarray[npy_int8, ndim=1] null_mask = _isnullobj(mapped_index)
775
bool_mask = null_mask.astype(bool)
777
null_values = np.asarray(index)[bool_mask]
779
if null_values.any():
780
PyDict_SetItem(groups, np.NaN, null_values)
784
key = mapped_index[0]
787
# - Tries to reduce the number of calls to PyDict_GetItem,
791
if not PyDict_Contains(groups, key):
793
PyDict_SetItem(groups, key, members)
802
key = mapped_index[i]
809
members = <list> PyDict_GetItem(groups, key)
813
while null_mask[i] and i < length:
822
key = mapped_index[i]
831
@cython.boundscheck(False)
832
def groupby_withnull(ndarray[object, ndim = 1] index, object keyfunc):
834
cdef int length = index.shape[0]
836
cdef object curKey, key
839
groups = PyDict_New()
841
if length != index.shape[0]:
842
raise Exception('Dates and values were not the same length!')
844
cdef ndarray[object, ndim=1] mapped_index = arrmap(index, keyfunc)
846
cdef ndarray[npy_int8, ndim=1] null_mask = _isnullobj(mapped_index)
848
bool_mask = null_mask.astype(bool)
850
null_values = np.asarray(index)[bool_mask]
852
if null_values.any():
853
PyDict_SetItem(groups, np.NaN, null_values)
857
key = mapped_index[0]
860
# - Tries to reduce the number of calls to PyDict_GetItem,
864
if key not in groups:
866
groups[key] = members
875
key = mapped_index[i]
882
members = <list> groups[key]
886
while null_mask[i] and i < length:
895
key = mapped_index[i]
904
@cython.boundscheck(False)
905
def groupby(ndarray[object, ndim = 1] index, object keyfunc):
907
cdef int length = index.shape[0]
909
cdef object curKey, key
912
groups = PyDict_New()
914
if length != index.shape[0]:
915
raise Exception('Dates and values were not the same length!')
922
# - Tries to reduce the number of calls to PyDict_GetItem, 'lazily' evaluates
925
if not PyDict_Contains(groups, key):
927
PyDict_SetItem(groups, key, members)
932
key = trycall(keyfunc, idx)
939
members = <list> PyDict_GetItem(groups, key)
945
key = trycall(keyfunc, idx)
954
@cython.boundscheck(False)
955
def groupbyfunc(ndarray[object, ndim = 1] index,
956
ndarray[npy_float64, ndim = 1] values,
957
object keyfunc, object applyfunc):
959
Doing this proper in Cython
960
Not sure how much it will really speed things up
963
cdef int length = values.shape[0]
965
cdef object curKey, key
966
cdef list members, grouplist
968
groups = PyDict_New()
970
if length != index.shape[0]:
971
raise Exception('Dates and values were not the same length!')
975
key = trycall(keyfunc, idx)
978
# - Tries to reduce the number of calls to PyDict_GetItem,
982
if not PyDict_Contains(groups, key):
983
members = [values[i]]
984
PyDict_SetItem(groups, key, members)
989
key = trycall(keyfunc, idx)
991
members.append(values[i])
996
members = <list> PyDict_GetItem(groups, key)
997
members.append(values[i])
1002
key = trycall(keyfunc, idx)
1004
members.append(values[i])
1009
grouplist = PyDict_Keys(groups)
1012
length = len(grouplist)
1013
for i from 0 <= i < length:
1015
members = <list> PyDict_GetItem(groups, key)
1016
PyDict_SetItem(groups, key, applyfunc(np.asarray(members)))