"""This file implements one version of the elemwise op on the GPU.

The elemwise functions are also used with scalar operations, so it can
happen that ndim is 0, as with all scalar types.
"""
import pygpu_ndarray as gpu_ndarray

_CL_MODE = hasattr(gpu_ndarray, "set_opencl_context")

if _CL_MODE:
    # THIS IS NOT FINISHED
    import pyopencl as cl
    import pyopencl.array as cl_array
    from pyopencl.tools import dtype_to_ctype
    # import pyopencl._mymako as mako
    from pyopencl._cluda import CLUDA_PREAMBLE

    # TODO: use mako to get rid of the %if
    CLUDA_PREAMBLE = CLUDA_PREAMBLE[:455]
    CLUDA_PREAMBLE += """
#define LDIM_0 get_local_size(0)
#define LDIM_1 get_local_size(1)
#define LDIM_2 get_local_size(2)

#define GDIM_0 get_num_groups(0)
#define GDIM_1 get_num_groups(1)
#define GDIM_2 get_num_groups(2)
"""

    # TODO: reuse the same context as the user used to create the memory.
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)
else:
    import pycuda.autoinit
    import pycuda.driver as driver
    from pycuda.compiler import SourceModule
    from pycuda.tools import dtype_to_ctype
    # import pycuda._mymako as mako
    from pycuda._cluda import CLUDA_PREAMBLE

    CLUDA_PREAMBLE += """
#define LDIM_0 blockDim.x
#define LDIM_1 blockDim.y
#define LDIM_2 blockDim.z

#define GDIM_0 gridDim.x
#define GDIM_1 gridDim.y
#define GDIM_2 gridDim.z
"""
import logging
import StringIO

import numpy

import theano
from theano import Apply
from theano import scalar
from theano.tensor import TensorType

_logger_name = 'compyte.gen_elemwise'
_logger = logging.getLogger(_logger_name)
_logger.setLevel(logging.INFO)
_logger.addHandler(logging.StreamHandler())  # TO REMOVE


def warning(*msg):
    _logger.warning(_logger_name + 'WARNING: ' + ' '.join(str(m) for m in msg))


def info(*msg):
    _logger.info(_logger_name + 'INFO: ' + ' '.join(str(m) for m in msg))


def debug(*msg):
    _logger.debug(_logger_name + 'DEBUG: ' + ' '.join(str(m) for m in msg))


if _CL_MODE:
    gpu_ndarray.set_opencl_context(ctx.obj_ptr)

cast_int = numpy.intc
cast_uint = numpy.uintc
def _logical_scalar(x):
    return numpy.all(x.type.broadcastable)


def get_str_list_logical_scalar(inputs, value_str='ii_i%i_value',
                                data_str='ii_i%i_data[0]'):
    l = []
    for ipos, i in enumerate(inputs):
        if _logical_scalar(i):
            l += [value_str % ipos]
        else:
            l += [data_str % ipos]
    return l
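

# For example, with two inputs where only the first is a broadcast scalar,
# get_str_list_logical_scalar returns ['ii_i0_value', 'ii_i1_data[0]']:
# the C expressions that the generated kernels use to read each input.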
class WrapOpenCLFunction(object):
    def __init__(self, fct):
        self.fct = fct

    def _param_wrap(self, p):
        if isinstance(p, MyGpuNdArray):
            p = p.gpu_nd_array
        if isinstance(p, gpu_ndarray.GpuNdArrayObject):
            p = cl.MemoryObject.from_cl_mem_as_int(p.bytes)
        return p

    def set_block_shape(self, *shape):
        self.local_size = shape

    def param_set(self, *param):
        self.param = [self._param_wrap(p) for p in param]

    def launch_grid(self, *global_shape):
        global_size = global_shape + (1,)
        d = {"g_times_l": True}
        return self.fct(queue, global_size, self.local_size,
                        *self.param, **d)
def compile_gpu_code(code, fct_name):
    if _CL_MODE:
        # Compile the gpu function with pyopencl
        prg = cl.Program(ctx, code).build()
        fct2 = getattr(prg, fct_name)
        fct = WrapOpenCLFunction(fct2)
    else:
        # Compile the gpu function with pycuda
        mod = SourceModule(code)
        fct = mod.get_function(fct_name)
    return fct
class ElemwiseAlgo(object):
    verbose = 0  # 1, 2 or 3 for more verbose output.
    cache_version = ('debug', 14, verbose)

    def __init__(self, scalar_op, inplace_pattern={}):
        """
        :param scalar_op: the scalar operation to execute on each element.
        """
        self.scalar_op = scalar_op
        self.inplace_pattern = inplace_pattern

    def task_code(self, inputs, outputs, sio,
                  nodename, iname=None, oname=None):
        if iname is None:
            iname = get_str_list_logical_scalar(inputs)
        if oname is None:
            oname = ['ii_o%i_data[0]' % ipos for ipos, i in enumerate(outputs)]
        print >> sio, self.scalar_op.c_code(
            Apply(self.scalar_op,
                  [scalar.Scalar(dtype=input.type.dtype)()
                   for input in inputs],
                  [scalar.Scalar(dtype=output.type.dtype)()
                   for output in outputs]),
            nodename + '_scalar_',
            iname,
            oname,
            sub=dict(fail='return;'))  # TODO: set a failure code somehow!!!
    def c_src_kernel(self, inputs, outputs, nodename, nd, static="static"):
        sio = StringIO.StringIO()
        #print 'C_SRC_KERNEL', sio.getvalue()

        for ipos, i in enumerate(inputs):
            print >> sio, "// Input ", ipos, str(i.type)
        for ipos, i in enumerate(outputs):
            print >> sio, "// Output ", ipos, str(i.type)
        print >> sio, static, (
            "KERNEL void kernel_%s_%s(unsigned int numEls" % (nodename, nd))
        if nd > 0:
            print >> sio, "\t,", ", ".join("const int dim%i" % i
                                           for i in xrange(nd))
        # declare the input data and strides
        for ipos, i in enumerate(inputs):
            s = ", ".join(["GLOBAL_MEM const %s * i%i_data" % (
                dtype_to_ctype(i.dtype), ipos)] +
                          list("int i%i_str_%i" % (ipos, d)
                               for d in xrange(nd)))
            print >> sio, "\t,", s
        # declare the output data and strides
        for ipos, i in enumerate(outputs):
            s = ", ".join(["GLOBAL_MEM %s * o%i_data" % (
                dtype_to_ctype(i.dtype), ipos)]
                          + list("int o%i_str_%i" % (ipos, d)
                                 for d in xrange(nd)))
            print >> sio, "\t,", s
        #print >> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d)
        #                               for d in xrange(nd))
        #print >> sio, "\t,", "float * o%i_data" % ipos
        print >> sio, "\t)\n{"
        print >> sio, "    const int idx = GID_0 * LDIM_0 + LID_0;"
        print >> sio, "    const int numThreads = LDIM_0 * GDIM_0;"
        # For each input that is a scalar which has been broadcasted
        # to a tensor, load it into a local variable
        for ipos, i in enumerate(inputs):
            if _logical_scalar(i):
                print >> sio, "    const %s ii_i%i_value = i%i_data[0];" % (
                    dtype_to_ctype(i.dtype), ipos, ipos)

        # loop over the elements to be treated by this kernel call
        print >> sio, "    for (int i = idx; i < numEls; i += numThreads) {"
        # calculate the data pointers for all arguments
        print >> sio, "        int ii = i;"
        for ipos, i in enumerate(inputs):
            if not _logical_scalar(i):
                print >> sio, ("        GLOBAL_MEM const "
                               "%s * ii_i%i_data = i%i_data;" % (
                                   dtype_to_ctype(i.dtype), ipos, ipos))
        for ipos, i in enumerate(outputs):
            print >> sio, "        GLOBAL_MEM %s * ii_o%i_data = o%i_data;" % (
                dtype_to_ctype(i.dtype), ipos, ipos)
        for d in xrange(nd - 1, -1, -1):
            if d > 0:
                print >> sio, "        int pos%i = ii %% dim%i;" % (d, d)
                print >> sio, "        ii = ii / dim%i;" % d
            else:
                print >> sio, "        int pos%i = ii;" % d
            for ipos, i in enumerate(inputs):
                if not _logical_scalar(i):
                    print >> sio, ("        ii_i"
                                   "%i_data += pos%i * i%i_str_%i;" % (ipos, d, ipos, d))
            for ipos, i in enumerate(outputs):
                print >> sio, "        ii_o%i_data += pos%i * o%i_str_%i;" % (
                    ipos, d, ipos, d)

        # perform the scalar operation on the input and output references
        #TODO: What if the scalar_op needs support_code??
        self.task_code(inputs, outputs, sio, nodename)
        print >> sio, "    }"

        #indent = " "*(4*d+7)
        #for ipos, i in enumerate(inputs):
            #print >> sio, indent, "const float * i%i" % ipos, '= i%i_data', ''
        print >> sio, "}"

        #print sio.getvalue()
        return sio.getvalue()
    def c_src_kernel_Ccontiguous(self, inputs, outputs,
                                 nodename, static="static"):
        nd = outputs[0].type.ndim
        sio = StringIO.StringIO()
        #print 'C_SRC_KERNEL', sio.getvalue()

        for ipos, i in enumerate(inputs):
            print >> sio, "// Input ", ipos, str(i.type)
        for ipos, i in enumerate(outputs):
            print >> sio, "// Output ", ipos, str(i.type)
        print >> sio, static, ("KERNEL void kernel_%s_Ccontiguous"
                               " (unsigned int numEls" % (nodename))
        for ipos, i in enumerate(inputs):
            print >> sio, "\t,", "GLOBAL_MEM const %s * i%i_data" % (
                dtype_to_ctype(i.dtype), ipos)
        for ipos, i in enumerate(outputs):
            print >> sio, "\t,", "GLOBAL_MEM %s * o%i_data" % (
                dtype_to_ctype(i.dtype), ipos)
        print >> sio, "\t)\n{"
        print >> sio, "    const int idx = GID_0 * LDIM_0 + LID_0;"
        print >> sio, "    const int numThreads = LDIM_0 * GDIM_0;"

        # For each input that is a scalar which has been broadcasted
        # to a tensor, load it into a local variable
        for ipos, i in enumerate(inputs):
            if _logical_scalar(i):
                print >> sio, "    const %s ii_i%i_value = i%i_data[0];" % (
                    dtype_to_ctype(i.dtype), ipos, ipos)

        # loop over the elements to be treated by this kernel call
        print >> sio, "    for (int i = idx; i < numEls; i += numThreads) {"
        # perform the scalar operation on the input and output references
        #TODO: What if the scalar_op needs support_code??
        self.task_code(inputs, outputs, sio, nodename,
                       iname=get_str_list_logical_scalar(
                           inputs, data_str='i%i_data[i]'),
                       oname=['o%i_data[i]' % ipos
                              for ipos, i in enumerate(outputs)])
        print >> sio, "    }"
        print >> sio, "}"

        #print sio.getvalue()
        return sio.getvalue()
    def c_src_callkernel(self, inputs, outputs, nodename):
        #
        # This function serves three main goals:
        #
        # The first is stride unpacking:
        # it accepts input and output arguments as
        #     float *, int, int, int ...
        # pairs, and it constructs a kernel function call where inputs
        # and arguments are named like
        #     float *, int, int, int ...
        #
        # The second is to recognize when any dimensions can be collapsed as
        # being contiguous. That means that we can merge those dimensions
        # with another one for all inputs/outputs and get the same results
        # (confusing... read code)
        #
        # The third is to make a special case for scalar elements. We allow
        # them to be collapsed. In both the c contiguous and the
        # non-contiguous cases, we use registers to lower the number of
        # memory accesses.

        # TODO: make a special case for broadcasting, to store the
        # data in shared memory.
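
        # A sketch of the collapsing in NumPy terms (illustrative only, not
        # used by the generated code): a C-contiguous float32 array of shape
        # (100, 50) has strides (200, 4). Since strides[1] * dims[1] ==
        # 4 * 50 == 200 == strides[0], the two dimensions can be merged and
        # the kernel can treat the data as a flat vector of 5000 elements,
        # saving one index computation per element.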
        nd = outputs[0].type.ndim
        nb_inputs = len(inputs)
        nb_outputs = len(outputs)

        # input_params and output_params go into the function
        # declaration/definition
        input_params = ", ".join("const %s * i%i_data, const int * i%i_str" % (
            dtype_to_ctype(inputs[ipos].dtype), ipos, ipos)
                                 for ipos in xrange(len(inputs)))
        output_params = ", ".join("%s * o%i_data, const int * o%i_str" % (
            dtype_to_ctype(outputs[ipos].dtype),
            ipos, ipos)
                                  for ipos in xrange(len(outputs)))

        # input_args and output_args go into the recursive call.
        input_args = ", ".join("i%i_data, i%i_str" % (ipos, ipos)
                               for ipos in xrange(len(inputs)))
        output_args = ", ".join("o%i_data, o%i_str" % (ipos, ipos)
                                for ipos in xrange(len(outputs)))

        prod_dims = '*'.join(["dims[%i]" % di for di in xrange(nd)] + ['1'])
        sio = StringIO.StringIO()
        print >> sio, """
        static void can_collapse_%(nodename)s(int nd, const int * dims,
                                              const int * strides,
                                              int collapse[])
        {
            //can we collapse dims[i] and dims[i-1]?
            for(int i=nd-1;i>0;i--){
                if(strides[i]*dims[i]==strides[i-1]){
                    //dim i-1 is contiguous with dim i, so they can be merged
                    collapse[i]=1;
                }else collapse[i]=0;
            }
        }

        static int callkernel_%(nodename)s(unsigned int numEls, const int d,
                                           const int * dims,
                                           %(input_params)s,
                                           %(output_params)s)
        {
            numEls = %(prod_dims)s;
        """ % locals()
        if self.verbose:
            print >> sio, """
            std::cerr << "calling kernel_%(nodename)s w numEls" << numEls << " dims"<< d << "\\n";
            """ % locals()
            print >> sio, 'std::cerr << ' + " << ' ' << ".join(['" "'] + list("dims[%i]" % di
                                                                              for di in xrange(nd)) + ["'\\n';"])
        if self.verbose > 1:
            for ipos in xrange(len(inputs)):
                print >> sio, """
                std::cerr << " %(ipos)s data strides" <<
                """ % locals() + " << ' ' << ".join(["i%s_data" % ipos]
                                                    + list("i%s_str[%i]" % (ipos, di)
                                                           for di in xrange(nd))) + ''' << "\\n"; '''
            for ipos in xrange(len(outputs)):
                print >> sio, """
                std::cerr << " %(ipos)s data strides" <<
                """ % locals() + " << ' ' << ".join(["o%s_data" % ipos]
                                                    + list("o%s_str[%i]" % (ipos, di)
                                                           for di in xrange(nd))) + ''' << "\\n"; '''

        # collapse dimensions that are broadcast in all inputs.
        # needs to be done before the contiguous collapse, as it will
        # break it.
        # do the dimensions and the strides
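
        # For instance, with dims == {10, 1, 20} and the middle dimension
        # broadcast in every input and output (stride 0 everywhere), the
        # generated code drops it and continues with dims == {10, 20}.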
        print >> sio, """
        int local_dims[%(nd)s];
        int local_str[%(nb_inputs)s][%(nd)s];
        int local_ostr[%(nb_outputs)s][%(nd)s];
        int nd_collapse = %(nd)s;
        for(int i=0;i<%(nd)s;i++){//init new dim
          local_dims[i]=dims[i];
        }
        """ % locals()
        for ipos in xrange(len(inputs)):
            print >> sio, """
            for(int i=0;i<%(nd)s;i++){//init new strides
              local_str[%(ipos)s][i]=i%(ipos)s_str[i];
            }
            """ % locals()
        for ipos in xrange(len(outputs)):
            print >> sio, """
            for(int i=0;i<%(nd)s;i++){//init new strides
              local_ostr[%(ipos)s][i]=o%(ipos)s_str[i];
            }
            """ % locals()

        if self.verbose > 2:
            print >>sio, 'std::cerr <<"before broadcast collapse\\n";'
            print >>sio, 'std::cerr<< "nd_collapse "<< nd_collapse << "\\n"; '
            print >> sio, 'std::cerr << "local_dims";'
            for d in xrange(nd):
                print >> sio, 'std::cerr << " " << local_dims[%(d)s]; ' % locals()
            print >> sio, 'std::cerr << "\\n";'

            for ipos in xrange(len(inputs)):
                print >> sio, ('std::cerr << " local_str inputs %(ipos)s: " <<' % locals() +
                               ' << " " << '.join(["local_str[%(ipos)s][%(x)s]" % locals()
                                                   for x in range(nd)]) + '<<"\\n";')
            for ipos in xrange(len(outputs)):
                print >> sio, ('std::cerr << " local_ostr inputs %(ipos)s: " <<' % locals() +
                               ' << " " << '.join(["local_ostr[%(ipos)s][%(x)s]" % locals()
                                                   for x in range(nd)]) + '<<"\\n";')

        print >> sio, """
        for(int id=0;id<nd_collapse;id++){

          bool all_broadcast=true;
          for(int input_id=0;input_id<%(nb_inputs)s;input_id++){
            if(local_str[input_id][id]!=0 || local_dims[id]!=1) all_broadcast= false;
          }
          for(int input_id=0;input_id<%(nb_outputs)s;input_id++){
            if(local_ostr[input_id][id]!=0 || local_dims[id]!=1) all_broadcast= false;
          }
          if(all_broadcast){
            for(int j=id+1;j<nd_collapse;j++)//remove dims i from the array
              local_dims[j-1]=local_dims[j];
            for(int input_id=0;input_id<%(nb_inputs)s;input_id++){
              for(int j=id+1;j<nd_collapse;j++){//remove dims i from the array
                local_str[input_id][j-1]=local_str[input_id][j];
              }
            }
            for(int output_id=0;output_id<%(nb_outputs)s;output_id++){
              for(int j=id+1;j<nd_collapse;j++){//remove dims i from the array
                local_ostr[output_id][j-1]=local_ostr[output_id][j];
              }
            }
            nd_collapse--; id--;
          }
        }
        """ % locals()

        if self.verbose > 2:
            print >>sio, 'std::cerr <<"after broadcast collapse\\n";'
            print >>sio, 'std::cerr<< "nd_collapse "<< nd_collapse << "\\n"; '
            print >> sio, 'std::cerr << "local_dims";'
            for d in xrange(nd):
                print >> sio, 'std::cerr << " " << local_dims[%(d)s]; ' % locals()
            print >> sio, 'std::cerr << "\\n";'

            for ipos in xrange(len(inputs)):
                print >> sio, ('std::cerr << " local_str %(ipos)s: " <<' % locals() +
                               ' << " " << '.join(["local_str[%(ipos)s][%(x)s]" % locals()
                                                   for x in range(nd)]) + '<<"\\n";')
            for ipos in xrange(len(outputs)):
                print >> sio, ('std::cerr << " local_ostr %(ipos)s: " <<' % locals() +
                               ' << " " << '.join(["local_ostr[%(ipos)s][%(x)s]" % locals()
                                                   for x in range(nd)]) + '<<"\\n";')

        # collapse contiguous dimensions (ignoring scalars; generic version
        # that can collapse dimensions anywhere: right, left, middle).
        # This is a good idea because we do fewer index calculations on the
        # gpu.
        print >> sio, "int nd_collapse_[%(nd)s] = {" % locals() + ','.join(['1' for x in range(nd)]) + "};"
        for ipos in xrange(len(inputs)):
            if not _logical_scalar(inputs[ipos]):
                print >> sio, """
                int nd_collapse_%(ipos)s[%(nd)s] = {""" % locals() + ','.join(['1' for x in range(nd)]) + "};"
                print >> sio, """
                can_collapse_%(nodename)s(nd_collapse, local_dims, local_str[%(ipos)s], nd_collapse_%(ipos)s);
                for(int i=0;i<nd_collapse;i++){
                  if(nd_collapse_%(ipos)s[i]==0)
                    nd_collapse_[i]=0;
                }
                """ % locals()
                if self.verbose > 1:
                    print >>sio, """
                    std::cerr<< "nd_collapse_%(ipos)s "<<
                    """ % locals()
                    print >>sio, ' << " " << '.join(
                        ["nd_collapse_%(ipos)s[" % locals() + str(i) + "]"
                         for i in range(nd)])
                    print >>sio, '<< "\\n";'

        if self.verbose > 1:
            print >>sio, """
            std::cerr<< "nd_collapse_ "<<
            """ % locals()
            print >>sio, ' << " " << '.join(
                ["nd_collapse_[" % locals() + str(i) + "]"
                 for i in range(nd)])
            print >>sio, '<< "\\n";'

        # update the local stride.
        for ipos in xrange(len(inputs)):
            print >> sio, """
            for(int i=nd_collapse-1;i>0;i--){
              if(nd_collapse_[i]==1){
                local_str[%(ipos)s][i-1]=local_str[%(ipos)s][i];//set new strides
                for(int j=i+1;j<nd_collapse;j++)//remove stride i from the array
                  local_str[%(ipos)s][j-1]=local_str[%(ipos)s][j];
              }
            }
            """ % locals()

        for ipos in xrange(len(outputs)):
            print >> sio, """
            for(int i=nd_collapse-1;i>0;i--){
              if(nd_collapse_[i]==1){
                local_ostr[%(ipos)s][i-1]=local_ostr[%(ipos)s][i];//set new strides
                for(int j=i+1;j<nd_collapse;j++)//remove stride i from the array
                  local_ostr[%(ipos)s][j-1]=local_ostr[%(ipos)s][j];
              }
            }
            """ % locals()

        # update the local dims.
        print >> sio, """
        for(int i=nd_collapse-1;i>0;i--){
          if(nd_collapse_[i]==1){
            local_dims[i-1]*=local_dims[i];//set new dims
            for(int j=i+1;j<nd_collapse;j++)//remove dims i from the array
              local_dims[j-1]=local_dims[j];
          }
        }
        """ % locals()

        # update the new number of dims
        print >> sio, """
        for(int i=1, end=nd_collapse;i<end;i++){
          if(nd_collapse_[i]==1)nd_collapse--;
        }
        """ % locals()

        # if the only remaining dimension is contiguous for every
        # non-scalar input/output, use the c contiguous special case
        print >> sio, """
        if(nd_collapse == 1 """ % locals()
        l = ["local_str[%(ipos)s][nd_collapse-1]==1 " % locals()
             for ipos in range(len(inputs))
             if not _logical_scalar(inputs[ipos])]
        l += ["local_ostr[%(ipos)s][nd_collapse-1]==1 " % locals()
              for ipos in range(len(outputs))
              if not _logical_scalar(outputs[ipos])]
        if len(l) > 0:
            print >> sio, " && ", " && ".join(l)
        print >> sio, """){nd_collapse=0;} """

        if self.verbose:
            print >> sio, 'std::cerr <<"after can_collapse\\n";'
            print >> sio, """std::cerr << "nd_collapse " << nd_collapse << "\\n"; """ % locals()
        if self.verbose > 1:
            for d in xrange(nd):
                print >> sio, 'std::cerr << " " << local_dims[%(d)s]; ' % locals()
            print >> sio, 'std::cerr << "\\n";'

            for ipos in xrange(len(inputs)):
                print >> sio, ('std::cerr << " local_str %(ipos)s: " <<' %
                               locals() + ' << " " << '.join(
                                   ["local_str[%(ipos)s][%(x)s]" % locals()
                                    for x in range(nd)]) + '<<"\\n";')
            for ipos in xrange(len(outputs)):
                print >> sio, ('std::cerr << " local_ostr %(ipos)s: " <<' %
                               locals() + ' << " " << '.join(
                                   ["local_ostr[%(ipos)s][%(x)s]" % locals()
                                    for x in range(nd)]) + '<<"\\n";')
        def launch_Ccontiguous(nodename, scalar_op):
            kernel_call_args = ["numEls"]
            for ipos in xrange(len(inputs)):
                kernel_call_args.append("i%i_data" % ipos)
            for ipos in xrange(len(outputs)):
                kernel_call_args.append("o%i_data" % ipos)
            kernel_call_args = ", ".join(kernel_call_args)
            verb = ""
            if self.verbose:
                verb = 'std::cerr << " Running ccontiguous version\\n";'
            print >> sio, """
                //first use at least a full warp
                int threads_per_block = std::min(numEls, (unsigned int)32); //WARP SIZE

                //next start adding multiprocessors
                int n_blocks = std::min(numEls/threads_per_block + (numEls %% threads_per_block?1:0), (unsigned int)30); // UP TO NUMBER OF MULTIPROCESSORS

                // next start adding more warps per multiprocessor
                if (threads_per_block * n_blocks < numEls)
                    threads_per_block = std::min(numEls/n_blocks, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
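
                // Worked example of this heuristic, assuming numEls == 10000
                // and NUM_VECTOR_OP_THREADS_PER_BLOCK == 256:
                //   threads_per_block = min(10000, 32) = 32
                //   n_blocks = min(312 + 1, 30) = 30
                //   32 * 30 = 960 < 10000, so threads_per_block is raised
                //   to min(10000 / 30, 256) = 256.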
                %(verb)s
                kernel_%(nodename)s_Ccontiguous<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s);

                //std::cerr << "calling callkernel returned\\n";
                cudaError_t err = cudaGetLastError();
                if( cudaSuccess != err)
                {
                    PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n n_blocks=%%i threads_per_block=%%i\\n Call: %%s\\n",
                                 "GpuElemwise %(nodename)s", cudaGetErrorString(err),
                                 n_blocks, threads_per_block,
                                 "kernel_%(nodename)s_Ccontiguous<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s)");
                    return -1;
                }
                return 0;
                """ % locals()
        def launch_General(nodename, scalar_op, force_nd):
            # kernel_call_args are used to invoke the cuda kernel
            local = "local_"
            kernel_call_args = ["numEls"]
            kernel_call_args.extend(local + "dims[%i]" % di
                                    for di in xrange(force_nd))
            for ipos in xrange(len(inputs)):
                kernel_call_args += ["i%i_data" % ipos] + list(
                    local + "str[%i][%i]" % (ipos, di)
                    for di in xrange(force_nd))
                #strides = ", ".join("i%i_str[%i]"%(ipos, di) for di in xrange(force_nd))
                #kernel_call_args.append( "%s, i%i_data" % (strides, ipos))
            for ipos in xrange(len(outputs)):
                kernel_call_args += ["o%i_data" % ipos] + list(
                    local + "ostr[%i][%i]" % (ipos, di)
                    for di in xrange(force_nd))
                #strides = ", ".join("o%i_str[%i]"%(ipos, di) for di in xrange(force_nd))
                #kernel_call_args.append( "%s, o%i_data" % (strides, ipos))
            if self.verbose:
                print >> sio, """
                    std::cerr << " Running general version with %(force_nd)s dims\\n";
                    """ % locals()
                print >> sio, "std::cerr << " + ' << " " << '.join(
                    kernel_call_args) + ' << "\\n";'
                #std::cerr << numEls << dims[0] << i0_data, i0_str[0] << o0_data, o0_str[0]\n;

            kernel_call_args = ", ".join(kernel_call_args)

            print >> sio, """
                //first use at least a full warp
                int threads_per_block = std::min(numEls, (unsigned int)32); //WARP SIZE

                //next start adding multiprocessors
                int n_blocks = std::min(numEls/threads_per_block + (numEls %% threads_per_block?1:0), (unsigned int)30); // UP TO NUMBER OF MULTIPROCESSORS

                // next start adding more warps per multiprocessor
                if (threads_per_block * n_blocks < numEls)
                    threads_per_block = std::min(numEls/n_blocks, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);

                kernel_%(nodename)s_%(force_nd)s<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s);

                cudaError_t err = cudaGetLastError();
                if( cudaSuccess != err)
                {
                    PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n n_blocks=%%i threads_per_block=%%i\\n Call: %%s\\n",
                                 "GpuElemwise %(nodename)s", cudaGetErrorString(err),
                                 n_blocks, threads_per_block,
                                 "kernel_%(nodename)s_%(force_nd)s<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s)");
                    return -1;
                }
                return 0;
                """ % locals()
        print >> sio, "if(numEls==0) return 0;"
        print >> sio, "switch (nd_collapse==0?0:min(%(nd)s,nd_collapse)) {" % locals()
        print >> sio, "case 0: {"
        launch_Ccontiguous(nodename, scalar_op)
        print >> sio, "        } break;"
        for i in range(1, nd + 1):
            print >> sio, "case " + str(i) + ": {"
            launch_General(nodename, scalar_op, i)
            print >> sio, "        } break;"

        print >> sio, "}"  # end case
        print >> sio, "return -2;"  # should not get to this point
        print >> sio, "}"  # end fct

        #N.B. cudaGetLastError is called by c_code
        return sio.getvalue()
    def c_support_code_apply(self, inputs, outputs, nodename):
        nd = outputs[0].type.ndim
        return "".join(
            [self.c_src_kernel(inputs, outputs, nodename, x)
             for x in range(1, nd + 1)] +
            [self.c_src_kernel_Ccontiguous(inputs, outputs, nodename),
             self.c_src_callkernel(inputs, outputs, nodename),
             ])
    def c_code(self, ninputs, noutputs, nodename, inputs, outputs, sub):
        nd = noutputs[0].type.ndim
        fail = sub['fail']
        sio = StringIO.StringIO()
        opname = str(self.scalar_op)
        initial_dims = ','.join('1' for i in xrange(nd))
        if 1 or self.scalar_op == scalar.pow:
            print >> sio, """
        //std::cerr << "C_CODE %(opname)s START\\n";
        //standard elemwise size checks
            """ % locals()
        print >> sio, """
        int dims[%(nd)s] = {%(initial_dims)s};
        """ % locals()

        # emit the broadcast arrays for all inputs
        emitted_inames = {}
        for id, iname in enumerate(inputs):
            if iname in emitted_inames:
                assert emitted_inames[iname] is ninputs[id]
                continue
            broadcasts = ', '.join(map(str, map(int,
                                                ninputs[id].broadcastable)))
            nd = ninputs[id].ndim
            print >> sio, """
        int broadcasts_%(iname)s[%(nd)s] = {%(broadcasts)s};
        """ % locals()
            emitted_inames[iname] = ninputs[id]
        # check that all inputs have valid dimensions
        emitted_inames = {}
        for id, iname in enumerate(inputs):
            if iname in emitted_inames:
                continue
            print >> sio, """
        //std::cerr << "C_CODE %(opname)s checking input %(iname)s\\n";
        if (%(nd)s != %(iname)s->nd)
        {
            PyErr_Format(PyExc_TypeError, "need %(nd)s dims, not %%i", %(iname)s->nd);
            %(fail)s;
        }
        for (int i = 0; i< %(nd)s; ++i)
        {
            dims[i] = (dims[i] == 1) ? CudaNdarray_HOST_DIMS(%(iname)s)[i] : dims[i];
            if ((!(broadcasts_%(iname)s[i] && CudaNdarray_HOST_DIMS(%(iname)s)[i] == 1)) && (dims[i] != CudaNdarray_HOST_DIMS(%(iname)s)[i]))
            {
                //std::cerr << "C_CODE %(opname)s checking input %(iname)s failed\\n";
                PyErr_Format(PyExc_ValueError, "GpuElemwise. Input dimension mis-match. One of your inputs has shape[%%i] == %%i, but the output's size on that axis is %%i.",
                             i,
                             CudaNdarray_HOST_DIMS(%(iname)s)[i],
                             dims[i]);
                %(fail)s;
            }
        }
        """ % locals()
            emitted_inames[iname] = True
        # check that all outputs have valid dimensions
        for idx, oname in enumerate(outputs):
            if idx not in self.inplace_pattern.keys():
                print >> sio, """
        for (int i = 0; (i< %(nd)s) && (%(oname)s); ++i) {
            if (dims[i] != CudaNdarray_HOST_DIMS(%(oname)s)[i])
            {
                Py_DECREF(%(oname)s);
                %(oname)s = NULL;
            }
        }
        if (NULL == %(oname)s)
        {
            %(oname)s = (CudaNdarray*)CudaNdarray_New();
            if (!%(oname)s)
            {
                //error string already set
                %(fail)s;
            }
            if (CudaNdarray_alloc_contiguous(%(oname)s, %(nd)s, dims))
            {
                //error string already set
                Py_DECREF(%(oname)s);
                %(oname)s = NULL;
                %(fail)s;
            }
        }
        //std::cerr << "ELEMWISE NEW %(oname)s nd" << %(oname)s->nd << "\\n";
        //std::cerr << "ELEMWISE NEW %(oname)s data" << %(oname)s->devdata << "\\n";
        """ % locals()
            else:
                input_idx = self.inplace_pattern[idx]
                iname = inputs[input_idx]
                print >> sio, """
        Py_XDECREF(%(oname)s);
        %(oname)s = %(iname)s;
        Py_INCREF(%(oname)s);
        for (int i = 0; (i< %(nd)s) && (%(oname)s); ++i) {
            if (dims[i] != CudaNdarray_HOST_DIMS(%(oname)s)[i])
            {
                Py_DECREF(%(oname)s);
                %(oname)s = NULL;
                %(fail)s;
            }
        }
        //std::cerr << "ELEMWISE NEW %(oname)s nd" << %(oname)s->nd << "\\n";
        //std::cerr << "ELEMWISE NEW %(oname)s data" << %(oname)s->devdata << "\\n";
        """ % locals()
        print >> sio, """
        {
            //new block so that failure gotos don't skip over variable initialization
            //std::cerr << "calling callkernel\\n";
            if (callkernel_%(nodename)s(1, 0, dims
        """ % locals()
        for iname in inputs:
            print >> sio, """
                        , CudaNdarray_DEV_DATA(%(iname)s), CudaNdarray_HOST_STRIDES(%(iname)s)
            """ % locals()
        for oname in outputs:
            print >> sio, """
                        , CudaNdarray_DEV_DATA(%(oname)s), CudaNdarray_HOST_STRIDES(%(oname)s)
            """ % locals()
        print >> sio, """
                        ))
            {
                // error
        """ % locals()
        for oname in outputs:
            print >> sio, """
                Py_DECREF(%(oname)s);
                %(oname)s = NULL;
            """ % locals()
        print >> sio, """
                %(fail)s;
            }
        }
        //std::cerr << "C_CODE %(opname)s END\\n";
        """ % locals()
        #print sio.getvalue()
        return sio.getvalue()
    def c_support_code(self):
        return """
        #define INTDIV_POW2(a, b) (a >> b)
        #define INTMOD_POW2(a, b) (a & ((1<<b)-1))
        """
def dummy_holder_for_code_not_used():

    def c_src_kernel_tiling(self, inputs, outputs, nodename):
        """ The kernel applies to problems with <= 5 dimensions """

        # The kernel is intended to be structured roughly like this:
        """
        static __global__ void kernel()
        {
            for (int v = blockIdx.x; v < dim0; v += gridDim.x)
            {
                for (int w = blockIdx.y; w < dim1; w += gridDim.y)
                {
                    for (int x = threadIdx.x; x < dim2; x += blockDim.x)
                    {
                        for (int y = threadIdx.y; y < dim3; y += blockDim.y)
                        {
                            for (int z = threadIdx.z; z < dim4; z += blockDim.z)
                            {
                                out[v * out_stride[0] + ...] = f(in1[...], in2[...])
                            }
                        }
                    }
                }
            }
        }
        """
        nd = outputs[0].type.ndim
        sio = StringIO.StringIO()
        #print 'C_SRC_KERNEL', sio.getvalue()

        # print some leading comments to make the code easier to read
        for ipos, i in enumerate(inputs):
            print >> sio, "// Input ", ipos, str(i.type)
        for ipos, i in enumerate(outputs):
            print >> sio, "// Output ", ipos, str(i.type)
        print >> sio, """static __global__ void kernel_%s_%s(
        unsigned int numEls""" % (
            nodename,
            'tiling%i' % nd)
        print >> sio, "\t,", ", ".join("const int dim%i" % i
                                       for i in xrange(nd))
        # declare the input data and strides
        for ipos, i in enumerate(inputs):
            s = ", ".join(["const float * i%i_data" % ipos] + list(
                "int i%i_str_%i" % (ipos, d) for d in xrange(nd)))
            print >> sio, "\t,", s
        # declare the output data and strides
        for ipos, i in enumerate(outputs):
            s = ", ".join(["float * o%i_data" % ipos] + list(
                "int o%i_str_%i" % (ipos, d) for d in xrange(nd)))
            print >> sio, "\t,", s
        #print >> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d) for d in xrange(nd))
        #print >> sio, "\t,", "float * o%i_data" % ipos
        print >> sio, "\t)\n{"
        # For each input that is a scalar which has been broadcasted to a
        # tensor, load it into a local variable
        print >> sio, "    __shared__ float value0[%i];" % len(inputs)
        print >> sio, "    __shared__ int shared_dims[%(nd)s];" % locals()
        #print >> sio, "    __shared__ int shared_i_str[%(n_in)s][%(nd)s]"
        print >> sio, "    if ((threadIdx.x == 0) && (threadIdx.y == 0)) {"
        for ipos, i in enumerate(inputs):
            if _logical_scalar(i):
                print >> sio, "    value0[%i] = i%i_data[0];" % (ipos,
                                                                 ipos)
        for ipos in xrange(nd):
            print >> sio, "    shared_dims[%i] = dim%i;" % (ipos, ipos)
        print >> sio, "    }"
        print >> sio, "    __syncthreads();"
        if nd == 4:
            print >> sio, """
            for (int pos0 = blockIdx.x; pos0 < shared_dims[0]; pos0 += gridDim.x)
            {
                for (int pos1 = blockIdx.y; pos1 < shared_dims[1]; pos1 += gridDim.y)
                {
                    //for (int pos2 = threadIdx.x; pos2 < shared_dims[2]; pos2 += blockDim.x)
                    for (int pos2 = threadIdx.y; pos2 < shared_dims[2]; pos2 += blockDim.y)
                    {
                        //for (int pos3 = threadIdx.y; pos3 < shared_dims[3]; pos3 += blockDim.y)
                        for (int pos3 = threadIdx.x; pos3 < shared_dims[3]; pos3 += blockDim.x)
                        {
            """
        else:
            raise NotImplementedError()
        for ipos, i in enumerate(inputs):
            if not _logical_scalar(i):
                print >> sio, "        const float * ii_i%i_data = i%i_data;" % (ipos, ipos)
        for ipos, i in enumerate(outputs):
            print >> sio, "        float * ii_o%i_data = o%i_data;" % (ipos, ipos)
        for d in xrange(nd):
            for ipos, i in enumerate(inputs):
                if not _logical_scalar(i):
                    print >> sio, "        ii_i%i_data += pos%i * i%i_str_%i;" % (ipos, d, ipos, d)
            for ipos, i in enumerate(outputs):
                print >> sio, "        ii_o%i_data += pos%i * o%i_str_%i;" % (ipos, d, ipos, d)

        # perform the scalar operation on the input and output references
        #TODO: What if the scalar_op needs support_code??
        self.task_code(inputs, outputs, sio, nodename,
                       iname=get_str_list_logical_scalar(
                           inputs, value_str='value0[%i]'))
        print >> sio, "    }" * nd

        #TODO: insert runtime stride checks that select the best loop order
        #      either here, or in the host code that launched the kernel
        #      (host code probably better spot)

        #indent = " "*(4*d+7)
        #for ipos, i in enumerate(inputs):
            #print >> sio, indent, "const float * i%i" % ipos, '= i%i_data', ''
        print >> sio, "}"

        return sio.getvalue()
    def c_src_kernel_tiling_less_registers(self, inputs, outputs, nodename):
        """ The kernel applies to problems with <= 5 dimensions """

        nd = outputs[0].type.ndim
        n_in = len(inputs)
        n_out = len(outputs)
        sio = StringIO.StringIO()

        if nd not in (2,):
            return sio.getvalue()

        # print some leading comments to make the code easier to read
        for ipos, i in enumerate(inputs):
            print >> sio, "// Input ", ipos, str(i.type)
        for ipos, i in enumerate(outputs):
            print >> sio, "// Output ", ipos, str(i.type)
        print >> sio, "static __global__ void kernel_%s_%s(unsigned int numEls" % (
            nodename,
            'tiling%i_less_registers' % nd)
        print >> sio, "\t,", ", ".join("const int dim%i" % i
                                       for i in xrange(nd))
        # declare the input data and strides
        for ipos, i in enumerate(inputs):
            s = ", ".join(["const float * i%i_data_0" % ipos] + list(
                "int i%i_str_%i" % (ipos, d) for d in xrange(nd)))
            print >> sio, "\t,", s
        # declare the output data and strides
        for ipos, i in enumerate(outputs):
            s = ", ".join(["float * o%i_data_0" % ipos] + list(
                "int o%i_str_%i" % (ipos, d) for d in xrange(nd)))
            print >> sio, "\t,", s
        #print >> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d) for d in xrange(nd))
        #print >> sio, "\t,", "float * o%i_data" % ipos
        print >> sio, "\t)\n{"

        # TODO: Setting these to true makes the function fail SOMETIMES.
        #       I don't know why yet.
        use_shared_stride = False
        use_shared_limits = False
        if use_shared_limits:
            print >> sio, "__shared__ float * limits[%(nd)s];" % locals()

        def stride(io, p, d):
            if use_shared_stride:
                return "s%s_str[%i][%i]" % (io, p, d)
            else:
                return "%s%i_str_%i" % (io, p, d)

        def limits(d):
            if use_shared_limits:
                return "limits[%i]" % d
            else:
                return "limits%i" % d
        def decl_shared_stride(nin, nout, nd):
            if not use_shared_stride:
                return
            print >> sio, """
            __shared__ int si_str[%(nin)s][%(nd)s];
            __shared__ int so_str[%(nout)s][%(nd)s];
            if ((threadIdx.x == 0) && (threadIdx.y == 0)) {
            """ % locals()
            for i in xrange(nin):
                for d in xrange(nd):
                    print >> sio, "si_str[%(i)s][%(d)s] = i%(i)s_str_%(d)s;" % locals()
            for i in xrange(nout):
                for d in xrange(nd):
                    print >> sio, "so_str[%(i)s][%(d)s] = o%(i)s_str_%(d)s;" % locals()
            print >> sio, "} __syncthreads();"
        def calc_limit(d):
            s = stride('o', 0, d)
            lname = limits(d)
            if use_shared_limits:
                print >> sio, "if ((threadIdx.x == 0) && (threadIdx.y == 0)) {"
                if d == 0:
                    print >> sio, "%(lname)s = o0_data_0 + dim%(d)s * %(s)s;" % locals()
                else:
                    dm1 = d - 1
                    print >> sio, "%(lname)s = o0_data_%(dm1)s + dim%(d)s * %(s)s;" % locals()
                print >> sio, "} __syncthreads();"
            else:
                if d == 0:
                    print >> sio, "const float * %(lname)s = o0_data_0 + dim%(d)s * %(s)s;" % locals()
                else:
                    dm1 = d - 1
                    print >> sio, "const float * %(lname)s = o0_data_%(dm1)s + dim%(d)s * %(s)s;" % locals()
        def decl_ptrs(d, offset):
            dm1 = d - 1
            assert dm1 >= 0
            for i in xrange(n_in):
                s = stride('i', i, d)
                print >> sio, "const float * i%(i)s_data_%(d)s = i%(i)s_data_%(dm1)s + %(offset)s * %(s)s;" % locals()
            for i in xrange(n_out):
                s = stride('o', i, d)
                print >> sio, "float * o%(i)s_data_%(d)s = o%(i)s_data_%(dm1)s + %(offset)s * %(s)s;" % locals()

        def inc_ptrs(d, amt):
            for i in xrange(n_in):
                s = stride('i', i, d)
                print >> sio, "i%(i)s_data_%(d)s += %(amt)s * %(s)s;" % locals()
            for i in xrange(n_out):
                s = stride('o', i, d)
                print >> sio, "o%(i)s_data_%(d)s += %(amt)s * %(s)s;" % locals()
        def while_limit(d):
            lname = limits(d)
            print >> sio, "while (o0_data_%(d)s < %(lname)s) { " % locals()

        def end_while(d):
            print >> sio, "}"

        def task_code(d):
            self.task_code(inputs, outputs, sio, nodename,
                           iname=['i%i_data_%i[0]' % (ipos, d)
                                  for ipos, i in enumerate(inputs)],
                           oname=['o%i_data_%i[0]' % (ipos, d)
                                  for ipos, i in enumerate(outputs)])

        if nd == 4:
            decl_shared_stride(n_in, n_out, nd)
            calc_limit(0)
            inc_ptrs(0, 'blockIdx.x')
            while_limit(0)
            calc_limit(1)
            decl_ptrs(1, 'blockIdx.y')
            while_limit(1)
            calc_limit(2)
            decl_ptrs(2, 'threadIdx.y')
            while_limit(2)
            calc_limit(3)
            decl_ptrs(3, 'threadIdx.x')
            while_limit(3)
            task_code(3)
            inc_ptrs(3, 'blockDim.x')
            end_while(3)
            inc_ptrs(2, 'blockDim.y')
            end_while(2)
            inc_ptrs(1, 'gridDim.y')
            end_while(1)
            inc_ptrs(0, 'gridDim.x')
            end_while(0)

        print sio.getvalue()
        return sio.getvalue()
def elemwise_collapses(inputs, outputs, out_shape=None, verbose=0):
    """
    This collapses dimensions that are not needed when computing an
    elemwise. This is useful as it lowers the indexing computation,
    which is heavier on the GPU than on the CPU.

    This is a generic version. It collapses dimensions at any place in
    the shape. It handles broadcasted dimensions correctly.

    There is no special handling needed for broadcasted scalars at this
    level.

    @return: ndims, tuple(dims, strides) after collapsing.
    """
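    # Illustrative behaviour, based on the code below: if every input and
    # output is C contiguous with the same shape, the fast path returns
    # (0, (shape, [])). Pairing a (200, 300) input with a broadcast
    # (200, 1) input keeps nd_collapse == 2, because the stride-0 broadcast
    # dimension prevents merging the two axes.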
    in_out = inputs + outputs

    if out_shape is not None:
        local_dims = tuple(out_shape)
    else:
        # TODO, use the right algo here or make the parameter not optional
        # We should always have the same shape for all outputs
        # If there is more than one output
        local_dims = tuple(outputs[0].shape)

    nd_orig = len(local_dims)

    # This has lower overhead
    all_c_contig = True
    for inp in in_out:
        if not inp.flags['C_CONTIGUOUS'] or inp.shape != local_dims:
            all_c_contig = False
            break
    if all_c_contig:
        return 0, (local_dims, [])

    collapsable = [1] * nd_orig

    local_str = [None] * len(in_out)
    nd_collapse = nd_orig
    for ipos in xrange(len(in_out)):
        inp = in_out[ipos]
        assert len(inp.shape) == nd_orig, (
            "All inputs/outputs must have the same number of dimensions."
            " You must broadcast before calling elemwise_collapse")
        local_str[ipos] = list(inp.strides)
        # We set the strides of broadcastable dims to 0
        # This makes indexing on the gpu simpler and is needed
        # for collapsing the dimensions.
        for dim_pos in range(inp.ndim):
            if inp.shape[dim_pos] == 1:
                local_str[ipos][dim_pos] = 0

    if nd_orig == 1:
        # We already covered the contiguous case before,
        # so we are sure it is not contiguous.
        # TODO: Add a test that f contiguous arrays are also collapsed by
        #       the first case. I think that for 1d arrays, when the
        #       f contiguous flag is true, c contiguous is also true.
        return 1, (local_dims, local_str)
    if verbose:
        print "before broadcast collapse"
        print " nd_collapse", nd_collapse
        print " local_dims", local_dims
        for ipos in xrange(len(local_str)):
            print " local_str inputs", ipos, local_str[ipos]

    local_dims = list(local_dims)
    # Collapse dimensions that are broadcast in all inputs.
    # Needs to be done before the contiguous collapse, as it will break it.
    # Update the dimensions and the strides
    for id in range(nd_collapse):
        if local_dims[id] == 1:
            # remove dim i from the array
            for j in range(id + 1, nd_collapse):
                local_dims[j - 1] = local_dims[j]
            # remove strides i from the array
            for input_id in range(len(in_out)):
                for j in range(id + 1, nd_collapse):
                    local_str[input_id][j - 1] = local_str[input_id][j]
            nd_collapse -= 1
            id -= 1  # TODO: what is this? How does this work?

    if verbose:
        print "after broadcast collapse"
        print " nd_collapse", nd_collapse
        print " local_dims", local_dims
        for ipos in xrange(len(local_str)):
            print " local_str inputs", ipos, local_str[ipos]
    nd_collapse_ = [1] * nd_orig
    for ipos in xrange(len(local_str)):
        # Can we collapse dims[i] and dims[i-1]?
        strides = local_str[ipos]
        for i in range(nd_collapse - 1, 0, -1):
            if strides[i] * local_dims[i] != strides[i - 1]:
                # dims i-1 and i are not contiguous with each other,
                # so we cannot collapse them
                nd_collapse_[i] = 0

    if verbose:
        print "nd_collapse_", nd_collapse_

    nd_collapse2 = nd_collapse
    for i in range(nd_collapse - 1, 0, -1):
        if nd_collapse_[i] == 1:
            # update the local dims.
            local_dims[i - 1] *= local_dims[i]
            for j in range(i + 1, nd_collapse):
                local_dims[j - 1] = local_dims[j]

            # update the local strides.
            for ipos in xrange(len(local_str)):
                local_str[ipos][i - 1] = local_str[ipos][i]  # set new strides
                # remove stride i from the array
                for j in range(i + 1, nd_collapse):
                    local_str[ipos][j - 1] = local_str[ipos][j]

            # update the new number of dims
            nd_collapse2 -= 1
    nd_collapse = nd_collapse2

    if nd_collapse == 1:
        l = [local_str[ipos][nd_collapse - 1] == in_out[ipos].itemsize
             for ipos in range(len(local_str))]
        if all(l):
            # Everything is contiguous: use the c contiguous special case.
            return 0, (local_dims, [])

    if verbose:
        print "end collapsing"
        print " nd_collapse", nd_collapse
        print " local_dims", local_dims
        for ipos in xrange(len(local_str)):
            print " local_str inputs", ipos, local_str[ipos]

    return nd_collapse, (local_dims, local_str)
def reduction_collapses(inout, axis, verbose=0):
    """
    This collapses dimensions that are not needed when computing a
    reduction. This is useful as it lowers the indexing computation,
    which is heavier on the GPU than on the CPU.

    This is a generic version. It collapses dimensions at any place in
    the shape.

    @param: inout: tuple(input, output)
    @param: axis: None, an integer, or a list of 1 integer
        The axis over which we will do the reduction.
    @return: (ndims, (input dims, input strides, input pattern), out strides)
        after collapsing.

    :note: we suppose that we can always collapse the output dimensions.
    """
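    # Example of the quick path below: for a contiguous 1d input,
    #   reduction_collapses((inp, out), 0)
    # returns (0, ((inp.size,), (inp.itemsize,), 0), (0,)).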
    input = inout[0]
    out = inout[1]

    if input.ndim == 1:
        # Some quick checks. They are faster than the full version.
        # The output size is always 1, so we don't care about its strides
        if (input.flags['C_CONTIGUOUS'] or input.flags['F_CONTIGUOUS']):
            return 0, ((input.size,), (input.itemsize,), axis), (0,)
        assert axis == [0] or axis == 0 or axis is None
        # not c contiguous, as the first if should have caught it.
        return 1, (input.shape, input.strides, axis), (0,)

    if not isinstance(axis, (list, tuple)):
        local_axis = [axis]
    else:
        local_axis = list(axis)

    # This is needed for computing the output strides
    assert axis is None or len(local_axis) == 1

    local_dims = list(input.shape)
    local_str = list(input.strides)
    out_strides = list(out.strides)

    nd_orig = len(local_dims)
    collapsable = [1] * nd_orig
    nd_collapse = nd_orig
    if verbose:
        print "before broadcast collapse"
        print " nd_collapse", nd_collapse
        print " local_dims", local_dims
        print " local_str inputs", local_str
        print " local_axis", local_axis

    # Collapse dimensions that are broadcast in all inputs.
    # Needs to be done before the contiguous collapse, as it will break it.
    # Update the dimensions and the strides
    for id in range(nd_collapse):
        if local_dims[id] == 1:
            for j in range(id + 1, nd_collapse):
                # remove dims i from the array
                local_dims[j - 1] = local_dims[j]
                # remove strides i from the array
                local_str[j - 1] = local_str[j]
                # remove output strides i from the array
                if axis is not None:
                    out_strides[j - 2] = out_strides[j - 1]
            if id in local_axis:
                local_axis.remove(id)
            for axis_pos in range(len(local_axis)):
                if local_axis[axis_pos] > id:
                    local_axis[axis_pos] -= 1

            nd_collapse -= 1
            id -= 1  # TODO: how does this work?

    if verbose:
        print "after broadcast collapse"
        print " nd_collapse", nd_collapse
        print " local_dims", local_dims
        print " local_str inputs", local_str
        print " local_axis", local_axis
        print " out_strides", out_strides
    nd_collapse_ = [1] * nd_orig
    # Can we collapse dims[i] and dims[i-1]?
    for i in range(nd_collapse - 1, 0, -1):
        if ((local_str[i] * local_dims[i] != local_str[i - 1])):
            # dims i-1 and i are not contiguous with each other,
            # so we cannot collapse them
            nd_collapse_[i] = 0
        elif (i in local_axis) != ((i - 1) in local_axis):
            # Never collapse a reduced dimension with a kept one.
            nd_collapse_[i] = 0

    if verbose:
        print "nd_collapse_", nd_collapse_

    nd_collapse2 = nd_collapse
    for i in range(nd_collapse - 1, 0, -1):
        if nd_collapse_[i] == 1:
            # update the local dims.
            local_dims[i - 1] *= local_dims[i]
            # set the new strides
            local_str[i - 1] = local_str[i]
            # remove the old dims and strides
            for j in range(i + 1, nd_collapse):
                local_dims[j - 1] = local_dims[j]
                local_str[j - 1] = local_str[j]
                if axis is not None:
                    out_strides[j - 2] = out_strides[j - 1]

            if i in local_axis:
                local_axis.remove(i)
            for axis_pos in range(len(local_axis)):
                if local_axis[axis_pos] > i:
                    local_axis[axis_pos] -= 1

            # update the new number of dims
            nd_collapse2 -= 1
    nd_collapse = nd_collapse2

    if nd_collapse == 1:
        if local_str[nd_collapse - 1] == input.itemsize:
            # Fully contiguous after collapsing: use the contiguous case.
            return 0, ((input.size,), (input.itemsize,), axis), (0,)

    if verbose:
        print "end collapsing"
        print " nd_collapse", nd_collapse
        print " local_dims", local_dims
        print " local_str inputs", local_str
        print " local_axis", local_axis
        print " out_strides", out_strides

    #print input.shape, input.strides
    #print nd_collapse, (local_dims, local_str, local_axis)
    local_dims = local_dims[:nd_collapse]
    local_str = local_str[:nd_collapse]
    out_strides = out_strides[:nd_collapse]
    return nd_collapse, (local_dims, local_str, local_axis), out_strides
def call_elemwise(fct, input_vals, block=None, grid=None, out=None,
                  out_shape=None, strides=None):
    """ Call an elemwise gpu function with given inputs and block size.

    :param fct: The gpu function to call
    :param input_vals: a list of inputs to pass to fct
    :param block: int, the size of the block wanted
    :param grid: int, the size of the grid wanted
    :param out: Optional, the preallocated output. Must have the right shape
        and dtype.
    :param out_shape: Optional, if provided, we will suppose that the output
        has this shape even if it is not true.
    :param strides: Optional, if provided, we will use those strides for
        the inputs and outputs.

    :note: params out_shape and strides are used for the collapsing of
        dimensions.
    """
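    # Hypothetical usage (names from this file; `add_fct` stands for a kernel
    # compiled with compile_gpu_code for two inputs of the same shape):
    #   out = call_elemwise(add_fct, [a, b])
    # When block/grid are not given, they are derived from out.size below.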
    inp = input_vals[0]

    # Get the output and output shape to use
    if out_shape is None and out is None:
        out_shape = list(inp.shape)
        for i in input_vals[1:]:
            # dtype checked by pycuda before gpu call
            for s_i in range(len(inp.shape)):
                assert (inp.shape[s_i] == i.shape[s_i]
                        or inp.shape[s_i] == 1
                        or i.shape[s_i] == 1)
                out_shape[s_i] = max(out_shape[s_i], inp.shape[s_i],
                                     i.shape[s_i])
    if out is None:
        out = gpu_ndarray.empty(out_shape, dtype=inp.dtype)
    elif out_shape is None:
        out_shape = out.shape

    # Arg: the number of elements
    args = [cast_uint(out.size)]
    # Arg: output shape to the arguments.
    for i in range(len(out_shape)):
        args.append(cast_int(out_shape[i]))

    # for each input and the output,
    # add its ptr and strides
    nd = len(out_shape)
    idx = -1
    for i in list(input_vals) + [out]:
        idx += 1
        itemsize = i.dtype.itemsize
        args.append(i)
        for j in range(nd):
            # We force a stride of 0 for broadcastable dimensions.
            # This lowers the index computation in the kernel.
            if strides is not None:
                # strides should have a stride of 0 for broadcasting.
                args.append(cast_int(strides[idx][j] / itemsize))
            elif i.shape[j] == 1:
                args.append(cast_int(0))
            else:
                args.append(cast_int(i.strides[j] / itemsize))

    out_size = out.size
    if block is None:
        # First use at least a full warp
        block_ = min(32, out_size)

        # Next start adding multiprocessors
        grid_ = min(out_size / block_ + (out_size % block_ != 0), 60)

        # Next start adding more warps per multiprocessor
        if block_ * grid_ < out_size:
            block_ = min(out_size / grid_, 512)
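
    # Worked example of this heuristic, assuming out_size == 10000:
    # block_ = 32, grid_ = min(313, 60) = 60; since 32 * 60 < 10000,
    # block_ is raised to min(10000 // 60, 512) = 166.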
    else:
        block_ = block
        grid_ = grid

    # We bypass the pycuda wrapper gpu function call
    # by calling the gpu function directly.
    # This is faster and lowers the overhead.
    # Here is code that allows you to use the pycuda fct call:
    # d = {"block":(block_,1,1), "grid":(grid_,1)}
    # fct(*args, **d)
    fct.set_block_shape(block_, 1, 1)  # time_kernel
    fct.param_set(*args)
    fct.launch_grid(grid_, 1)
    return out
class MyGpuNdArray():
    _compiled_fct = {}

    def __init__(self, gpu_nd_array):
        #assert isinstance(gpu_nd_array, gpu_ndarray.GpuNdArrayObject)
        self.gpu_nd_array = gpu_nd_array
        self.ctype = dtype_to_ctype(self.gpu_nd_array.dtype)
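
    # Hypothetical usage sketch (the wrapped object must provide the
    # attributes proxied by the properties below: shape, strides, dtype...):
    #   ga = MyGpuNdArray(some_gpu_nd_array)
    #   gb = MyGpuNdArray(other_gpu_nd_array)
    #   gc = ga + gb              # compiles and caches an elemwise kernel
    #   host = numpy.asarray(gc)  # __array__ copies the result back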
    @staticmethod
    def gen_fct(op, inputs, nd, nodename="TestNodeName"):
        if _CL_MODE:
            npy_ty = "typedef float npy_float32;\n"
        else:
            npy_ty = "typedef double npy_float64;\n typedef float npy_float32;\n"

        # Generate the gpu functions
        nb_in = len(inputs)
        fcts = [None]
        for nd in range(1, nd + 1):  # 1 to nd
            out = op(*[TensorType(i.gpu_nd_array.dtype,
                                  (False,) * nd)() for i in inputs])
            out_dtype = out.dtype
            node = out.owner
            elemwise_algo = ElemwiseAlgo(node.op.scalar_op)

            code = (CLUDA_PREAMBLE +
                    npy_ty +
                    elemwise_algo.c_src_kernel(node.inputs,
                                               node.outputs,
                                               nodename, nd,
                                               static=""))
            fct_name = "kernel_%s_%d" % (nodename, nd)
            fct = compile_gpu_code(code, fct_name)
            fcts.append(fct)

        # All inputs/outputs C contiguous case
        code = (CLUDA_PREAMBLE +
                npy_ty +
                elemwise_algo.c_src_kernel_Ccontiguous(
                    node.inputs, node.outputs, nodename, static=""))
        fct_name = "kernel_%s_Ccontiguous" % nodename
        fcts[0] = compile_gpu_code(code, fct_name)
        def call_fct2(inputs, out=None):
            """ Do dimension collapsing before calling the gpu code """
            assert len(inputs) == nb_in
            # dtype checked by pycuda
            # TODO: assert nb dim?

            inp = inputs[0]

            # Compute the output shape.
            out_shape = list(inp.shape)
            for i in inputs[1:]:
                for s_i in range(len(inp.shape)):
                    assert (inp.shape[s_i] == i.shape[s_i]
                            or inp.shape[s_i] == 1
                            or i.shape[s_i] == 1)
                    out_shape[s_i] = max(out_shape[s_i], i.shape[s_i])
            # Create the output object
            if (out is None
                    or out.dtype != out_dtype
                    or out.shape != tuple(out_shape)):
                out = MyGpuNdArray(gpu_ndarray.empty(out_shape,
                                                     dtype=out_dtype))

            # Do the collapsing.
            nd_col, info = elemwise_collapses(list(inputs), [out])
            # The next two lines are useful to force a call to the
            # c contiguous version:
            #nd_col = 0
            #info = (info[0], [])

            if nd_col <= nd:
                out = call_elemwise(fcts[nd_col], inputs,
                                    out=out, out_shape=info[0][:nd_col],
                                    strides=info[1])
            else:
                # Fall back to the non-collapsed, full-nd kernel.
                out = call_elemwise(fcts[-1], inputs, out=out,
                                    out_shape=out_shape)
            return out

        return call_fct2
    def __elemwise2__(self, other, name, op):
        """ Call this code on this op with 2 inputs """
        nd = len(self.gpu_nd_array.shape)  # self.gpu_nd_array.ndim
        assert nd == len(other.gpu_nd_array.shape)  # ndim
        tag = (name + '_' + str(self.gpu_nd_array.dtype)
               + str(self.gpu_nd_array.ndim))
        tag += ('_' + str(other.gpu_nd_array.dtype)
                + str(other.gpu_nd_array.ndim))
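        # e.g. two 2d float32 operands give tag == "add_float322_float322",
        # so each dtype/ndim combination compiles its kernels only once.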
        fct = self._compiled_fct.get(tag, None)
        if fct is None:
            # print "compile", tag
            fct = MyGpuNdArray.gen_fct(op, [self, other], nd)
            self._compiled_fct[tag] = fct
        return fct((self, other))
    @classmethod
    def __elemwise__(cls, inputs, name, op, out=None):
        """ Call this code on this op with * inputs """
        nd = len(inputs[0].gpu_nd_array.shape)  # self.gpu_nd_array.ndim
        for i in inputs[1:]:
            assert nd == len(i.gpu_nd_array.shape)  # ndim
        tag = name + "_".join([str(i.gpu_nd_array.dtype) +
                               str(i.gpu_nd_array.ndim) for i in inputs])
        fct = cls._compiled_fct.get(tag, None)
        if fct is None:
            # print "compile", tag
            fct = MyGpuNdArray.gen_fct(op, inputs, nd)
            cls._compiled_fct[tag] = fct
        return fct(inputs, out=out)
    base = property(lambda self: self.gpu_nd_array.base)
    bytes = property(lambda self: self.gpu_nd_array.bytes)
    dtype = property(lambda self: self.gpu_nd_array.dtype)
    flags = property(lambda self: self.gpu_nd_array.flags)
    itemsize = property(lambda self: self.gpu_nd_array.itemsize)
    ndim = property(lambda self: self.gpu_nd_array.ndim,
                    doc="number of dimensions")
    offset = property(lambda self: self.gpu_nd_array.offset)
    shape = property(lambda self: self.gpu_nd_array.shape)
    size = property(lambda self: self.gpu_nd_array.size)
    strides = property(lambda self: self.gpu_nd_array.strides)
    def __array__(self):
        return numpy.asarray(self.gpu_nd_array)

    def copy(self):
        return MyGpuNdArray(self.gpu_nd_array.copy())

    def view(self):
        return MyGpuNdArray(self.gpu_nd_array.view())

    def __copy__(self):
        return MyGpuNdArray(self.gpu_nd_array.__copy__())

    def __deepcopy__(self):
        return MyGpuNdArray(self.gpu_nd_array.__deepcopy__())

    @property
    def gpudata(self):
        # TODO: Add this assert when PyCUDA/PyOpenCL can use the bytes
        #       attribute. Without this assert, old code that doesn't
        #       support strides can receive as input an object that is
        #       strided, and no error will be raised.
        #assert (self.gpu_nd_array.flags['C_CONTIGUOUS'] or
        #        self.gpu_nd_array.flags['F_CONTIGUOUS'])

        # TODO: find a way to pass to a pycuda/pyopencl function the
        #       bytes + offset directly.
        return self.bytes + self.offset

    def __getitem__(self, *inputs):
        return MyGpuNdArray(self.gpu_nd_array.__getitem__(*inputs))
    def __add__(self, other):
        return self.__elemwise2__(other, "add", theano.tensor.add)

    def __sub__(self, other):
        return self.__elemwise2__(other, "sub", theano.tensor.sub)

    def __mul__(self, other):
        return self.__elemwise2__(other, "mul", theano.tensor.mul)

    def __div__(self, other):
        assert (str(self.gpu_nd_array.dtype).startswith("float") or
                str(other.gpu_nd_array.dtype).startswith("float"))
        return self.__elemwise2__(other, "true_div", theano.tensor.true_div)

    @classmethod
    def add(cls, x, y, out=None):
        """ add the two inputs together element-wise """
        return cls.__elemwise__([x, y], "add", theano.tensor.add, out=out)

    @classmethod
    def adds(cls, *inputs):
        """ add all inputs together element-wise """
        return cls.__elemwise__(inputs, "add", theano.tensor.add)

    @classmethod
    def multiplys(cls, *inputs):
        """ multiply all inputs together element-wise """
        return cls.__elemwise__(inputs, "mul", theano.tensor.mul)
    def sum(self, axis=None, collapse=True):
        import gen_reduction
        max_thread_per_block = 512
        max_block = 4096  # assumed limit on blocks per grid dimension

        if isinstance(axis, (list, tuple)):
            if len(axis) == 1:
                axis = axis[0]
            else:
                assert len(axis) == self.ndim
                assert axis == range(self.ndim)
                axis = None

        # zeros so that a size-0 reduction returns 0
        if self.size == 0:
            make_out = gpu_ndarray.zeros
        else:
            make_out = gpu_ndarray.empty

        if axis is None:
            out = make_out((), self.dtype)
            out = MyGpuNdArray(out)
        else:
            out_shape = [self.shape[i] for i in range(self.ndim)
                         if i != axis]
            out = make_out(out_shape, self.dtype)
            out = MyGpuNdArray(out)

        if collapse:
            coll_ndim, (coll_shape, coll_strides, coll_axis), coll_out_str = (
                reduction_collapses([self, out], axis))
        else:
            coll_ndim = self.ndim
            coll_shape = self.shape
            coll_strides = self.strides
            coll_axis = [axis]
            coll_out_str = out.strides

        if axis is not None:
            coll_axis = coll_axis[0]
        if coll_ndim == 0:
            sum_op = gen_reduction.GpuSum([1], self.dtype)
            c_code = sum_op.c_support_code_apply("nodename", contig=True)
            fctname = "kernel_reduce_sum_ccontig_nodename"
            fct = compile_gpu_code(c_code, fctname)
            block_ = min(coll_shape[0], max_thread_per_block)
            block = (block_, 1, 1)
            grid = (1, 1)
            shared_ = self.dtype.itemsize * block_
            args = [cast_int(coll_shape[0]), self, out]
        elif axis is None:
            pattern = [1] * coll_ndim
            str_pattern = [str(i) for i in pattern]
            sum_op = gen_reduction.GpuSum(pattern, self.dtype)
            c_code = sum_op.c_support_code_apply("nodename")
            if not c_code:
                raise NotImplementedError(
                    "GpuNdArray sum case not implemented")
            fctname = "kernel_reduce_sum_" + "".join(str_pattern) + "_nodename"
            fct = compile_gpu_code(c_code, fctname)
            if coll_ndim == 1:
                bx = min(max_thread_per_block, coll_shape[0])
                block = (bx, 1, 1)
                block_ = bx
            elif coll_ndim == 2:
                bx = min(max_thread_per_block, coll_shape[1])
                by = min(max_thread_per_block // coll_shape[1], coll_shape[0])
                by = max(by, 1)
                block = (bx, by, 1)
                block_ = bx * by
            elif coll_ndim == 3:
                bx = min(max_thread_per_block, coll_shape[2])
                by = min(max_thread_per_block // bx, coll_shape[1])
                by = max(by, 1)
                bz = min(max_thread_per_block // (bx * by), coll_shape[0])
                bz = min(max(bz, 1), 64)
                block = (bx, by, bz)
                block_ = bx * by * bz
            elif coll_ndim == 4:
                bx = min(max_thread_per_block, coll_shape[3])
                by = min(max_thread_per_block // bx, coll_shape[2])
                by = max(by, 1)
                bz = min(max_thread_per_block // (bx * by), coll_shape[1])
                bz = min(max(bz, 1), 64)
                block = (bx, by, bz)
                block_ = bx * by * bz
            grid = (1, 1)
            shared_ = self.dtype.itemsize * block_
        elif coll_ndim in [1, 2, 3]:
            if coll_ndim == 1:
                assert coll_axis == 0
                sum_op = gen_reduction.GpuSum([1], self.dtype)
                fctname = "kernel_reduce_sum_1_nodename"
                grid = (1, 1)
                block_ = min(max_thread_per_block, coll_shape[0])
                block = (block_, 1, 1)
            elif coll_ndim == 3 and coll_axis == 0:
                sum_op = gen_reduction.GpuSum([1, 0, 0], self.dtype)
                fctname = "kernel_reduce_sum_100_nodename"
                gx = min(coll_shape[1], max_block)
                gy = min(max_block // (gx * coll_shape[2]), coll_shape[2])
                gy = max(gy, 1)
                grid = (gx, gy)
                block_ = min(max_thread_per_block, coll_shape[0])
                block = (block_, 1, 1)
            elif coll_ndim == 3 and coll_axis == 1:
                sum_op = gen_reduction.GpuSum([0, 1, 0], self.dtype)
                fctname = "kernel_reduce_sum_010_AD_nodename"
                A = coll_shape[0]
                B = coll_shape[1]
                C = coll_shape[2]
                D = C / 32
                if (32 * D < C):
                    D += 1
                assert ((C <= 32 * D) and (32 * D < C + 32))
                gx = min(A, max_block)
                gy = min(max_block // (D * A), D)
                gy = max(gy, 1)
                grid = (gx, gy)
                block = (32, 1, 1)
                block_ = 32
                args = [cast_int(A), cast_int(B),
                        cast_int(C), cast_int(D)]
                args.append(self)
                args += [cast_int(i / self.dtype.itemsize)
                         for i in coll_strides]
                args.append(out)
                args.append(cast_int(coll_out_str[0] / out.dtype.itemsize))
                args.append(cast_int(coll_out_str[1] / out.dtype.itemsize))
            elif coll_ndim == 3 and coll_axis == 2:
                sum_op = gen_reduction.GpuSum([0, 0, 1], self.dtype)
                fctname = "kernel_reduce_sum_001_nodename"
                gx = min(coll_shape[0], max_block)
                gy = min(max_block // (gx * coll_shape[1]), coll_shape[1])
                gy = max(gy, 1)
                grid = (gx, gy)
                block_ = min(max_thread_per_block, coll_shape[2])
                block = (block_, 1, 1)
            elif coll_axis == 0:
                # The 2d case reuses the 010 kernel with a dummy
                # leading dimension of size 1.
                sum_op = gen_reduction.GpuSum([1, 0], self.dtype)
                fctname = "kernel_reduce_sum_010_nodename"
                block_ = min(coll_shape[1], max_thread_per_block)
                block = (block_, 1, 1)
                grid = (1, coll_shape[0])

                args = [cast_int(1)]
                args += [cast_int(i) for i in coll_shape]
                args.append(self)
                args.append(cast_int(1))
                args += [cast_int(i / self.dtype.itemsize)
                         for i in coll_strides]
                args.append(out)
                args.append(cast_int(1))
                # We must take the last dimension in the case of
                # collapsed dimensions.
                args.append(cast_int(coll_out_str[-1] / out.dtype.itemsize))
            elif coll_axis == 1:
                sum_op = gen_reduction.GpuSum([0, 1], self.dtype)
                fctname = "kernel_reduce_sum_01_nodename"
                block_ = min(coll_shape[1], max_thread_per_block)
                block = (block_, 1, 1)
                grid = (1, min(coll_shape[0], max_block))
            else:
                raise Exception("Bad axis")

            c_code = sum_op.c_support_code_apply("nodename")
            fct = compile_gpu_code(c_code, fctname)

            shared_ = self.dtype.itemsize * block_
        else:
            raise Exception("Not implemented")
        if fctname not in ("kernel_reduce_sum_ccontig_nodename",
                           "kernel_reduce_sum_010_AD_nodename",
                           "kernel_reduce_sum_010_nodename"):
            # Default argument list: shapes, then input + strides,
            # then output + strides.
            args = [cast_int(i) for i in coll_shape]
            args.append(self)
            args += [cast_int(i / self.dtype.itemsize)
                     for i in coll_strides]
            args.append(out)
            args += [cast_int(i / self.dtype.itemsize)
                     for i in coll_out_str]
        pycuda._driver.Context.synchronize()
        #print fctname, block, grid, shared_, axis
        #print self.ndim, self.shape, self.strides, axis, out.strides
        #print coll_ndim, coll_shape, coll_strides, coll_axis, coll_out_str

        d = {"block": block,
             "shared": shared_,
             "grid": grid}
        # We bypass the pycuda wrapper gpu function call
        # by calling the gpu function directly.
        # This is faster and lowers the overhead.
        # The pycuda wrapper equivalent would be: fct(*args, **d)
        fct.set_block_shape(*block)
        fct.set_shared_size(shared_)
        fct.param_set(*args)
        fct.launch_grid(*grid)
        return out