"""This file implements one version of the elemwise op on the GPU.

The elemwise functions are also used with scalar operations, so it can
happen that ndim is 0, as with all scalar types.
"""
import pygpu_ndarray as gpu_ndarray

_CL_MODE = hasattr(gpu_ndarray, "set_opencl_context")

if _CL_MODE:
    # THIS IS NOT FINISHED
    import pyopencl as cl
    import pyopencl.array as cl_array
    from pyopencl.tools import dtype_to_ctype
    # import pyopencl._mymako as mako
    from pyopencl._cluda import CLUDA_PREAMBLE

    # TODO: use mako to get rid of the %if
    CLUDA_PREAMBLE = CLUDA_PREAMBLE[:455]
    CLUDA_PREAMBLE += """
#define LDIM_0 get_local_size(0)
#define LDIM_1 get_local_size(1)
#define LDIM_2 get_local_size(2)

#define GDIM_0 get_num_groups(0)
#define GDIM_1 get_num_groups(1)
#define GDIM_2 get_num_groups(2)
"""

    # TODO: reuse the same context as the user used to create the memory.
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)
else:
    import pycuda.autoinit
    import pycuda.driver as driver
    from pycuda.compiler import SourceModule
    from pycuda.tools import dtype_to_ctype
    # import pycuda._mymako as mako
    from pycuda._cluda import CLUDA_PREAMBLE

    CLUDA_PREAMBLE += """
#define LDIM_0 blockDim.x
#define LDIM_1 blockDim.y
#define LDIM_2 blockDim.z

#define GDIM_0 gridDim.x
#define GDIM_1 gridDim.y
#define GDIM_2 gridDim.z
"""
import logging
import StringIO

import numpy

import theano
from theano import Apply
from theano import scalar
from theano.tensor import TensorType

_logger_name = 'compyte.gen_elemwise'
_logger = logging.getLogger(_logger_name)
_logger.setLevel(logging.INFO)
_logger.addHandler(logging.StreamHandler())  # TO REMOVE


def warning(*msg):
    _logger.warning(_logger_name + 'WARNING: ' + ' '.join(str(m) for m in msg))


def info(*msg):
    _logger.info(_logger_name + 'INFO: ' + ' '.join(str(m) for m in msg))


def debug(*msg):
    _logger.debug(_logger_name + 'DEBUG: ' + ' '.join(str(m) for m in msg))


if _CL_MODE:
    gpu_ndarray.set_opencl_context(ctx.obj_ptr)

cast_int = numpy.intc
cast_uint = numpy.uintc
def _logical_scalar(x):
    return numpy.all(x.type.broadcastable)


def get_str_list_logical_scalar(inputs, value_str='ii_i%i_value',
                                data_str='ii_i%i_data[0]'):
    l = []
    for ipos, i in enumerate(inputs):
        if _logical_scalar(i):
            l += [value_str % ipos]
        else:
            l += [data_str % ipos]
    return l
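

# For example, with two inputs where only the first is a broadcast scalar,
# get_str_list_logical_scalar returns ['ii_i0_value', 'ii_i1_data[0]']:
# the C expressions that the generated kernels use to read each input.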
class WrapOpenCLFunction(object):
    def __init__(self, fct):
        self.fct = fct

    def _param_wrap(self, p):
        if isinstance(p, MyGpuNdArray):
            p = p.gpu_nd_array
        if isinstance(p, gpu_ndarray.GpuNdArrayObject):
            p = cl.MemoryObject.from_cl_mem_as_int(p.bytes)
        return p

    def set_block_shape(self, *shape):
        self.local_size = shape

    def param_set(self, *param):
        self.param = [self._param_wrap(p) for p in param]

    def launch_grid(self, *global_shape):
        global_size = global_shape + (1,)
        d = {"g_times_l": True}
        return self.fct(queue, global_size, self.local_size,
                        *self.param, **d)
def compile_gpu_code(code, fct_name):
    if _CL_MODE:
        # Compile the gpu function with pyopencl
        prg = cl.Program(ctx, code).build()
        fct2 = getattr(prg, fct_name)
        fct = WrapOpenCLFunction(fct2)
    else:
        # Compile the gpu function with pycuda
        mod = SourceModule(code)
        fct = mod.get_function(fct_name)
    return fct
class ElemwiseAlgo(object):
    verbose = 0  # 1, 2 or 3 for more verbose output.
    cache_version = ('debug', 14, verbose)

    def __init__(self, scalar_op, inplace_pattern={}):
        """
        :param scalar_op: the scalar operation to execute on each element.
        """
        self.scalar_op = scalar_op
        self.inplace_pattern = inplace_pattern

    def task_code(self, inputs, outputs, sio,
                  nodename, iname=None, oname=None):
        if iname is None:
            iname = get_str_list_logical_scalar(inputs)
        if oname is None:
            oname = ['ii_o%i_data[0]' % ipos for ipos, i in enumerate(outputs)]
        print >> sio, self.scalar_op.c_code(
            Apply(self.scalar_op,
                  [scalar.Scalar(dtype=input.type.dtype)()
                   for input in inputs],
                  [scalar.Scalar(dtype=output.type.dtype)()
                   for output in outputs]),
            nodename + '_scalar_',
            iname,
            oname,
            sub=dict(fail='return;'))  # TODO: set a failure code somehow!!!
    def c_src_kernel(self, inputs, outputs, nodename, nd, static="static"):
        sio = StringIO.StringIO()
        #print 'C_SRC_KERNEL', sio.getvalue()

        for ipos, i in enumerate(inputs):
            print >> sio, "// Input ", ipos, str(i.type)
        for ipos, i in enumerate(outputs):
            print >> sio, "// Output ", ipos, str(i.type)
        print >> sio, static, (
            "KERNEL void kernel_%s_%s(unsigned int numEls" % (nodename, nd))
        if nd > 0:
            print >> sio, "\t,", ", ".join("const int dim%i" % i
                                           for i in xrange(nd))
        # declare the input data and strides
        for ipos, i in enumerate(inputs):
            s = ", ".join(["GLOBAL_MEM const %s * i%i_data" % (
                dtype_to_ctype(i.dtype), ipos)] +
                          list("int i%i_str_%i" % (ipos, d)
                               for d in xrange(nd)))
            print >> sio, "\t,", s
        # declare the output data and strides
        for ipos, i in enumerate(outputs):
            s = ", ".join(["GLOBAL_MEM %s * o%i_data" % (
                dtype_to_ctype(i.dtype), ipos)]
                          + list("int o%i_str_%i" % (ipos, d)
                                 for d in xrange(nd)))
            print >> sio, "\t,", s
        #print >> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d)
        #                               for d in xrange(nd))
        #print >> sio, "\t,", "float * o%i_data" % ipos
        print >> sio, "\t)\n{"
        print >> sio, "    const int idx = GID_0 * LDIM_0 + LID_0;"
        print >> sio, "    const int numThreads = LDIM_0 * GDIM_0;"
        # For each input that is a scalar which has been broadcasted
        # to a tensor, load it into a local variable
        for ipos, i in enumerate(inputs):
            if _logical_scalar(i):
                print >> sio, "    const %s ii_i%i_value = i%i_data[0];" % (
                    dtype_to_ctype(i.dtype), ipos, ipos)

        # loop over the elements to be treated by this kernel call
        print >> sio, "    for (int i = idx; i < numEls; i += numThreads) {"
        # calculate the data pointers for all arguments
        print >> sio, "        int ii = i;"
        for ipos, i in enumerate(inputs):
            if not _logical_scalar(i):
                print >> sio, ("        GLOBAL_MEM const "
                               "%s * ii_i%i_data = i%i_data;" % (
                                   dtype_to_ctype(i.dtype), ipos, ipos))
        for ipos, i in enumerate(outputs):
            print >> sio, "        GLOBAL_MEM %s * ii_o%i_data = o%i_data;" % (
                dtype_to_ctype(i.dtype), ipos, ipos)
        for d in xrange(nd - 1, -1, -1):
            if d > 0:
                print >> sio, "        int pos%i = ii %% dim%i;" % (d, d)
                print >> sio, "        ii = ii / dim%i;" % d
            else:
                print >> sio, "        int pos%i = ii;" % d
            for ipos, i in enumerate(inputs):
                if not _logical_scalar(i):
                    print >> sio, ("        ii_i"
                                   "%i_data += pos%i * i%i_str_%i;" % (ipos, d, ipos, d))
            for ipos, i in enumerate(outputs):
                print >> sio, "        ii_o%i_data += pos%i * o%i_str_%i;" % (
                    ipos, d, ipos, d)

        # perform the scalar operation on the input and output references
        #TODO: What if the scalar_op needs support_code??
        self.task_code(inputs, outputs, sio, nodename)
        print >> sio, "    }"

        #indent = " "*(4*d+7)
        #for ipos, i in enumerate(inputs):
            #print >> sio, indent, "const float * i%i" % ipos, '= i%i_data', ''
        print >> sio, "}"

        #print sio.getvalue()
        return sio.getvalue()
    def c_src_kernel_Ccontiguous(self, inputs, outputs,
                                 nodename, static="static"):
        nd = outputs[0].type.ndim
        sio = StringIO.StringIO()
        #print 'C_SRC_KERNEL', sio.getvalue()

        for ipos, i in enumerate(inputs):
            print >> sio, "// Input ", ipos, str(i.type)
        for ipos, i in enumerate(outputs):
            print >> sio, "// Output ", ipos, str(i.type)
        print >> sio, static, ("KERNEL void kernel_%s_Ccontiguous"
                               " (unsigned int numEls" % (nodename))
        for ipos, i in enumerate(inputs):
            print >> sio, "\t,", "GLOBAL_MEM const %s * i%i_data" % (
                dtype_to_ctype(i.dtype), ipos)
        for ipos, i in enumerate(outputs):
            print >> sio, "\t,", "GLOBAL_MEM %s * o%i_data" % (
                dtype_to_ctype(i.dtype), ipos)
        print >> sio, "\t)\n{"
        print >> sio, "    const int idx = GID_0 * LDIM_0 + LID_0;"
        print >> sio, "    const int numThreads = LDIM_0 * GDIM_0;"

        # For each input that is a scalar which has been broadcasted
        # to a tensor, load it into a local variable
        for ipos, i in enumerate(inputs):
            if _logical_scalar(i):
                print >> sio, "    const %s ii_i%i_value = i%i_data[0];" % (
                    dtype_to_ctype(i.dtype), ipos, ipos)

        # loop over the elements to be treated by this kernel call
        print >> sio, "    for (int i = idx; i < numEls; i += numThreads) {"
        # perform the scalar operation on the input and output references
        #TODO: What if the scalar_op needs support_code??
        self.task_code(inputs, outputs, sio, nodename,
                       iname=get_str_list_logical_scalar(
                           inputs, data_str='i%i_data[i]'),
                       oname=['o%i_data[i]' % ipos
                              for ipos, i in enumerate(outputs)])
        print >> sio, "    }"
        print >> sio, "}"

        #print sio.getvalue()
        return sio.getvalue()
    def c_src_callkernel(self, inputs, outputs, nodename):
        #
        # This function serves three main goals:
        #
        # The first is stride unpacking:
        # it accepts input and output arguments as
        #     float *, int, int, int ...
        # pairs, and it constructs a kernel function call where inputs
        # and arguments are named like
        #     float *, int, int, int ...
        #
        # The second is to recognize when any dimensions can be collapsed as
        # being contiguous. That means that we can merge those dimensions
        # with another one for all inputs/outputs and get the same results
        # (confusing... read code)
        #
        # The third is to make a special case for scalar elements. We allow
        # them to be collapsed. In both the c contiguous and the
        # non-contiguous cases, we use registers to lower the number of
        # memory accesses.

        # TODO: make a special case for broadcasting, to store the
        # data in shared memory.
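
        # A sketch of the collapsing in NumPy terms (illustrative only, not
        # used by the generated code): a C-contiguous float32 array of shape
        # (100, 50) has strides (200, 4). Since strides[1] * dims[1] ==
        # 4 * 50 == 200 == strides[0], the two dimensions can be merged and
        # the kernel can treat the data as a flat vector of 5000 elements,
        # saving one index computation per element.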
        nd = outputs[0].type.ndim
        nb_inputs = len(inputs)
        nb_outputs = len(outputs)

        # input_params and output_params go into the function
        # declaration/definition
        input_params = ", ".join("const %s * i%i_data, const int * i%i_str" % (
            dtype_to_ctype(inputs[ipos].dtype), ipos, ipos)
                                 for ipos in xrange(len(inputs)))
        output_params = ", ".join("%s * o%i_data, const int * o%i_str" % (
            dtype_to_ctype(outputs[ipos].dtype),
            ipos, ipos)
                                  for ipos in xrange(len(outputs)))

        # input_args and output_args go into the recursive call.
        input_args = ", ".join("i%i_data, i%i_str" % (ipos, ipos)
                               for ipos in xrange(len(inputs)))
        output_args = ", ".join("o%i_data, o%i_str" % (ipos, ipos)
                                for ipos in xrange(len(outputs)))

        prod_dims = '*'.join(["dims[%i]" % di for di in xrange(nd)] + ['1'])
        sio = StringIO.StringIO()
        print >> sio, """
        static void can_collapse_%(nodename)s(int nd, const int * dims,
                                              const int * strides,
                                              int collapse[])
        {
            //can we collapse dims[i] and dims[i-1]?
            for(int i=nd-1;i>0;i--){
                if(strides[i]*dims[i]==strides[i-1]){
                    //dim i-1 is contiguous with dim i, so they can be merged
                    collapse[i]=1;
                }else collapse[i]=0;
            }
        }

        static int callkernel_%(nodename)s(unsigned int numEls, const int d,
                                           const int * dims,
                                           %(input_params)s,
                                           %(output_params)s)
        {
            numEls = %(prod_dims)s;
        """ % locals()
        if self.verbose:
            print >> sio, """
            std::cerr << "calling kernel_%(nodename)s w numEls" << numEls << " dims"<< d << "\\n";
            """ % locals()
            print >> sio, 'std::cerr << ' + " << ' ' << ".join(['" "'] + list("dims[%i]" % di
                                                                              for di in xrange(nd)) + ["'\\n';"])
        if self.verbose > 1:
            for ipos in xrange(len(inputs)):
                print >> sio, """
                std::cerr << " %(ipos)s data strides" <<
                """ % locals() + " << ' ' << ".join(["i%s_data" % ipos]
                                                    + list("i%s_str[%i]" % (ipos, di)
                                                           for di in xrange(nd))) + ''' << "\\n"; '''
            for ipos in xrange(len(outputs)):
                print >> sio, """
                std::cerr << " %(ipos)s data strides" <<
                """ % locals() + " << ' ' << ".join(["o%s_data" % ipos]
                                                    + list("o%s_str[%i]" % (ipos, di)
                                                           for di in xrange(nd))) + ''' << "\\n"; '''

        # collapse dimensions that are broadcast in all inputs.
        # needs to be done before the contiguous collapse, as it will
        # break it.
        # do the dimensions and the strides
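
        # For instance, with dims == {10, 1, 20} and the middle dimension
        # broadcast in every input and output (stride 0 everywhere), the
        # generated code drops it and continues with dims == {10, 20}.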
        print >> sio, """
        int local_dims[%(nd)s];
        int local_str[%(nb_inputs)s][%(nd)s];
        int local_ostr[%(nb_outputs)s][%(nd)s];
        int nd_collapse = %(nd)s;
        for(int i=0;i<%(nd)s;i++){//init new dim
          local_dims[i]=dims[i];
        }
        """ % locals()
        for ipos in xrange(len(inputs)):
            print >> sio, """
            for(int i=0;i<%(nd)s;i++){//init new strides
              local_str[%(ipos)s][i]=i%(ipos)s_str[i];
            }
            """ % locals()
        for ipos in xrange(len(outputs)):
            print >> sio, """
            for(int i=0;i<%(nd)s;i++){//init new strides
              local_ostr[%(ipos)s][i]=o%(ipos)s_str[i];
            }
            """ % locals()

        if self.verbose > 2:
            print >>sio, 'std::cerr <<"before broadcast collapse\\n";'
            print >>sio, 'std::cerr<< "nd_collapse "<< nd_collapse << "\\n"; '
            print >> sio, 'std::cerr << "local_dims";'
            for d in xrange(nd):
                print >> sio, 'std::cerr << " " << local_dims[%(d)s]; ' % locals()
            print >> sio, 'std::cerr << "\\n";'

            for ipos in xrange(len(inputs)):
                print >> sio, ('std::cerr << " local_str inputs %(ipos)s: " <<' % locals() +
                               ' << " " << '.join(["local_str[%(ipos)s][%(x)s]" % locals()
                                                   for x in range(nd)]) + '<<"\\n";')
            for ipos in xrange(len(outputs)):
                print >> sio, ('std::cerr << " local_ostr inputs %(ipos)s: " <<' % locals() +
                               ' << " " << '.join(["local_ostr[%(ipos)s][%(x)s]" % locals()
                                                   for x in range(nd)]) + '<<"\\n";')

        print >> sio, """
        for(int id=0;id<nd_collapse;id++){

          bool all_broadcast=true;
          for(int input_id=0;input_id<%(nb_inputs)s;input_id++){
            if(local_str[input_id][id]!=0 || local_dims[id]!=1) all_broadcast= false;
          }
          for(int input_id=0;input_id<%(nb_outputs)s;input_id++){
            if(local_ostr[input_id][id]!=0 || local_dims[id]!=1) all_broadcast= false;
          }
          if(all_broadcast){
            for(int j=id+1;j<nd_collapse;j++)//remove dims i from the array
              local_dims[j-1]=local_dims[j];
            for(int input_id=0;input_id<%(nb_inputs)s;input_id++){
              for(int j=id+1;j<nd_collapse;j++){//remove dims i from the array
                local_str[input_id][j-1]=local_str[input_id][j];
              }
            }
            for(int output_id=0;output_id<%(nb_outputs)s;output_id++){
              for(int j=id+1;j<nd_collapse;j++){//remove dims i from the array
                local_ostr[output_id][j-1]=local_ostr[output_id][j];
              }
            }
            nd_collapse--; id--;
          }
        }
        """ % locals()

        if self.verbose > 2:
            print >>sio, 'std::cerr <<"after broadcast collapse\\n";'
            print >>sio, 'std::cerr<< "nd_collapse "<< nd_collapse << "\\n"; '
            print >> sio, 'std::cerr << "local_dims";'
            for d in xrange(nd):
                print >> sio, 'std::cerr << " " << local_dims[%(d)s]; ' % locals()
            print >> sio, 'std::cerr << "\\n";'

            for ipos in xrange(len(inputs)):
                print >> sio, ('std::cerr << " local_str %(ipos)s: " <<' % locals() +
                               ' << " " << '.join(["local_str[%(ipos)s][%(x)s]" % locals()
                                                   for x in range(nd)]) + '<<"\\n";')
            for ipos in xrange(len(outputs)):
                print >> sio, ('std::cerr << " local_ostr %(ipos)s: " <<' % locals() +
                               ' << " " << '.join(["local_ostr[%(ipos)s][%(x)s]" % locals()
                                                   for x in range(nd)]) + '<<"\\n";')

        # collapse contiguous dimensions (ignoring scalars; generic version
        # that can collapse dimensions anywhere: right, left, middle).
        # This is a good idea because we do fewer index calculations on the
        # gpu.
        print >> sio, "int nd_collapse_[%(nd)s] = {" % locals() + ','.join(['1' for x in range(nd)]) + "};"
        for ipos in xrange(len(inputs)):
            if not _logical_scalar(inputs[ipos]):
                print >> sio, """
                int nd_collapse_%(ipos)s[%(nd)s] = {""" % locals() + ','.join(['1' for x in range(nd)]) + "};"
                print >> sio, """
                can_collapse_%(nodename)s(nd_collapse, local_dims, local_str[%(ipos)s], nd_collapse_%(ipos)s);
                for(int i=0;i<nd_collapse;i++){
                  if(nd_collapse_%(ipos)s[i]==0)
                    nd_collapse_[i]=0;
                }
                """ % locals()
                if self.verbose > 1:
                    print >>sio, """
                    std::cerr<< "nd_collapse_%(ipos)s "<<
                    """ % locals()
                    print >>sio, ' << " " << '.join(
                        ["nd_collapse_%(ipos)s[" % locals() + str(i) + "]"
                         for i in range(nd)])
                    print >>sio, '<< "\\n";'

        if self.verbose > 1:
            print >>sio, """
            std::cerr<< "nd_collapse_ "<<
            """ % locals()
            print >>sio, ' << " " << '.join(
                ["nd_collapse_[" % locals() + str(i) + "]"
                 for i in range(nd)])
            print >>sio, '<< "\\n";'

        # update the local stride.
        for ipos in xrange(len(inputs)):
            print >> sio, """
            for(int i=nd_collapse-1;i>0;i--){
              if(nd_collapse_[i]==1){
                local_str[%(ipos)s][i-1]=local_str[%(ipos)s][i];//set new strides
                for(int j=i+1;j<nd_collapse;j++)//remove stride i from the array
                  local_str[%(ipos)s][j-1]=local_str[%(ipos)s][j];
              }
            }
            """ % locals()

        for ipos in xrange(len(outputs)):
            print >> sio, """
            for(int i=nd_collapse-1;i>0;i--){
              if(nd_collapse_[i]==1){
                local_ostr[%(ipos)s][i-1]=local_ostr[%(ipos)s][i];//set new strides
                for(int j=i+1;j<nd_collapse;j++)//remove stride i from the array
                  local_ostr[%(ipos)s][j-1]=local_ostr[%(ipos)s][j];
              }
            }
            """ % locals()

        # update the local dims.
        print >> sio, """
        for(int i=nd_collapse-1;i>0;i--){
          if(nd_collapse_[i]==1){
            local_dims[i-1]*=local_dims[i];//set new dims
            for(int j=i+1;j<nd_collapse;j++)//remove dims i from the array
              local_dims[j-1]=local_dims[j];
          }
        }
        """ % locals()

        # update the new number of dims
        print >> sio, """
        for(int i=1, end=nd_collapse;i<end;i++){
          if(nd_collapse_[i]==1)nd_collapse--;
        }
        """ % locals()

        # if the only remaining dimension is contiguous for every
        # non-scalar input/output, use the c contiguous special case
        print >> sio, """
        if(nd_collapse == 1 """ % locals()
        l = ["local_str[%(ipos)s][nd_collapse-1]==1 " % locals()
             for ipos in range(len(inputs))
             if not _logical_scalar(inputs[ipos])]
        l += ["local_ostr[%(ipos)s][nd_collapse-1]==1 " % locals()
              for ipos in range(len(outputs))
              if not _logical_scalar(outputs[ipos])]
        if len(l) > 0:
            print >> sio, " && ", " && ".join(l)
        print >> sio, """){nd_collapse=0;} """

        if self.verbose:
            print >> sio, 'std::cerr <<"after can_collapse\\n";'
            print >> sio, """std::cerr << "nd_collapse " << nd_collapse << "\\n"; """ % locals()
        if self.verbose > 1:
            for d in xrange(nd):
                print >> sio, 'std::cerr << " " << local_dims[%(d)s]; ' % locals()
            print >> sio, 'std::cerr << "\\n";'

            for ipos in xrange(len(inputs)):
                print >> sio, ('std::cerr << " local_str %(ipos)s: " <<' %
                               locals() + ' << " " << '.join(
                                   ["local_str[%(ipos)s][%(x)s]" % locals()
                                    for x in range(nd)]) + '<<"\\n";')
            for ipos in xrange(len(outputs)):
                print >> sio, ('std::cerr << " local_ostr %(ipos)s: " <<' %
                               locals() + ' << " " << '.join(
                                   ["local_ostr[%(ipos)s][%(x)s]" % locals()
                                    for x in range(nd)]) + '<<"\\n";')
        def launch_Ccontiguous(nodename, scalar_op):
            kernel_call_args = ["numEls"]
            for ipos in xrange(len(inputs)):
                kernel_call_args.append("i%i_data" % ipos)
            for ipos in xrange(len(outputs)):
                kernel_call_args.append("o%i_data" % ipos)
            kernel_call_args = ", ".join(kernel_call_args)
            verb = ""
            if self.verbose:
                verb = 'std::cerr << " Running ccontiguous version\\n";'
            print >> sio, """
                //first use at least a full warp
                int threads_per_block = std::min(numEls, (unsigned int)32); //WARP SIZE

                //next start adding multiprocessors
                int n_blocks = std::min(numEls/threads_per_block + (numEls %% threads_per_block?1:0), (unsigned int)30); // UP TO NUMBER OF MULTIPROCESSORS

                // next start adding more warps per multiprocessor
                if (threads_per_block * n_blocks < numEls)
                    threads_per_block = std::min(numEls/n_blocks, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
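
                // Worked example of this heuristic, assuming numEls == 10000
                // and NUM_VECTOR_OP_THREADS_PER_BLOCK == 256:
                //   threads_per_block = min(10000, 32) = 32
                //   n_blocks = min(312 + 1, 30) = 30
                //   32 * 30 = 960 < 10000, so threads_per_block is raised
                //   to min(10000 / 30, 256) = 256.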
                %(verb)s
                kernel_%(nodename)s_Ccontiguous<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s);

                //std::cerr << "calling callkernel returned\\n";
                cudaError_t err = cudaGetLastError();
                if( cudaSuccess != err)
                {
                    PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n n_blocks=%%i threads_per_block=%%i\\n Call: %%s\\n",
                                 "GpuElemwise %(nodename)s", cudaGetErrorString(err),
                                 n_blocks, threads_per_block,
                                 "kernel_%(nodename)s_Ccontiguous<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s)");
                    return -1;
                }
                return 0;
                """ % locals()
        def launch_General(nodename, scalar_op, force_nd):
            # kernel_call_args are used to invoke the cuda kernel
            local = "local_"
            kernel_call_args = ["numEls"]
            kernel_call_args.extend(local + "dims[%i]" % di
                                    for di in xrange(force_nd))
            for ipos in xrange(len(inputs)):
                kernel_call_args += ["i%i_data" % ipos] + list(
                    local + "str[%i][%i]" % (ipos, di)
                    for di in xrange(force_nd))
                #strides = ", ".join("i%i_str[%i]"%(ipos, di) for di in xrange(force_nd))
                #kernel_call_args.append( "%s, i%i_data" % (strides, ipos))
            for ipos in xrange(len(outputs)):
                kernel_call_args += ["o%i_data" % ipos] + list(
                    local + "ostr[%i][%i]" % (ipos, di)
                    for di in xrange(force_nd))
                #strides = ", ".join("o%i_str[%i]"%(ipos, di) for di in xrange(force_nd))
                #kernel_call_args.append( "%s, o%i_data" % (strides, ipos))
            if self.verbose:
                print >> sio, """
                    std::cerr << " Running general version with %(force_nd)s dims\\n";
                    """ % locals()
                print >> sio, "std::cerr << " + ' << " " << '.join(
                    kernel_call_args) + ' << "\\n";'
                #std::cerr << numEls << dims[0] << i0_data, i0_str[0] << o0_data, o0_str[0]\n;

            kernel_call_args = ", ".join(kernel_call_args)

            print >> sio, """
                //first use at least a full warp
                int threads_per_block = std::min(numEls, (unsigned int)32); //WARP SIZE

                //next start adding multiprocessors
                int n_blocks = std::min(numEls/threads_per_block + (numEls %% threads_per_block?1:0), (unsigned int)30); // UP TO NUMBER OF MULTIPROCESSORS

                // next start adding more warps per multiprocessor
                if (threads_per_block * n_blocks < numEls)
                    threads_per_block = std::min(numEls/n_blocks, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);

                kernel_%(nodename)s_%(force_nd)s<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s);

                cudaError_t err = cudaGetLastError();
                if( cudaSuccess != err)
                {
                    PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n n_blocks=%%i threads_per_block=%%i\\n Call: %%s\\n",
                                 "GpuElemwise %(nodename)s", cudaGetErrorString(err),
                                 n_blocks, threads_per_block,
                                 "kernel_%(nodename)s_%(force_nd)s<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s)");
                    return -1;
                }
                return 0;
                """ % locals()
        print >> sio, "if(numEls==0) return 0;"
        print >> sio, "switch (nd_collapse==0?0:min(%(nd)s,nd_collapse)) {" % locals()
        print >> sio, "case 0: {"
        launch_Ccontiguous(nodename, scalar_op)
        print >> sio, "        } break;"
        for i in range(1, nd + 1):
            print >> sio, "case " + str(i) + ": {"
            launch_General(nodename, scalar_op, i)
            print >> sio, "        } break;"

        print >> sio, "}"  # end case
        print >> sio, "return -2;"  # should not get to this point
        print >> sio, "}"  # end fct

        #N.B. cudaGetLastError is called by c_code
        return sio.getvalue()
    def c_support_code_apply(self, inputs, outputs, nodename):
        nd = outputs[0].type.ndim
        return "".join(
            [self.c_src_kernel(inputs, outputs, nodename, x)
             for x in range(1, nd + 1)] +
            [self.c_src_kernel_Ccontiguous(inputs, outputs, nodename),
             self.c_src_callkernel(inputs, outputs, nodename),
             ])
    def c_code(self, ninputs, noutputs, nodename, inputs, outputs, sub):
        nd = noutputs[0].type.ndim
        fail = sub['fail']
        sio = StringIO.StringIO()
        opname = str(self.scalar_op)
        initial_dims = ','.join('1' for i in xrange(nd))
        if 1 or self.scalar_op == scalar.pow:
            print >> sio, """
        //std::cerr << "C_CODE %(opname)s START\\n";
        //standard elemwise size checks
            """ % locals()
        print >> sio, """
        int dims[%(nd)s] = {%(initial_dims)s};
        """ % locals()

        # emit the broadcast arrays for all inputs
        emitted_inames = {}
        for id, iname in enumerate(inputs):
            if iname in emitted_inames:
                assert emitted_inames[iname] is ninputs[id]
                continue
            broadcasts = ', '.join(map(str, map(int,
                                                ninputs[id].broadcastable)))
            nd = ninputs[id].ndim
            print >> sio, """
        int broadcasts_%(iname)s[%(nd)s] = {%(broadcasts)s};
        """ % locals()
            emitted_inames[iname] = ninputs[id]
        # check that all inputs have valid dimensions
        emitted_inames = {}
        for id, iname in enumerate(inputs):
            if iname in emitted_inames:
                continue
            print >> sio, """
        //std::cerr << "C_CODE %(opname)s checking input %(iname)s\\n";
        if (%(nd)s != %(iname)s->nd)
        {
            PyErr_Format(PyExc_TypeError, "need %(nd)s dims, not %%i", %(iname)s->nd);
            %(fail)s;
        }
        for (int i = 0; i< %(nd)s; ++i)
        {
            dims[i] = (dims[i] == 1) ? CudaNdarray_HOST_DIMS(%(iname)s)[i] : dims[i];
            if ((!(broadcasts_%(iname)s[i] && CudaNdarray_HOST_DIMS(%(iname)s)[i] == 1)) && (dims[i] != CudaNdarray_HOST_DIMS(%(iname)s)[i]))
            {
                //std::cerr << "C_CODE %(opname)s checking input %(iname)s failed\\n";
                PyErr_Format(PyExc_ValueError, "GpuElemwise. Input dimension mis-match. One of your inputs has shape[%%i] == %%i, but the output's size on that axis is %%i.",
                             i,
                             CudaNdarray_HOST_DIMS(%(iname)s)[i],
                             dims[i]);
                %(fail)s;
            }
        }
        """ % locals()
            emitted_inames[iname] = True
        # check that all outputs have valid dimensions
        for idx, oname in enumerate(outputs):
            if idx not in self.inplace_pattern.keys():
                print >> sio, """
        for (int i = 0; (i< %(nd)s) && (%(oname)s); ++i) {
            if (dims[i] != CudaNdarray_HOST_DIMS(%(oname)s)[i])
            {
                Py_DECREF(%(oname)s);
                %(oname)s = NULL;
            }
        }
        if (NULL == %(oname)s)
        {
            %(oname)s = (CudaNdarray*)CudaNdarray_New();
            if (!%(oname)s)
            {
                //error string already set
                %(fail)s;
            }
            if (CudaNdarray_alloc_contiguous(%(oname)s, %(nd)s, dims))
            {
                //error string already set
                Py_DECREF(%(oname)s);
                %(oname)s = NULL;
                %(fail)s;
            }
        }
        //std::cerr << "ELEMWISE NEW %(oname)s nd" << %(oname)s->nd << "\\n";
        //std::cerr << "ELEMWISE NEW %(oname)s data" << %(oname)s->devdata << "\\n";
        """ % locals()
            else:
                input_idx = self.inplace_pattern[idx]
                iname = inputs[input_idx]
                print >> sio, """
        Py_XDECREF(%(oname)s);
        %(oname)s = %(iname)s;
        Py_INCREF(%(oname)s);
        for (int i = 0; (i< %(nd)s) && (%(oname)s); ++i) {
            if (dims[i] != CudaNdarray_HOST_DIMS(%(oname)s)[i])
            {
                Py_DECREF(%(oname)s);
                %(oname)s = NULL;
                %(fail)s;
            }
        }
        //std::cerr << "ELEMWISE NEW %(oname)s nd" << %(oname)s->nd << "\\n";
        //std::cerr << "ELEMWISE NEW %(oname)s data" << %(oname)s->devdata << "\\n";
        """ % locals()
        print >> sio, """
        {
            //new block so that failure gotos don't skip over variable initialization
            //std::cerr << "calling callkernel\\n";
            if (callkernel_%(nodename)s(1, 0, dims
        """ % locals()
        for iname in inputs:
            print >> sio, """
                        , CudaNdarray_DEV_DATA(%(iname)s), CudaNdarray_HOST_STRIDES(%(iname)s)
            """ % locals()
        for oname in outputs:
            print >> sio, """
                        , CudaNdarray_DEV_DATA(%(oname)s), CudaNdarray_HOST_STRIDES(%(oname)s)
            """ % locals()
        print >> sio, """
                        ))
            {
                // error
        """ % locals()
        for oname in outputs:
            print >> sio, """
                Py_DECREF(%(oname)s);
                %(oname)s = NULL;
            """ % locals()
        print >> sio, """
                %(fail)s;
            }
        }
        //std::cerr << "C_CODE %(opname)s END\\n";
        """ % locals()
        #print sio.getvalue()
        return sio.getvalue()
    def c_support_code(self):
        return """
        #define INTDIV_POW2(a, b) (a >> b)
        #define INTMOD_POW2(a, b) (a & ((1<<b)-1))
        """
def dummy_holder_for_code_not_used():

    def c_src_kernel_tiling(self, inputs, outputs, nodename):
        """ The kernel applies to problems with <= 5 dimensions """

        # The kernel is intended to be structured roughly like this:
        """
        static __global__ void kernel()
        {
            for (int v = blockIdx.x; v < dim0; v += gridDim.x)
            {
                for (int w = blockIdx.y; w < dim1; w += gridDim.y)
                {
                    for (int x = threadIdx.x; x < dim2; x += blockDim.x)
                    {
                        for (int y = threadIdx.y; y < dim3; y += blockDim.y)
                        {
                            for (int z = threadIdx.z; z < dim4; z += blockDim.z)
                            {
                                out[v * out_stride[0] + ...] = f(in1[...], in2[...])
                            }
                        }
                    }
                }
            }
        }
        """
        nd = outputs[0].type.ndim
        sio = StringIO.StringIO()
        #print 'C_SRC_KERNEL', sio.getvalue()

        # print some leading comments to make the code easier to read
        for ipos, i in enumerate(inputs):
            print >> sio, "// Input ", ipos, str(i.type)
        for ipos, i in enumerate(outputs):
            print >> sio, "// Output ", ipos, str(i.type)
        print >> sio, """static __global__ void kernel_%s_%s(
        unsigned int numEls""" % (
            nodename,
            'tiling%i' % nd)
        print >> sio, "\t,", ", ".join("const int dim%i" % i
                                       for i in xrange(nd))
        # declare the input data and strides
        for ipos, i in enumerate(inputs):
            s = ", ".join(["const float * i%i_data" % ipos] + list(
                "int i%i_str_%i" % (ipos, d) for d in xrange(nd)))
            print >> sio, "\t,", s
        # declare the output data and strides
        for ipos, i in enumerate(outputs):
            s = ", ".join(["float * o%i_data" % ipos] + list(
                "int o%i_str_%i" % (ipos, d) for d in xrange(nd)))
            print >> sio, "\t,", s
        #print >> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d) for d in xrange(nd))
        #print >> sio, "\t,", "float * o%i_data" % ipos
        print >> sio, "\t)\n{"
        # For each input that is a scalar which has been broadcasted to a
        # tensor, load it into a local variable
        print >> sio, "    __shared__ float value0[%i];" % len(inputs)
        print >> sio, "    __shared__ int shared_dims[%(nd)s];" % locals()
        #print >> sio, "    __shared__ int shared_i_str[%(n_in)s][%(nd)s]"
        print >> sio, "    if ((threadIdx.x == 0) && (threadIdx.y == 0)) {"
        for ipos, i in enumerate(inputs):
            if _logical_scalar(i):
                print >> sio, "    value0[%i] = i%i_data[0];" % (ipos,
                                                                 ipos)
        for ipos in xrange(nd):
            print >> sio, "    shared_dims[%i] = dim%i;" % (ipos, ipos)
        print >> sio, "    }"
        print >> sio, "    __syncthreads();"
        if nd == 4:
            print >> sio, """
            for (int pos0 = blockIdx.x; pos0 < shared_dims[0]; pos0 += gridDim.x)
            {
                for (int pos1 = blockIdx.y; pos1 < shared_dims[1]; pos1 += gridDim.y)
                {
                    //for (int pos2 = threadIdx.x; pos2 < shared_dims[2]; pos2 += blockDim.x)
                    for (int pos2 = threadIdx.y; pos2 < shared_dims[2]; pos2 += blockDim.y)
                    {
                        //for (int pos3 = threadIdx.y; pos3 < shared_dims[3]; pos3 += blockDim.y)
                        for (int pos3 = threadIdx.x; pos3 < shared_dims[3]; pos3 += blockDim.x)
                        {
            """
        else:
            raise NotImplementedError()
        for ipos, i in enumerate(inputs):
            if not _logical_scalar(i):
                print >> sio, "        const float * ii_i%i_data = i%i_data;" % (ipos, ipos)
        for ipos, i in enumerate(outputs):
            print >> sio, "        float * ii_o%i_data = o%i_data;" % (ipos, ipos)
        for d in xrange(nd):
            for ipos, i in enumerate(inputs):
                if not _logical_scalar(i):
                    print >> sio, "        ii_i%i_data += pos%i * i%i_str_%i;" % (ipos, d, ipos, d)
            for ipos, i in enumerate(outputs):
                print >> sio, "        ii_o%i_data += pos%i * o%i_str_%i;" % (ipos, d, ipos, d)

        # perform the scalar operation on the input and output references
        #TODO: What if the scalar_op needs support_code??
        self.task_code(inputs, outputs, sio, nodename,
                       iname=get_str_list_logical_scalar(
                           inputs, value_str='value0[%i]'))
        print >> sio, "    }" * nd

        #TODO: insert runtime stride checks that select the best loop order
        #      either here, or in the host code that launched the kernel
        #      (host code probably better spot)

        #indent = " "*(4*d+7)
        #for ipos, i in enumerate(inputs):
            #print >> sio, indent, "const float * i%i" % ipos, '= i%i_data', ''
        print >> sio, "}"

        return sio.getvalue()
    def c_src_kernel_tiling_less_registers(self, inputs, outputs, nodename):
        """ The kernel applies to problems with <= 5 dimensions """

        nd = outputs[0].type.ndim
        n_in = len(inputs)
        n_out = len(outputs)
        sio = StringIO.StringIO()

        if nd not in (2,):
            return sio.getvalue()

        # print some leading comments to make the code easier to read
        for ipos, i in enumerate(inputs):
            print >> sio, "// Input ", ipos, str(i.type)
        for ipos, i in enumerate(outputs):
            print >> sio, "// Output ", ipos, str(i.type)
        print >> sio, "static __global__ void kernel_%s_%s(unsigned int numEls" % (
            nodename,
            'tiling%i_less_registers' % nd)
        print >> sio, "\t,", ", ".join("const int dim%i" % i
                                       for i in xrange(nd))
        # declare the input data and strides
        for ipos, i in enumerate(inputs):
            s = ", ".join(["const float * i%i_data_0" % ipos] + list(
                "int i%i_str_%i" % (ipos, d) for d in xrange(nd)))
            print >> sio, "\t,", s
        # declare the output data and strides
        for ipos, i in enumerate(outputs):
            s = ", ".join(["float * o%i_data_0" % ipos] + list(
                "int o%i_str_%i" % (ipos, d) for d in xrange(nd)))
            print >> sio, "\t,", s
        #print >> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d) for d in xrange(nd))
        #print >> sio, "\t,", "float * o%i_data" % ipos
        print >> sio, "\t)\n{"

        # TODO: Setting these to true makes the function fail SOMETIMES.
        #       I don't know why yet.
        use_shared_stride = False
        use_shared_limits = False
        if use_shared_limits:
            print >> sio, "__shared__ float * limits[%(nd)s];" % locals()

        def stride(io, p, d):
            if use_shared_stride:
                return "s%s_str[%i][%i]" % (io, p, d)
            else:
                return "%s%i_str_%i" % (io, p, d)

        def limits(d):
            if use_shared_limits:
                return "limits[%i]" % d
            else:
                return "limits%i" % d
        def decl_shared_stride(nin, nout, nd):
            if not use_shared_stride:
                return
            print >> sio, """
            __shared__ int si_str[%(nin)s][%(nd)s];
            __shared__ int so_str[%(nout)s][%(nd)s];
            if ((threadIdx.x == 0) && (threadIdx.y == 0)) {
            """ % locals()
            for i in xrange(nin):
                for d in xrange(nd):
                    print >> sio, "si_str[%(i)s][%(d)s] = i%(i)s_str_%(d)s;" % locals()
            for i in xrange(nout):
                for d in xrange(nd):
                    print >> sio, "so_str[%(i)s][%(d)s] = o%(i)s_str_%(d)s;" % locals()
            print >> sio, "} __syncthreads();"
        def calc_limit(d):
            s = stride('o', 0, d)
            lname = limits(d)
            if use_shared_limits:
                print >> sio, "if ((threadIdx.x == 0) && (threadIdx.y == 0)) {"
                if d == 0:
                    print >> sio, "%(lname)s = o0_data_0 + dim%(d)s * %(s)s;" % locals()
                else:
                    dm1 = d - 1
                    print >> sio, "%(lname)s = o0_data_%(dm1)s + dim%(d)s * %(s)s;" % locals()
                print >> sio, "} __syncthreads();"
            else:
                if d == 0:
                    print >> sio, "const float * %(lname)s = o0_data_0 + dim%(d)s * %(s)s;" % locals()
                else:
                    dm1 = d - 1
                    print >> sio, "const float * %(lname)s = o0_data_%(dm1)s + dim%(d)s * %(s)s;" % locals()
        def decl_ptrs(d, offset):
            dm1 = d - 1
            assert dm1 >= 0
            for i in xrange(n_in):
                s = stride('i', i, d)
                print >> sio, "const float * i%(i)s_data_%(d)s = i%(i)s_data_%(dm1)s + %(offset)s * %(s)s;" % locals()
            for i in xrange(n_out):
                s = stride('o', i, d)
                print >> sio, "float * o%(i)s_data_%(d)s = o%(i)s_data_%(dm1)s + %(offset)s * %(s)s;" % locals()

        def inc_ptrs(d, amt):
            for i in xrange(n_in):
                s = stride('i', i, d)
                print >> sio, "i%(i)s_data_%(d)s += %(amt)s * %(s)s;" % locals()
            for i in xrange(n_out):
                s = stride('o', i, d)
                print >> sio, "o%(i)s_data_%(d)s += %(amt)s * %(s)s;" % locals()
        def while_limit(d):
            lname = limits(d)
            print >> sio, "while (o0_data_%(d)s < %(lname)s) { " % locals()

        def end_while(d):
            print >> sio, "}"

        def task_code(d):
            self.task_code(inputs, outputs, sio, nodename,
                           iname=['i%i_data_%i[0]' % (ipos, d)
                                  for ipos, i in enumerate(inputs)],
                           oname=['o%i_data_%i[0]' % (ipos, d)
                                  for ipos, i in enumerate(outputs)])

        if nd == 4:
            decl_shared_stride(n_in, n_out, nd)
            calc_limit(0)
            inc_ptrs(0, 'blockIdx.x')
            while_limit(0)
            calc_limit(1)
            decl_ptrs(1, 'blockIdx.y')
            while_limit(1)
            calc_limit(2)
            decl_ptrs(2, 'threadIdx.y')
            while_limit(2)
            calc_limit(3)
            decl_ptrs(3, 'threadIdx.x')
            while_limit(3)
            task_code(3)
            inc_ptrs(3, 'blockDim.x')
            end_while(3)
            inc_ptrs(2, 'blockDim.y')
            end_while(2)
            inc_ptrs(1, 'gridDim.y')
            end_while(1)
            inc_ptrs(0, 'gridDim.x')
            end_while(0)

        print sio.getvalue()
        return sio.getvalue()
def elemwise_collapses(inputs, outputs, out_shape=None, verbose=0):
    """
    This collapses dimensions that are not needed when computing an
    elemwise. This is useful as it lowers the indexing computation,
    which is heavier on the GPU than on the CPU.

    This is a generic version. It collapses dimensions at any place in
    the shape. It handles broadcasted dimensions correctly.

    There is no special handling needed for broadcasted scalars at this
    level.

    @return: ndims, tuple(dims, strides) after collapsing.
    """
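    # Illustrative behaviour, based on the code below: if every input and
    # output is C contiguous with the same shape, the fast path returns
    # (0, (shape, [])). Pairing a (200, 300) input with a broadcast
    # (200, 1) input keeps nd_collapse == 2, because the stride-0 broadcast
    # dimension prevents merging the two axes.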
    in_out = inputs + outputs

    if out_shape is not None:
        local_dims = tuple(out_shape)
    else:
        # TODO, use the right algo here or make the parameter not optional
        # We should always have the same shape for all outputs
        # If there is more than one output
        local_dims = tuple(outputs[0].shape)

    nd_orig = len(local_dims)

    # This has lower overhead
    all_c_contig = True
    for inp in in_out:
        if not inp.flags['C_CONTIGUOUS'] or inp.shape != local_dims:
            all_c_contig = False
            break
    if all_c_contig:
        return 0, (local_dims, [])

    collapsable = [1] * nd_orig

    local_str = [None] * len(in_out)
    nd_collapse = nd_orig
    for ipos in xrange(len(in_out)):
        inp = in_out[ipos]
        assert len(inp.shape) == nd_orig, (
            "All inputs/outputs must have the same number of dimensions."
            " You must broadcast before calling elemwise_collapse")
        local_str[ipos] = list(inp.strides)
        # We set the strides of broadcastable dims to 0
        # This makes indexing on the gpu simpler and is needed
        # for collapsing the dimensions.
        for dim_pos in range(inp.ndim):
            if inp.shape[dim_pos] == 1:
                local_str[ipos][dim_pos] = 0

    if nd_orig == 1:
        # We already covered the contiguous case before,
        # so we are sure it is not contiguous.
        # TODO: Add a test that f contiguous arrays are also collapsed by
        #       the first case. I think that for 1d arrays, when the
        #       f contiguous flag is true, c contiguous is also true.
        return 1, (local_dims, local_str)
    if verbose:
        print "before broadcast collapse"
        print " nd_collapse", nd_collapse
        print " local_dims", local_dims
        for ipos in xrange(len(local_str)):
            print " local_str inputs", ipos, local_str[ipos]

    local_dims = list(local_dims)
    # Collapse dimensions that are broadcast in all inputs.
    # Needs to be done before the contiguous collapse, as it will break it.
    # Update the dimensions and the strides
    for id in range(nd_collapse):
        if local_dims[id] == 1:
            # remove dim i from the array
            for j in range(id + 1, nd_collapse):
                local_dims[j - 1] = local_dims[j]
            # remove strides i from the array
            for input_id in range(len(in_out)):
                for j in range(id + 1, nd_collapse):
                    local_str[input_id][j - 1] = local_str[input_id][j]
            nd_collapse -= 1
            id -= 1  # TODO: what is this? How does this work?

    if verbose:
        print "after broadcast collapse"
        print " nd_collapse", nd_collapse
        print " local_dims", local_dims
        for ipos in xrange(len(local_str)):
            print " local_str inputs", ipos, local_str[ipos]
    nd_collapse_ = [1] * nd_orig
    for ipos in xrange(len(local_str)):
        # Can we collapse dims[i] and dims[i-1]?
        strides = local_str[ipos]
        for i in range(nd_collapse - 1, 0, -1):
            if strides[i] * local_dims[i] != strides[i - 1]:
                # dims i-1 and i are not contiguous with each other,
                # so we cannot collapse them
                nd_collapse_[i] = 0

    if verbose:
        print "nd_collapse_", nd_collapse_

    nd_collapse2 = nd_collapse
    for i in range(nd_collapse - 1, 0, -1):
        if nd_collapse_[i] == 1:
            # update the local dims.
            local_dims[i - 1] *= local_dims[i]
            for j in range(i + 1, nd_collapse):
                local_dims[j - 1] = local_dims[j]

            # update the local strides.
            for ipos in xrange(len(local_str)):
                local_str[ipos][i - 1] = local_str[ipos][i]  # set new strides
                # remove stride i from the array
                for j in range(i + 1, nd_collapse):
                    local_str[ipos][j - 1] = local_str[ipos][j]

            # update the new number of dims
            nd_collapse2 -= 1
    nd_collapse = nd_collapse2

    if nd_collapse == 1:
        l = [local_str[ipos][nd_collapse - 1] == in_out[ipos].itemsize
             for ipos in range(len(local_str))]
        if all(l):
            # Everything is contiguous: use the c contiguous special case.
            return 0, (local_dims, [])

    if verbose:
        print "end collapsing"
        print " nd_collapse", nd_collapse
        print " local_dims", local_dims
        for ipos in xrange(len(local_str)):
            print " local_str inputs", ipos, local_str[ipos]

    return nd_collapse, (local_dims, local_str)
def reduction_collapses(inout, axis, verbose=0):
    """
    This collapses dimensions that are not needed when computing a
    reduction. This is useful as it lowers the indexing computation,
    which is heavier on the GPU than on the CPU.

    This is a generic version. It collapses dimensions at any place in
    the shape.

    @param: inout: tuple(input, output)
    @param: axis: None, an integer, or a list of 1 integer
        The axis over which we will do the reduction.
    @return: (ndims, (input dims, input strides, input pattern), out strides)
        after collapsing.

    :note: we suppose that we can always collapse the output dimensions.
    """
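    # Example of the quick path below: for a contiguous 1d input,
    #   reduction_collapses((inp, out), 0)
    # returns (0, ((inp.size,), (inp.itemsize,), 0), (0,)).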
    input = inout[0]
    out = inout[1]

    if input.ndim == 1:
        # Some quick checks. They are faster than the full version.
        # The output size is always 1, so we don't care about its strides
        if (input.flags['C_CONTIGUOUS'] or input.flags['F_CONTIGUOUS']):
            return 0, ((input.size,), (input.itemsize,), axis), (0,)
        assert axis == [0] or axis == 0 or axis is None
        # not c contiguous, as the first if should have caught it.
        return 1, (input.shape, input.strides, axis), (0,)

    if not isinstance(axis, (list, tuple)):
        local_axis = [axis]
    else:
        local_axis = list(axis)

    # This is needed for computing the output strides
    assert axis is None or len(local_axis) == 1

    local_dims = list(input.shape)
    local_str = list(input.strides)
    out_strides = list(out.strides)

    nd_orig = len(local_dims)
    collapsable = [1] * nd_orig
    nd_collapse = nd_orig
    if verbose:
        print "before broadcast collapse"
        print " nd_collapse", nd_collapse
        print " local_dims", local_dims
        print " local_str inputs", local_str
        print " local_axis", local_axis

    # Collapse dimensions that are broadcast in all inputs.
    # Needs to be done before the contiguous collapse, as it will break it.
    # Update the dimensions and the strides
    for id in range(nd_collapse):
        if local_dims[id] == 1:
            for j in range(id + 1, nd_collapse):
                # remove dims i from the array
                local_dims[j - 1] = local_dims[j]
                # remove strides i from the array
                local_str[j - 1] = local_str[j]
                # remove output strides i from the array
                if axis is not None:
                    out_strides[j - 2] = out_strides[j - 1]
            if id in local_axis:
                local_axis.remove(id)
            for axis_pos in range(len(local_axis)):
                if local_axis[axis_pos] > id:
                    local_axis[axis_pos] -= 1

            nd_collapse -= 1
            id -= 1  # TODO: how does this work?

    if verbose:
        print "after broadcast collapse"
        print " nd_collapse", nd_collapse
        print " local_dims", local_dims
        print " local_str inputs", local_str
        print " local_axis", local_axis
        print " out_strides", out_strides
    nd_collapse_ = [1] * nd_orig
    # Can we collapse dims[i] and dims[i-1]?
    for i in range(nd_collapse - 1, 0, -1):
        if ((local_str[i] * local_dims[i] != local_str[i - 1])):
            # dims i-1 and i are not contiguous with each other,
            # so we cannot collapse them
            nd_collapse_[i] = 0
        elif (i in local_axis) != ((i - 1) in local_axis):
            # Never collapse a reduced dimension with a kept one.
            nd_collapse_[i] = 0

    if verbose:
        print "nd_collapse_", nd_collapse_

    nd_collapse2 = nd_collapse
    for i in range(nd_collapse - 1, 0, -1):
        if nd_collapse_[i] == 1:
            # update the local dims.
            local_dims[i - 1] *= local_dims[i]
            # set the new strides
            local_str[i - 1] = local_str[i]
            # remove the old dims and strides
            for j in range(i + 1, nd_collapse):
                local_dims[j - 1] = local_dims[j]
                local_str[j - 1] = local_str[j]
                if axis is not None:
                    out_strides[j - 2] = out_strides[j - 1]

            if i in local_axis:
                local_axis.remove(i)
            for axis_pos in range(len(local_axis)):
                if local_axis[axis_pos] > i:
                    local_axis[axis_pos] -= 1

            # update the new number of dims
            nd_collapse2 -= 1
    nd_collapse = nd_collapse2

    if nd_collapse == 1:
        if local_str[nd_collapse - 1] == input.itemsize:
            # Fully contiguous after collapsing: use the contiguous case.
            return 0, ((input.size,), (input.itemsize,), axis), (0,)

    if verbose:
        print "end collapsing"
        print " nd_collapse", nd_collapse
        print " local_dims", local_dims
        print " local_str inputs", local_str
        print " local_axis", local_axis
        print " out_strides", out_strides

    #print input.shape, input.strides
    #print nd_collapse, (local_dims, local_str, local_axis)
    local_dims = local_dims[:nd_collapse]
    local_str = local_str[:nd_collapse]
    out_strides = out_strides[:nd_collapse]
    return nd_collapse, (local_dims, local_str, local_axis), out_strides
def call_elemwise(fct, input_vals, block=None, grid=None, out=None,
                  out_shape=None, strides=None):
    """ Call an elemwise gpu function with given inputs and block size.

    :param fct: The gpu function to call
    :param input_vals: a list of inputs to pass to fct
    :param block: int, the size of the block wanted
    :param grid: int, the size of the grid wanted
    :param out: Optional, the preallocated output. Must have the right shape
        and dtype.
    :param out_shape: Optional, if provided, we will suppose that the output
        has this shape even if it is not true.
    :param strides: Optional, if provided, we will use those strides for
        the inputs and outputs.

    :note: params out_shape and strides are used for the collapsing of
        dimensions.
    """
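    # Hypothetical usage (names from this file; `add_fct` stands for a kernel
    # compiled with compile_gpu_code for two inputs of the same shape):
    #   out = call_elemwise(add_fct, [a, b])
    # When block/grid are not given, they are derived from out.size below.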
    inp = input_vals[0]

    # Get the output and output shape to use
    if out_shape is None and out is None:
        out_shape = list(inp.shape)
        for i in input_vals[1:]:
            # dtype checked by pycuda before gpu call
            for s_i in range(len(inp.shape)):
                assert (inp.shape[s_i] == i.shape[s_i]
                        or inp.shape[s_i] == 1
                        or i.shape[s_i] == 1)
                out_shape[s_i] = max(out_shape[s_i], inp.shape[s_i],
                                     i.shape[s_i])
    if out is None:
        out = gpu_ndarray.empty(out_shape, dtype=inp.dtype)
    elif out_shape is None:
        out_shape = out.shape

    # Arg: the number of elements
    args = [cast_uint(out.size)]
    # Arg: output shape to the arguments.
    for i in range(len(out_shape)):
        args.append(cast_int(out_shape[i]))

    # for each input and the output,
    # add its ptr and strides
    nd = len(out_shape)
    idx = -1
    for i in list(input_vals) + [out]:
        idx += 1
        itemsize = i.dtype.itemsize
        args.append(i)
        for j in range(nd):
            # We force a stride of 0 for broadcastable dimensions.
            # This lowers the index computation in the kernel.
            if strides is not None:
                # strides should have a stride of 0 for broadcasting.
                args.append(cast_int(strides[idx][j] / itemsize))
            elif i.shape[j] == 1:
                args.append(cast_int(0))
            else:
                args.append(cast_int(i.strides[j] / itemsize))

    out_size = out.size
    if block is None:
        # First use at least a full warp
        block_ = min(32, out_size)

        # Next start adding multiprocessors
        grid_ = min(out_size / block_ + (out_size % block_ != 0), 60)

        # Next start adding more warps per multiprocessor
        if block_ * grid_ < out_size:
            block_ = min(out_size / grid_, 512)
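
    # Worked example of this heuristic, assuming out_size == 10000:
    # block_ = 32, grid_ = min(313, 60) = 60; since 32 * 60 < 10000,
    # block_ is raised to min(10000 // 60, 512) = 166.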
    else:
        block_ = block
        grid_ = grid

    # We bypass the pycuda wrapper gpu function call
    # by calling the gpu function directly.
    # This is faster and lowers the overhead.
    # Here is code that allows you to use the pycuda fct call:
    # d = {"block":(block_,1,1), "grid":(grid_,1)}
    # fct(*args, **d)
    fct.set_block_shape(block_, 1, 1)  # time_kernel
    fct.param_set(*args)
    fct.launch_grid(grid_, 1)
    return out
class MyGpuNdArray():
    _compiled_fct = {}

    def __init__(self, gpu_nd_array):
        #assert isinstance(gpu_nd_array, gpu_ndarray.GpuNdArrayObject)
        self.gpu_nd_array = gpu_nd_array
        self.ctype = dtype_to_ctype(self.gpu_nd_array.dtype)
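
    # Hypothetical usage sketch (the wrapped object must provide the
    # attributes proxied by the properties below: shape, strides, dtype...):
    #   ga = MyGpuNdArray(some_gpu_nd_array)
    #   gb = MyGpuNdArray(other_gpu_nd_array)
    #   gc = ga + gb              # compiles and caches an elemwise kernel
    #   host = numpy.asarray(gc)  # __array__ copies the result back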
    @staticmethod
    def gen_fct(op, inputs, nd, nodename="TestNodeName"):
        if _CL_MODE:
            npy_ty = "typedef float npy_float32;\n"
        else:
            npy_ty = "typedef double npy_float64;\n typedef float npy_float32;\n"

        # Generate the gpu functions
        nb_in = len(inputs)
        fcts = [None]
        for nd in range(1, nd + 1):  # 1 to nd
            out = op(*[TensorType(i.gpu_nd_array.dtype,
                                  (False,) * nd)() for i in inputs])
            out_dtype = out.dtype
            node = out.owner
            elemwise_algo = ElemwiseAlgo(node.op.scalar_op)

            code = (CLUDA_PREAMBLE +
                    npy_ty +
                    elemwise_algo.c_src_kernel(node.inputs,
                                               node.outputs,
                                               nodename, nd,
                                               static=""))
            fct_name = "kernel_%s_%d" % (nodename, nd)
            fct = compile_gpu_code(code, fct_name)
            fcts.append(fct)

        # All inputs/outputs C contiguous case
        code = (CLUDA_PREAMBLE +
                npy_ty +
                elemwise_algo.c_src_kernel_Ccontiguous(
                    node.inputs, node.outputs, nodename, static=""))
        fct_name = "kernel_%s_Ccontiguous" % nodename
        fcts[0] = compile_gpu_code(code, fct_name)
        def call_fct2(inputs, out=None):
            """ Do dimension collapsing before calling the gpu code """
            assert len(inputs) == nb_in
            # dtype checked by pycuda
            # TODO: assert nb dim?

            inp = inputs[0]

            # Compute the output shape.
            out_shape = list(inp.shape)
            for i in inputs[1:]:
                for s_i in range(len(inp.shape)):
                    assert (inp.shape[s_i] == i.shape[s_i]
                            or inp.shape[s_i] == 1
                            or i.shape[s_i] == 1)
                    out_shape[s_i] = max(out_shape[s_i], i.shape[s_i])
            # Create the output object
            if (out is None
                    or out.dtype != out_dtype
                    or out.shape != tuple(out_shape)):
                out = MyGpuNdArray(gpu_ndarray.empty(out_shape,
                                                     dtype=out_dtype))

            # Do the collapsing.
            nd_col, info = elemwise_collapses(list(inputs), [out])
            # The next two lines are useful to force a call to the
            # c contiguous version:
            #nd_col = 0
            #info = (info[0], [])

            if nd_col <= nd:
                out = call_elemwise(fcts[nd_col], inputs,
                                    out=out, out_shape=info[0][:nd_col],
                                    strides=info[1])
            else:
                # Fall back to the non-collapsed, full-nd kernel.
                out = call_elemwise(fcts[-1], inputs, out=out,
                                    out_shape=out_shape)
            return out

        return call_fct2
    def __elemwise2__(self, other, name, op):
        """ Call this code on this op with 2 inputs """
        nd = len(self.gpu_nd_array.shape)  # self.gpu_nd_array.ndim
        assert nd == len(other.gpu_nd_array.shape)  # ndim
        tag = (name + '_' + str(self.gpu_nd_array.dtype)
               + str(self.gpu_nd_array.ndim))
        tag += ('_' + str(other.gpu_nd_array.dtype)
                + str(other.gpu_nd_array.ndim))
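        # e.g. two 2d float32 operands give tag == "add_float322_float322",
        # so each dtype/ndim combination compiles its kernels only once.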
        fct = self._compiled_fct.get(tag, None)
        if fct is None:
            # print "compile", tag
            fct = MyGpuNdArray.gen_fct(op, [self, other], nd)
            self._compiled_fct[tag] = fct
        return fct((self, other))
    @classmethod
    def __elemwise__(cls, inputs, name, op, out=None):
        """ Call this code on this op with * inputs """
        nd = len(inputs[0].gpu_nd_array.shape)  # self.gpu_nd_array.ndim
        for i in inputs[1:]:
            assert nd == len(i.gpu_nd_array.shape)  # ndim
        tag = name + "_".join([str(i.gpu_nd_array.dtype) +
                               str(i.gpu_nd_array.ndim) for i in inputs])
        fct = cls._compiled_fct.get(tag, None)
        if fct is None:
            # print "compile", tag
            fct = MyGpuNdArray.gen_fct(op, inputs, nd)
            cls._compiled_fct[tag] = fct
        return fct(inputs, out=out)
    base = property(lambda self: self.gpu_nd_array.base)
    bytes = property(lambda self: self.gpu_nd_array.bytes)
    dtype = property(lambda self: self.gpu_nd_array.dtype)
    flags = property(lambda self: self.gpu_nd_array.flags)
    itemsize = property(lambda self: self.gpu_nd_array.itemsize)
    ndim = property(lambda self: self.gpu_nd_array.ndim,
                    doc="number of dimensions")
    offset = property(lambda self: self.gpu_nd_array.offset)
    shape = property(lambda self: self.gpu_nd_array.shape)
    size = property(lambda self: self.gpu_nd_array.size)
    strides = property(lambda self: self.gpu_nd_array.strides)
    def __array__(self):
        return numpy.asarray(self.gpu_nd_array)

    def copy(self):
        return MyGpuNdArray(self.gpu_nd_array.copy())

    def view(self):
        return MyGpuNdArray(self.gpu_nd_array.view())

    def __copy__(self):
        return MyGpuNdArray(self.gpu_nd_array.__copy__())

    def __deepcopy__(self):
        return MyGpuNdArray(self.gpu_nd_array.__deepcopy__())

    @property
    def gpudata(self):
        # TODO: Add this assert when PyCUDA/PyOpenCL can use the bytes
        #       attribute. Without this assert, old code that doesn't
        #       support strides can receive as input an object that is
        #       strided, and no error will be raised.
        #assert (self.gpu_nd_array.flags['C_CONTIGUOUS'] or
        #        self.gpu_nd_array.flags['F_CONTIGUOUS'])

        # TODO: find a way to pass to a pycuda/pyopencl function the
        #       bytes + offset directly.
        return self.bytes + self.offset

    def __getitem__(self, *inputs):
        return MyGpuNdArray(self.gpu_nd_array.__getitem__(*inputs))
    def __add__(self, other):
        return self.__elemwise2__(other, "add", theano.tensor.add)

    def __sub__(self, other):
        return self.__elemwise2__(other, "sub", theano.tensor.sub)

    def __mul__(self, other):
        return self.__elemwise2__(other, "mul", theano.tensor.mul)

    def __div__(self, other):
        assert (str(self.gpu_nd_array.dtype).startswith("float") or
                str(other.gpu_nd_array.dtype).startswith("float"))
        return self.__elemwise2__(other, "true_div", theano.tensor.true_div)

    @classmethod
    def add(cls, x, y, out=None):
        """ add the two inputs together element-wise """
        return cls.__elemwise__([x, y], "add", theano.tensor.add, out=out)

    @classmethod
    def adds(cls, *inputs):
        """ add all inputs together element-wise """
        return cls.__elemwise__(inputs, "add", theano.tensor.add)

    @classmethod
    def multiplys(cls, *inputs):
        """ multiply all inputs together element-wise """
        return cls.__elemwise__(inputs, "mul", theano.tensor.mul)
    def sum(self, axis=None, collapse=True):
        import gen_reduction
        max_thread_per_block = 512
        max_block = 4096  # assumed limit on blocks per grid dimension

        if isinstance(axis, (list, tuple)):
            if len(axis) == 1:
                axis = axis[0]
            else:
                assert len(axis) == self.ndim
                assert axis == range(self.ndim)
                axis = None

        # zeros so that a size-0 reduction returns 0
        if self.size == 0:
            make_out = gpu_ndarray.zeros
        else:
            make_out = gpu_ndarray.empty

        if axis is None:
            out = make_out((), self.dtype)
            out = MyGpuNdArray(out)
        else:
            out_shape = [self.shape[i] for i in range(self.ndim)
                         if i != axis]
            out = make_out(out_shape, self.dtype)
            out = MyGpuNdArray(out)

        if collapse:
            coll_ndim, (coll_shape, coll_strides, coll_axis), coll_out_str = (
                reduction_collapses([self, out], axis))
        else:
            coll_ndim = self.ndim
            coll_shape = self.shape
            coll_strides = self.strides
            coll_axis = [axis]
            coll_out_str = out.strides

        if axis is not None:
            coll_axis = coll_axis[0]
        if coll_ndim == 0:
            sum_op = gen_reduction.GpuSum([1], self.dtype)
            c_code = sum_op.c_support_code_apply("nodename", contig=True)
            fctname = "kernel_reduce_sum_ccontig_nodename"
            fct = compile_gpu_code(c_code, fctname)
            block_ = min(coll_shape[0], max_thread_per_block)
            block = (block_, 1, 1)
            grid = (1, 1)
            shared_ = self.dtype.itemsize * block_
            args = [cast_int(coll_shape[0]), self, out]
        elif axis is None:
            pattern = [1] * coll_ndim
            str_pattern = [str(i) for i in pattern]
            sum_op = gen_reduction.GpuSum(pattern, self.dtype)
            c_code = sum_op.c_support_code_apply("nodename")
            if not c_code:
                raise NotImplementedError(
                    "GpuNdArray sum case not implemented")
            fctname = "kernel_reduce_sum_" + "".join(str_pattern) + "_nodename"
            fct = compile_gpu_code(c_code, fctname)
            if coll_ndim == 1:
                bx = min(max_thread_per_block, coll_shape[0])
                block = (bx, 1, 1)
                block_ = bx
            elif coll_ndim == 2:
                bx = min(max_thread_per_block, coll_shape[1])
                by = min(max_thread_per_block // coll_shape[1], coll_shape[0])
                by = max(by, 1)
                block = (bx, by, 1)
                block_ = bx * by
            elif coll_ndim == 3:
                bx = min(max_thread_per_block, coll_shape[2])
                by = min(max_thread_per_block // bx, coll_shape[1])
                by = max(by, 1)
                bz = min(max_thread_per_block // (bx * by), coll_shape[0])
                bz = min(max(bz, 1), 64)
                block = (bx, by, bz)
                block_ = bx * by * bz
            elif coll_ndim == 4:
                bx = min(max_thread_per_block, coll_shape[3])
                by = min(max_thread_per_block // bx, coll_shape[2])
                by = max(by, 1)
                bz = min(max_thread_per_block // (bx * by), coll_shape[1])
                bz = min(max(bz, 1), 64)
                block = (bx, by, bz)
                block_ = bx * by * bz
            grid = (1, 1)
            shared_ = self.dtype.itemsize * block_
        elif coll_ndim in [1, 2, 3]:
            if coll_ndim == 1:
                assert coll_axis == 0
                sum_op = gen_reduction.GpuSum([1], self.dtype)
                fctname = "kernel_reduce_sum_1_nodename"
                grid = (1, 1)
                block_ = min(max_thread_per_block, coll_shape[0])
                block = (block_, 1, 1)
            elif coll_ndim == 3 and coll_axis == 0:
                sum_op = gen_reduction.GpuSum([1, 0, 0], self.dtype)
                fctname = "kernel_reduce_sum_100_nodename"
                gx = min(coll_shape[1], max_block)
                gy = min(max_block // (gx * coll_shape[2]), coll_shape[2])
                gy = max(gy, 1)
                grid = (gx, gy)
                block_ = min(max_thread_per_block, coll_shape[0])
                block = (block_, 1, 1)
            elif coll_ndim == 3 and coll_axis == 1:
                sum_op = gen_reduction.GpuSum([0, 1, 0], self.dtype)
                fctname = "kernel_reduce_sum_010_AD_nodename"
                A = coll_shape[0]
                B = coll_shape[1]
                C = coll_shape[2]
                D = C / 32
                if (32 * D < C):
                    D += 1
                assert ((C <= 32 * D) and (32 * D < C + 32))
                gx = min(A, max_block)
                gy = min(max_block // (D * A), D)
                gy = max(gy, 1)
                grid = (gx, gy)
                block = (32, 1, 1)
                block_ = 32
                args = [cast_int(A), cast_int(B),
                        cast_int(C), cast_int(D)]
                args.append(self)
                args += [cast_int(i / self.dtype.itemsize)
                         for i in coll_strides]
                args.append(out)
                args.append(cast_int(coll_out_str[0] / out.dtype.itemsize))
                args.append(cast_int(coll_out_str[1] / out.dtype.itemsize))
            elif coll_ndim == 3 and coll_axis == 2:
                sum_op = gen_reduction.GpuSum([0, 0, 1], self.dtype)
                fctname = "kernel_reduce_sum_001_nodename"
                gx = min(coll_shape[0], max_block)
                gy = min(max_block // (gx * coll_shape[1]), coll_shape[1])
                gy = max(gy, 1)
                grid = (gx, gy)
                block_ = min(max_thread_per_block, coll_shape[2])
                block = (block_, 1, 1)
            elif coll_axis == 0:
                # The 2d case reuses the 010 kernel with a dummy
                # leading dimension of size 1.
                sum_op = gen_reduction.GpuSum([1, 0], self.dtype)
                fctname = "kernel_reduce_sum_010_nodename"
                block_ = min(coll_shape[1], max_thread_per_block)
                block = (block_, 1, 1)
                grid = (1, coll_shape[0])

                args = [cast_int(1)]
                args += [cast_int(i) for i in coll_shape]
                args.append(self)
                args.append(cast_int(1))
                args += [cast_int(i / self.dtype.itemsize)
                         for i in coll_strides]
                args.append(out)
                args.append(cast_int(1))
                # We must take the last dimension in the case of
                # collapsed dimensions.
                args.append(cast_int(coll_out_str[-1] / out.dtype.itemsize))
            elif coll_axis == 1:
                sum_op = gen_reduction.GpuSum([0, 1], self.dtype)
                fctname = "kernel_reduce_sum_01_nodename"
                block_ = min(coll_shape[1], max_thread_per_block)
                block = (block_, 1, 1)
                grid = (1, min(coll_shape[0], max_block))
            else:
                raise Exception("Bad axis")

            c_code = sum_op.c_support_code_apply("nodename")
            fct = compile_gpu_code(c_code, fctname)

            shared_ = self.dtype.itemsize * block_
        else:
            raise Exception("Not implemented")
        if fctname not in ("kernel_reduce_sum_ccontig_nodename",
                           "kernel_reduce_sum_010_AD_nodename",
                           "kernel_reduce_sum_010_nodename"):
            # Default argument list: shapes, then input + strides,
            # then output + strides.
            args = [cast_int(i) for i in coll_shape]
            args.append(self)
            args += [cast_int(i / self.dtype.itemsize)
                     for i in coll_strides]
            args.append(out)
            args += [cast_int(i / self.dtype.itemsize)
                     for i in coll_out_str]
        pycuda._driver.Context.synchronize()
        #print fctname, block, grid, shared_, axis
        #print self.ndim, self.shape, self.strides, axis, out.strides
        #print coll_ndim, coll_shape, coll_strides, coll_axis, coll_out_str

        d = {"block": block,
             "shared": shared_,
             "grid": grid}
        # We bypass the pycuda wrapper gpu function call
        # by calling the gpu function directly.
        # This is faster and lowers the overhead.
        # The pycuda wrapper equivalent would be: fct(*args, **d)
        fct.set_block_shape(*block)
        fct.set_shared_size(shared_)
        fct.param_set(*args)
        fct.launch_grid(*grid)
        return out