~siretart/ubuntu/utopic/blender/libav10

« back to all changes in this revision

Viewing changes to intern/cycles/device/device_cuda.cpp

  • Committer: Package Import Robot
  • Author(s): Matteo F. Vescovi
  • Date: 2012-07-23 08:54:18 UTC
  • mfrom: (14.2.16 sid)
  • mto: (14.2.19 sid)
  • mto: This revision was merged to the branch mainline in revision 42.
  • Revision ID: package-import@ubuntu.com-20120723085418-9foz30v6afaf5ffs
Tags: 2.63a-2
* debian/: Cycles support added (Closes: #658075)
  For now, this top feature has been enabled only
  on [any-amd64 any-i386] architectures because
  of OpenImageIO failing on all others
* debian/: scripts installation path changed
  from /usr/lib to /usr/share:
  + debian/patches/: patchset re-worked for path changing
  + debian/control: "Breaks" field added on yafaray-exporter

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
/*
 
2
 * Copyright 2011, Blender Foundation.
 
3
 *
 
4
 * This program is free software; you can redistribute it and/or
 
5
 * modify it under the terms of the GNU General Public License
 
6
 * as published by the Free Software Foundation; either version 2
 
7
 * of the License, or (at your option) any later version.
 
8
 *
 
9
 * This program is distributed in the hope that it will be useful,
 
10
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 
11
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 
12
 * GNU General Public License for more details.
 
13
 *
 
14
 * You should have received a copy of the GNU General Public License
 
15
 * along with this program; if not, write to the Free Software Foundation,
 
16
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 
17
 */
 
18
 
 
19
#include <stdio.h>
 
20
#include <stdlib.h>
 
21
#include <string.h>
 
22
 
 
23
#include "device.h"
 
24
#include "device_intern.h"
 
25
 
 
26
#include "util_cuda.h"
 
27
#include "util_debug.h"
 
28
#include "util_map.h"
 
29
#include "util_opengl.h"
 
30
#include "util_path.h"
 
31
#include "util_system.h"
 
32
#include "util_types.h"
 
33
#include "util_time.h"
 
34
 
 
35
CCL_NAMESPACE_BEGIN
 
36
 
 
37
class CUDADevice : public Device
 
38
{
 
39
public:
 
40
        CUdevice cuDevice;
 
41
        CUcontext cuContext;
 
42
        CUmodule cuModule;
 
43
        map<device_ptr, bool> tex_interp_map;
 
44
        int cuDevId;
 
45
 
 
46
        struct PixelMem {
 
47
                GLuint cuPBO;
 
48
                CUgraphicsResource cuPBOresource;
 
49
                GLuint cuTexId;
 
50
                int w, h;
 
51
        };
 
52
 
 
53
        map<device_ptr, PixelMem> pixel_mem_map;
 
54
 
 
55
        CUdeviceptr cuda_device_ptr(device_ptr mem)
 
56
        {
 
57
                return (CUdeviceptr)mem;
 
58
        }
 
59
 
 
60
        /* Map a CUDA driver API error code to a human-readable string.
         * (The driver API of this era had no cuGetErrorString, hence the
         * hand-written table.) Returns a static string; never NULL. */
        const char *cuda_error_string(CUresult result)
        {
                switch(result) {
                        /* initialization */
                        case CUDA_SUCCESS: return "No errors";
                        case CUDA_ERROR_INVALID_VALUE: return "Invalid value";
                        case CUDA_ERROR_OUT_OF_MEMORY: return "Out of memory";
                        case CUDA_ERROR_NOT_INITIALIZED: return "Driver not initialized";
                        case CUDA_ERROR_DEINITIALIZED: return "Driver deinitialized";

                        /* device */
                        case CUDA_ERROR_NO_DEVICE: return "No CUDA-capable device available";
                        case CUDA_ERROR_INVALID_DEVICE: return "Invalid device";

                        /* context / memory mapping */
                        case CUDA_ERROR_INVALID_IMAGE: return "Invalid kernel image";
                        case CUDA_ERROR_INVALID_CONTEXT: return "Invalid context";
                        case CUDA_ERROR_CONTEXT_ALREADY_CURRENT: return "Context already current";
                        case CUDA_ERROR_MAP_FAILED: return "Map failed";
                        case CUDA_ERROR_UNMAP_FAILED: return "Unmap failed";
                        case CUDA_ERROR_ARRAY_IS_MAPPED: return "Array is mapped";
                        case CUDA_ERROR_ALREADY_MAPPED: return "Already mapped";
                        case CUDA_ERROR_NO_BINARY_FOR_GPU: return "No binary for GPU";
                        case CUDA_ERROR_ALREADY_ACQUIRED: return "Already acquired";
                        case CUDA_ERROR_NOT_MAPPED: return "Not mapped";
                        case CUDA_ERROR_NOT_MAPPED_AS_ARRAY: return "Mapped resource not available for access as an array";
                        case CUDA_ERROR_NOT_MAPPED_AS_POINTER: return "Mapped resource not available for access as a pointer";
                        case CUDA_ERROR_ECC_UNCORRECTABLE: return "Uncorrectable ECC error detected";
                        case CUDA_ERROR_UNSUPPORTED_LIMIT: return "CUlimit not supported by device";

                        /* module loading */
                        case CUDA_ERROR_INVALID_SOURCE: return "Invalid source";
                        case CUDA_ERROR_FILE_NOT_FOUND: return "File not found";
                        case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: return "Link to a shared object failed to resolve";
                        case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED: return "Shared object initialization failed";

                        case CUDA_ERROR_INVALID_HANDLE: return "Invalid handle";

                        case CUDA_ERROR_NOT_FOUND: return "Not found";

                        case CUDA_ERROR_NOT_READY: return "CUDA not ready";

                        /* kernel launch */
                        case CUDA_ERROR_LAUNCH_FAILED: return "Launch failed";
                        case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: return "Launch exceeded resources";
                        case CUDA_ERROR_LAUNCH_TIMEOUT: return "Launch exceeded timeout";
                        case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: return "Launch with incompatible texturing";

                        case CUDA_ERROR_UNKNOWN: return "Unknown error";

                        /* codes added by newer drivers fall through here */
                        default: return "Unknown CUDA error value";
                }
        }
 
108
 
 
109
/* cuda_abort(): fail fast in debug builds; no-op in release (NDEBUG) builds. */
#ifdef NDEBUG
#define cuda_abort()
#else
#define cuda_abort() abort()
#endif

/* Check a CUDA driver API call: on failure, record the first error message
 * (error_msg is a Device member), always echo to stderr, and abort in debug
 * builds. NOTE(review): this expands to a plain { ... } block rather than the
 * usual do { ... } while(0), which is why call sites throughout this file
 * omit the trailing semicolon — keep that convention when adding calls. */
#define cuda_assert(stmt) \
        { \
                CUresult result = stmt; \
                \
                if(result != CUDA_SUCCESS) { \
                        string message = string_printf("CUDA error: %s in %s", cuda_error_string(result), #stmt); \
                        if(error_msg == "") \
                                error_msg = message; \
                        fprintf(stderr, "%s\n", message.c_str()); \
                        cuda_abort(); \
                } \
        }
 
127
 
 
128
        bool cuda_error(CUresult result)
 
129
        {
 
130
                if(result == CUDA_SUCCESS)
 
131
                        return false;
 
132
 
 
133
                string message = string_printf("CUDA error: %s", cuda_error_string(result));
 
134
                if(error_msg == "")
 
135
                        error_msg = message;
 
136
                fprintf(stderr, "%s\n", message.c_str());
 
137
                return true;
 
138
        }
 
139
 
 
140
        void cuda_error(const string& message)
 
141
        {
 
142
                if(error_msg == "")
 
143
                        error_msg = message;
 
144
                fprintf(stderr, "%s\n", message.c_str());
 
145
        }
 
146
 
 
147
        /* Make this device's CUDA context current on the calling thread.
         * Must be paired with cuda_pop_context() around driver API usage. */
        void cuda_push_context()
        {
                cuda_assert(cuCtxSetCurrent(cuContext))
        }
 
151
 
 
152
        /* Detach the CUDA context from the calling thread (sets it to NULL),
         * so other threads may use the context afterwards. */
        void cuda_pop_context()
        {
                cuda_assert(cuCtxSetCurrent(NULL));
        }
 
156
 
 
157
        CUDADevice(DeviceInfo& info, bool background_)
 
158
        {
 
159
                background = background_;
 
160
 
 
161
                cuDevId = info.num;
 
162
                cuDevice = 0;
 
163
                cuContext = 0;
 
164
 
 
165
                /* intialize */
 
166
                if(cuda_error(cuInit(0)))
 
167
                        return;
 
168
 
 
169
                /* setup device and context */
 
170
                if(cuda_error(cuDeviceGet(&cuDevice, cuDevId)))
 
171
                        return;
 
172
 
 
173
                CUresult result;
 
174
 
 
175
                if(background) {
 
176
                        result = cuCtxCreate(&cuContext, 0, cuDevice);
 
177
                }
 
178
                else {
 
179
                        result = cuGLCtxCreate(&cuContext, 0, cuDevice);
 
180
 
 
181
                        if(result != CUDA_SUCCESS) {
 
182
                                result = cuCtxCreate(&cuContext, 0, cuDevice);
 
183
                                background = true;
 
184
                        }
 
185
                }
 
186
 
 
187
                if(cuda_error(result))
 
188
                        return;
 
189
 
 
190
                cuda_pop_context();
 
191
        }
 
192
 
 
193
        /* Destroy the device: make the context current and detach it.
         * There is deliberately no matching cuda_pop_context() — the context
         * no longer exists after cuCtxDetach. NOTE(review): if construction
         * failed, cuContext is 0 and the calls below will report errors
         * rather than crash — presumably acceptable at shutdown; verify. */
        ~CUDADevice()
        {
                cuda_push_context();
                cuda_assert(cuCtxDetach(cuContext))
        }
 
198
 
 
199
        bool support_device(bool experimental)
 
200
        {
 
201
                if(!experimental) {
 
202
                        int major, minor;
 
203
                        cuDeviceComputeCapability(&major, &minor, cuDevId);
 
204
 
 
205
                        if(major <= 1 && minor <= 2) {
 
206
                                cuda_error(string_printf("CUDA device supported only with compute capability 1.3 or up, found %d.%d.", major, minor));
 
207
                                return false;
 
208
                        }
 
209
                }
 
210
 
 
211
                return true;
 
212
        }
 
213
 
 
214
        /* Locate or build a cubin kernel binary matching this device's compute
         * capability. Search order: binary shipped with Blender, then the user
         * cache (keyed by kernel source md5), then compile with nvcc (unless
         * this is an official Windows binary build, which ships cubins only).
         * Returns the cubin path, or "" on failure (error already reported). */
        string compile_kernel()
        {
                /* compute cubin name */
                int major, minor;
                cuDeviceComputeCapability(&major, &minor, cuDevId);

                /* attempt to use kernel provided with blender */
                string cubin = path_get(string_printf("lib/kernel_sm_%d%d.cubin", major, minor));
                if(path_exists(cubin))
                        return cubin;

                /* not found, try to use locally compiled kernel */
                string kernel_path = path_get("kernel");
                string md5 = path_files_md5_hash(kernel_path);

                /* cache filename embeds the source hash so edits force a rebuild */
                cubin = string_printf("cycles_kernel_sm%d%d_%s.cubin", major, minor, md5.c_str());;
                cubin = path_user_get(path_join("cache", cubin));

                /* if exists already, use it */
                if(path_exists(cubin))
                        return cubin;

#if defined(WITH_CUDA_BINARIES) && defined(_WIN32)
                /* official windows builds do not fall back to nvcc */
                if(major <= 1 && minor <= 2)
                        cuda_error(string_printf("CUDA device supported only compute capability 1.3 or up, found %d.%d.", major, minor));
                else
                        cuda_error(string_printf("CUDA binary kernel for this graphics card compute capability (%d.%d) not found.", major, minor));
                return "";
#else
                /* if not, find CUDA compiler */
                string nvcc = cuCompilerPath();

                if(nvcc == "") {
                        cuda_error("CUDA nvcc compiler not found. Install CUDA toolkit in default location.");
                        return "";
                }

                /* compile */
                string kernel = path_join(kernel_path, "kernel.cu");
                string include = kernel_path;
                const int machine = system_cpu_bits();
                const int maxreg = 24;  /* register cap passed to ptxas below */

                double starttime = time_dt();
                printf("Compiling CUDA kernel ...\n");

                path_create_directories(cubin);

                string command = string_printf("\"%s\" -arch=sm_%d%d -m%d --cubin \"%s\" --use_fast_math "
                        "-o \"%s\" --ptxas-options=\"-v\" --maxrregcount=%d --opencc-options -OPT:Olimit=0 -I\"%s\" -DNVCC",
                        nvcc.c_str(), major, minor, machine, kernel.c_str(), cubin.c_str(), maxreg, include.c_str());

                /* system() returns -1 only when the shell itself could not be
                 * spawned; actual compile failures are caught by the
                 * path_exists() check below */
                if(system(command.c_str()) == -1) {
                        cuda_error("Failed to execute compilation command, see console for details.");
                        return "";
                }

                /* verify if compilation succeeded */
                if(!path_exists(cubin)) {
                        cuda_error("CUDA kernel compilation failed, see console for details.");
                        return "";
                }

                printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime);

                return cubin;
#endif
        }
 
282
 
 
283
        bool load_kernels(bool experimental)
 
284
        {
 
285
                /* check if cuda init succeeded */
 
286
                if(cuContext == 0)
 
287
                        return false;
 
288
 
 
289
                if(!support_device(experimental))
 
290
                        return false;
 
291
 
 
292
                /* get kernel */
 
293
                string cubin = compile_kernel();
 
294
 
 
295
                if(cubin == "")
 
296
                        return false;
 
297
 
 
298
                /* open module */
 
299
                cuda_push_context();
 
300
 
 
301
                CUresult result = cuModuleLoad(&cuModule, cubin.c_str());
 
302
                if(cuda_error(result))
 
303
                        cuda_error(string_printf("Failed loading CUDA kernel %s.", cubin.c_str()));
 
304
 
 
305
                cuda_pop_context();
 
306
 
 
307
                return (result == CUDA_SUCCESS);
 
308
        }
 
309
 
 
310
        void mem_alloc(device_memory& mem, MemoryType type)
 
311
        {
 
312
                cuda_push_context();
 
313
                CUdeviceptr device_pointer;
 
314
                cuda_assert(cuMemAlloc(&device_pointer, mem.memory_size()))
 
315
                mem.device_pointer = (device_ptr)device_pointer;
 
316
                cuda_pop_context();
 
317
        }
 
318
 
 
319
        void mem_copy_to(device_memory& mem)
 
320
        {
 
321
                cuda_push_context();
 
322
                cuda_assert(cuMemcpyHtoD(cuda_device_ptr(mem.device_pointer), (void*)mem.data_pointer, mem.memory_size()))
 
323
                cuda_pop_context();
 
324
        }
 
325
 
 
326
        void mem_copy_from(device_memory& mem, int y, int w, int h, int elem)
 
327
        {
 
328
                size_t offset = elem*y*w;
 
329
                size_t size = elem*w*h;
 
330
 
 
331
                cuda_push_context();
 
332
                cuda_assert(cuMemcpyDtoH((uchar*)mem.data_pointer + offset,
 
333
                        (CUdeviceptr)((uchar*)mem.device_pointer + offset), size))
 
334
                cuda_pop_context();
 
335
        }
 
336
 
 
337
        void mem_zero(device_memory& mem)
 
338
        {
 
339
                memset((void*)mem.data_pointer, 0, mem.memory_size());
 
340
 
 
341
                cuda_push_context();
 
342
                cuda_assert(cuMemsetD8(cuda_device_ptr(mem.device_pointer), 0, mem.memory_size()))
 
343
                cuda_pop_context();
 
344
        }
 
345
 
 
346
        void mem_free(device_memory& mem)
 
347
        {
 
348
                if(mem.device_pointer) {
 
349
                        cuda_push_context();
 
350
                        cuda_assert(cuMemFree(cuda_device_ptr(mem.device_pointer)))
 
351
                        cuda_pop_context();
 
352
 
 
353
                        mem.device_pointer = 0;
 
354
                }
 
355
        }
 
356
 
 
357
        /* Copy size bytes from host into the module's __constant__ global
         * named name. The global must exist in the loaded cuModule;
         * cuda_assert reports a lookup failure. */
        void const_copy_to(const char *name, void *host, size_t size)
        {
                CUdeviceptr mem;
                size_t bytes;

                cuda_push_context();
                cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name))
                /* size of the device global may exceed size; only size bytes
                 * are written */
                //assert(bytes == size);
                cuda_assert(cuMemcpyHtoD(mem, host, size))
                cuda_pop_context();
        }
 
368
 
 
369
        /* Allocate texture data and bind it to the module texture reference
         * named name. With interpolation, data goes into a CUDA array bound
         * with linear filtering and normalized coordinates; without, plain
         * linear device memory is bound with point filtering and integer
         * reads. The allocation kind is remembered in tex_interp_map so
         * tex_free() knows how to release it. */
        void tex_alloc(const char *name, device_memory& mem, bool interpolation, bool periodic)
        {
                /* determine format */
                CUarray_format_enum format;
                size_t dsize = datatype_size(mem.data_type);
                size_t size = mem.memory_size();

                switch(mem.data_type) {
                        case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; break;
                        case TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; break;
                        case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; break;
                        case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break;
                        default: assert(0); return;
                }

                CUtexref texref;

                cuda_push_context();
                cuda_assert(cuModuleGetTexRef(&texref, cuModule, name))

                if(interpolation) {
                        CUarray handle;
                        CUDA_ARRAY_DESCRIPTOR desc;

                        desc.Width = mem.data_width;
                        desc.Height = mem.data_height;
                        desc.Format = format;
                        desc.NumChannels = mem.data_elements;

                        cuda_assert(cuArrayCreate(&handle, &desc))

                        /* 2D data needs a pitched copy; 1D uses a simple
                         * host-to-array copy */
                        if(mem.data_height > 1) {
                                CUDA_MEMCPY2D param;
                                memset(&param, 0, sizeof(param));
                                param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
                                param.dstArray = handle;
                                param.srcMemoryType = CU_MEMORYTYPE_HOST;
                                param.srcHost = (void*)mem.data_pointer;
                                param.srcPitch = mem.data_width*dsize*mem.data_elements;
                                param.WidthInBytes = param.srcPitch;
                                param.Height = mem.data_height;

                                cuda_assert(cuMemcpy2D(&param))
                        }
                        else
                                cuda_assert(cuMemcpyHtoA(handle, 0, (void*)mem.data_pointer, size))

                        cuda_assert(cuTexRefSetArray(texref, handle, CU_TRSA_OVERRIDE_FORMAT))

                        cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR))
                        cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES))

                        /* device_pointer holds the array handle in this mode */
                        mem.device_pointer = (device_ptr)handle;
                }
                else {
                        /* mem_alloc/mem_copy_to push/pop the context
                         * themselves, so leave it first */
                        cuda_pop_context();

                        mem_alloc(mem, MEM_READ_ONLY);
                        mem_copy_to(mem);

                        cuda_push_context();

                        cuda_assert(cuTexRefSetAddress(NULL, texref, cuda_device_ptr(mem.device_pointer), size))
                        cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT))
                        cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_READ_AS_INTEGER))
                }

                /* wrap for periodic textures, clamp otherwise */
                if(periodic) {
                        cuda_assert(cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_WRAP))
                        cuda_assert(cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_WRAP))
                }
                else {
                        cuda_assert(cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_CLAMP))
                        cuda_assert(cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_CLAMP))
                }
                cuda_assert(cuTexRefSetFormat(texref, format, mem.data_elements))

                cuda_pop_context();

                /* record allocation kind for tex_free() */
                tex_interp_map[mem.device_pointer] = interpolation;
        }
 
450
 
 
451
        void tex_free(device_memory& mem)
 
452
        {
 
453
                if(mem.device_pointer) {
 
454
                        if(tex_interp_map[mem.device_pointer]) {
 
455
                                cuda_push_context();
 
456
                                cuArrayDestroy((CUarray)mem.device_pointer);
 
457
                                cuda_pop_context();
 
458
 
 
459
                                tex_interp_map.erase(tex_interp_map.find(mem.device_pointer));
 
460
                                mem.device_pointer = 0;
 
461
                        }
 
462
                        else {
 
463
                                tex_interp_map.erase(tex_interp_map.find(mem.device_pointer));
 
464
                                mem_free(mem);
 
465
                        }
 
466
                }
 
467
        }
 
468
 
 
469
        /* Launch the path tracing kernel over the task's tile.
         * Arguments are passed with the legacy driver execution control API
         * (cuParamSetv/cuParamSeti): the offset/alignment sequence below
         * defines the kernel's argument layout and must not be reordered. */
        void path_trace(DeviceTask& task)
        {
                cuda_push_context();

                CUfunction cuPathTrace;
                CUdeviceptr d_buffer = cuda_device_ptr(task.buffer);
                CUdeviceptr d_rng_state = cuda_device_ptr(task.rng_state);

                /* get kernel function */
                cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace"))
                
                /* pass in parameters */
                int offset = 0;
                
                cuda_assert(cuParamSetv(cuPathTrace, offset, &d_buffer, sizeof(d_buffer)))
                offset += sizeof(d_buffer);

                cuda_assert(cuParamSetv(cuPathTrace, offset, &d_rng_state, sizeof(d_rng_state)))
                offset += sizeof(d_rng_state);

                /* align the int arguments that follow the pointer arguments */
                int sample = task.sample;
                offset = align_up(offset, __alignof(sample));

                cuda_assert(cuParamSeti(cuPathTrace, offset, task.sample))
                offset += sizeof(task.sample);

                cuda_assert(cuParamSeti(cuPathTrace, offset, task.x))
                offset += sizeof(task.x);

                cuda_assert(cuParamSeti(cuPathTrace, offset, task.y))
                offset += sizeof(task.y);

                cuda_assert(cuParamSeti(cuPathTrace, offset, task.w))
                offset += sizeof(task.w);

                cuda_assert(cuParamSeti(cuPathTrace, offset, task.h))
                offset += sizeof(task.h);

                cuda_assert(cuParamSeti(cuPathTrace, offset, task.offset))
                offset += sizeof(task.offset);

                cuda_assert(cuParamSeti(cuPathTrace, offset, task.stride))
                offset += sizeof(task.stride);

                cuda_assert(cuParamSetSize(cuPathTrace, offset))

                /* launch kernel: todo find optimal size, cache config for fermi */
#ifndef __APPLE__
                int xthreads = 16;
                int ythreads = 16;
#else
                int xthreads = 8;
                int ythreads = 8;
#endif
                /* ceil-div so partial blocks at the tile edges are covered */
                int xblocks = (task.w + xthreads - 1)/xthreads;
                int yblocks = (task.h + ythreads - 1)/ythreads;

                cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1))
                cuda_assert(cuFuncSetBlockShape(cuPathTrace, xthreads, ythreads, 1))
                cuda_assert(cuLaunchGrid(cuPathTrace, xblocks, yblocks))

                cuda_pop_context();
        }
 
532
 
 
533
        /* Launch the tonemap/film-convert kernel, writing display pixels into
         * the (possibly GL-mapped) rgba buffer. Same legacy cuParamSet*
         * argument-layout caveat as path_trace(): do not reorder. */
        void tonemap(DeviceTask& task)
        {
                cuda_push_context();

                CUfunction cuFilmConvert;
                /* map_pixels() returns the GL PBO mapping when interactive */
                CUdeviceptr d_rgba = map_pixels(task.rgba);
                CUdeviceptr d_buffer = cuda_device_ptr(task.buffer);

                /* get kernel function */
                cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_tonemap"))

                /* pass in parameters */
                int offset = 0;

                cuda_assert(cuParamSetv(cuFilmConvert, offset, &d_rgba, sizeof(d_rgba)))
                offset += sizeof(d_rgba);
                
                cuda_assert(cuParamSetv(cuFilmConvert, offset, &d_buffer, sizeof(d_buffer)))
                offset += sizeof(d_buffer);

                /* align the int arguments that follow the pointer arguments */
                int sample = task.sample;
                offset = align_up(offset, __alignof(sample));

                cuda_assert(cuParamSeti(cuFilmConvert, offset, task.sample))
                offset += sizeof(task.sample);

                cuda_assert(cuParamSeti(cuFilmConvert, offset, task.resolution))
                offset += sizeof(task.resolution);

                cuda_assert(cuParamSeti(cuFilmConvert, offset, task.x))
                offset += sizeof(task.x);

                cuda_assert(cuParamSeti(cuFilmConvert, offset, task.y))
                offset += sizeof(task.y);

                cuda_assert(cuParamSeti(cuFilmConvert, offset, task.w))
                offset += sizeof(task.w);

                cuda_assert(cuParamSeti(cuFilmConvert, offset, task.h))
                offset += sizeof(task.h);

                cuda_assert(cuParamSeti(cuFilmConvert, offset, task.offset))
                offset += sizeof(task.offset);

                cuda_assert(cuParamSeti(cuFilmConvert, offset, task.stride))
                offset += sizeof(task.stride);

                cuda_assert(cuParamSetSize(cuFilmConvert, offset))

                /* launch kernel: todo find optimal size, cache config for fermi */
#ifndef __APPLE__
                int xthreads = 16;
                int ythreads = 16;
#else
                int xthreads = 8;
                int ythreads = 8;
#endif
                /* ceil-div so partial blocks at the edges are covered */
                int xblocks = (task.w + xthreads - 1)/xthreads;
                int yblocks = (task.h + ythreads - 1)/ythreads;

                cuda_assert(cuFuncSetCacheConfig(cuFilmConvert, CU_FUNC_CACHE_PREFER_L1))
                cuda_assert(cuFuncSetBlockShape(cuFilmConvert, xthreads, ythreads, 1))
                cuda_assert(cuLaunchGrid(cuFilmConvert, xblocks, yblocks))

                /* release the GL mapping before leaving the context */
                unmap_pixels(task.rgba);

                cuda_pop_context();
        }
 
601
 
 
602
        /* Launch the 1D shader evaluation kernel (displacement/background)
         * over task.shader_w elements. Same legacy cuParamSet* layout caveat
         * as path_trace(): the offset sequence defines the argument layout. */
        void shader(DeviceTask& task)
        {
                cuda_push_context();

                CUfunction cuDisplace;
                CUdeviceptr d_input = cuda_device_ptr(task.shader_input);
                /* note: d_offset is the output buffer despite the name */
                CUdeviceptr d_offset = cuda_device_ptr(task.shader_output);

                /* get kernel function */
                cuda_assert(cuModuleGetFunction(&cuDisplace, cuModule, "kernel_cuda_shader"))
                
                /* pass in parameters */
                int offset = 0;
                
                cuda_assert(cuParamSetv(cuDisplace, offset, &d_input, sizeof(d_input)))
                offset += sizeof(d_input);

                cuda_assert(cuParamSetv(cuDisplace, offset, &d_offset, sizeof(d_offset)))
                offset += sizeof(d_offset);

                /* align the int arguments that follow the pointer arguments */
                int shader_eval_type = task.shader_eval_type;
                offset = align_up(offset, __alignof(shader_eval_type));

                cuda_assert(cuParamSeti(cuDisplace, offset, task.shader_eval_type))
                offset += sizeof(task.shader_eval_type);

                cuda_assert(cuParamSeti(cuDisplace, offset, task.shader_x))
                offset += sizeof(task.shader_x);

                cuda_assert(cuParamSetSize(cuDisplace, offset))

                /* launch kernel: todo find optimal size, cache config for fermi */
#ifndef __APPLE__
                int xthreads = 16;
#else
                int xthreads = 8;
#endif
                /* 1D launch; ceil-div covers the partial block at the end */
                int xblocks = (task.shader_w + xthreads - 1)/xthreads;

                cuda_assert(cuFuncSetCacheConfig(cuDisplace, CU_FUNC_CACHE_PREFER_L1))
                cuda_assert(cuFuncSetBlockShape(cuDisplace, xthreads, 1, 1))
                cuda_assert(cuLaunchGrid(cuDisplace, xblocks, 1))

                cuda_pop_context();
        }
 
647
 
 
648
        CUdeviceptr map_pixels(device_ptr mem)
 
649
        {
 
650
                if(!background) {
 
651
                        PixelMem pmem = pixel_mem_map[mem];
 
652
                        CUdeviceptr buffer;
 
653
                        
 
654
                        size_t bytes;
 
655
                        cuda_assert(cuGraphicsMapResources(1, &pmem.cuPBOresource, 0))
 
656
                        cuda_assert(cuGraphicsResourceGetMappedPointer(&buffer, &bytes, pmem.cuPBOresource))
 
657
                        
 
658
                        return buffer;
 
659
                }
 
660
 
 
661
                return cuda_device_ptr(mem);
 
662
        }
 
663
 
 
664
        void unmap_pixels(device_ptr mem)
 
665
        {
 
666
                if(!background) {
 
667
                        PixelMem pmem = pixel_mem_map[mem];
 
668
 
 
669
                        cuda_assert(cuGraphicsUnmapResources(1, &pmem.cuPBOresource, 0))
 
670
                }
 
671
        }
 
672
 
 
673
	/* Allocate a display pixel buffer. In interop mode this creates a GL PBO
	 * (for CUDA kernels to write into) plus a GL texture (for draw_pixels()
	 * to display), and registers the PBO with CUDA; mem.device_pointer then
	 * holds the GL texture id, with the full PixelMem kept in pixel_mem_map.
	 * If registration fails, interop is disabled for this device and we fall
	 * back to the generic Device::pixels_alloc() path. */
	void pixels_alloc(device_memory& mem)
	{
		if(!background) {
			PixelMem pmem;

			pmem.w = mem.data_width;
			pmem.h = mem.data_height;

			cuda_push_context();

			/* NOTE(review): PBO is sized 3 floats per pixel, while the texture
			 * below and pixels_copy_from() treat the data as 4 bytes per pixel
			 * (RGBA ubyte) — presumably an over-allocation leftover; confirm
			 * against the tonemap kernel before changing the size. */
			glGenBuffers(1, &pmem.cuPBO);
			glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
			glBufferData(GL_PIXEL_UNPACK_BUFFER, pmem.w*pmem.h*sizeof(GLfloat)*3, NULL, GL_DYNAMIC_DRAW);
			
			glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
			
			/* texture used by draw_pixels() to display the PBO contents */
			glGenTextures(1, &pmem.cuTexId);
			glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
			glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, pmem.w, pmem.h, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
			glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
			glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
			glBindTexture(GL_TEXTURE_2D, 0);
			
			/* make the PBO accessible to CUDA via the graphics interop API */
			CUresult result = cuGraphicsGLRegisterBuffer(&pmem.cuPBOresource, pmem.cuPBO, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);

			if(!cuda_error(result)) {
				cuda_pop_context();

				/* the texture id doubles as the device_pointer key */
				mem.device_pointer = pmem.cuTexId;
				pixel_mem_map[mem.device_pointer] = pmem;

				return;
			}
			else {
				/* failed to register buffer, fallback to no interop */
				glDeleteBuffers(1, &pmem.cuPBO);
				glDeleteTextures(1, &pmem.cuTexId);

				cuda_pop_context();

				background = true;
			}
		}

		Device::pixels_alloc(mem);
	}
 
719
 
 
720
        void pixels_copy_from(device_memory& mem, int y, int w, int h)
 
721
        {
 
722
                if(!background) {
 
723
                        PixelMem pmem = pixel_mem_map[mem.device_pointer];
 
724
 
 
725
                        cuda_push_context();
 
726
 
 
727
                        glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
 
728
                        uchar *pixels = (uchar*)glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_READ_ONLY);
 
729
                        size_t offset = sizeof(uchar)*4*y*w;
 
730
                        memcpy((uchar*)mem.data_pointer + offset, pixels + offset, sizeof(uchar)*4*w*h);
 
731
                        glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
 
732
                        glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
 
733
 
 
734
                        cuda_pop_context();
 
735
 
 
736
                        return;
 
737
                }
 
738
 
 
739
                Device::pixels_copy_from(mem, y, w, h);
 
740
        }
 
741
 
 
742
        void pixels_free(device_memory& mem)
 
743
        {
 
744
                if(mem.device_pointer) {
 
745
                        if(!background) {
 
746
                                PixelMem pmem = pixel_mem_map[mem.device_pointer];
 
747
 
 
748
                                cuda_push_context();
 
749
 
 
750
                                cuda_assert(cuGraphicsUnregisterResource(pmem.cuPBOresource))
 
751
                                glDeleteBuffers(1, &pmem.cuPBO);
 
752
                                glDeleteTextures(1, &pmem.cuTexId);
 
753
 
 
754
                                cuda_pop_context();
 
755
 
 
756
                                pixel_mem_map.erase(pixel_mem_map.find(mem.device_pointer));
 
757
                                mem.device_pointer = 0;
 
758
 
 
759
                                return;
 
760
                        }
 
761
 
 
762
                        Device::pixels_free(mem);
 
763
                }
 
764
        }
 
765
 
 
766
	/* Draw a rendered pixel buffer into the current GL context. In interop
	 * mode the PBO region is uploaded into the texture and drawn as a single
	 * textured quad scaled to width x height, optionally blended with
	 * premultiplied alpha; otherwise drawing is delegated to the generic
	 * Device implementation. */
	void draw_pixels(device_memory& mem, int y, int w, int h, int dy, int width, int height, bool transparent)
	{
		if(!background) {
			PixelMem pmem = pixel_mem_map[mem.device_pointer];

			cuda_push_context();

			/* for multi devices, this assumes the ineffecient method that we allocate
			   all pixels on the device even though we only render to a subset */
			size_t offset = sizeof(uint8_t)*4*y*w;

			/* with a PBO bound, the last glTexSubImage2D argument is a byte
			   offset into the PBO, not a host pointer */
			glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, pmem.cuPBO);
			glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
			glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_UNSIGNED_BYTE, (void*)offset);
			glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, 0);
			
			glEnable(GL_TEXTURE_2D);
			
			if(transparent) {
				/* GL_ONE source factor: colors are premultiplied by alpha */
				glEnable(GL_BLEND);
				glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA);
			}

			glColor3f(1.0f, 1.0f, 1.0f);

			glPushMatrix();
			glTranslatef(0.0f, (float)dy, 0.0f);
				
			/* one quad covering width x height, sampling only the w x h
			   sub-rectangle of the pmem.w x pmem.h texture */
			glBegin(GL_QUADS);
			
			glTexCoord2f(0.0f, 0.0f);
			glVertex2f(0.0f, 0.0f);
			glTexCoord2f((float)w/(float)pmem.w, 0.0f);
			glVertex2f((float)width, 0.0f);
			glTexCoord2f((float)w/(float)pmem.w, (float)h/(float)pmem.h);
			glVertex2f((float)width, (float)height);
			glTexCoord2f(0.0f, (float)h/(float)pmem.h);
			glVertex2f(0.0f, (float)height);

			glEnd();

			glPopMatrix();

			if(transparent)
				glDisable(GL_BLEND);
			
			glBindTexture(GL_TEXTURE_2D, 0);
			glDisable(GL_TEXTURE_2D);

			cuda_pop_context();

			return;
		}

		Device::draw_pixels(mem, y, w, h, dy, width, height, transparent);
	}
 
822
 
 
823
        void task_add(DeviceTask& task)
 
824
        {
 
825
                if(task.type == DeviceTask::TONEMAP)
 
826
                        tonemap(task);
 
827
                else if(task.type == DeviceTask::PATH_TRACE)
 
828
                        path_trace(task);
 
829
                else if(task.type == DeviceTask::SHADER)
 
830
                        shader(task);
 
831
        }
 
832
 
 
833
	/* Block until all previously launched CUDA work on this device's
	 * context has completed. */
	void task_wait()
	{
		cuda_push_context();

		cuda_assert(cuCtxSynchronize())

		cuda_pop_context();
	}
 
841
 
 
842
	/* Cancellation is not supported by this device; tasks run to
	 * completion (callers can use task_wait() to synchronize). */
	void task_cancel()
	{
	}
 
845
};
 
846
 
 
847
Device *device_cuda_create(DeviceInfo& info, bool background)
 
848
{
 
849
        return new CUDADevice(info, background);
 
850
}
 
851
 
 
852
void device_cuda_info(vector<DeviceInfo>& devices)
 
853
{
 
854
        int count = 0;
 
855
 
 
856
        if(cuInit(0) != CUDA_SUCCESS)
 
857
                return;
 
858
        if(cuDeviceGetCount(&count) != CUDA_SUCCESS)
 
859
                return;
 
860
        
 
861
        vector<DeviceInfo> display_devices;
 
862
        
 
863
        for(int num = 0; num < count; num++) {
 
864
                char name[256];
 
865
                int attr;
 
866
                
 
867
                if(cuDeviceGetName(name, 256, num) != CUDA_SUCCESS)
 
868
                        continue;
 
869
 
 
870
                DeviceInfo info;
 
871
 
 
872
                info.type = DEVICE_CUDA;
 
873
                info.description = string(name);
 
874
                info.id = string_printf("CUDA_%d", num);
 
875
                info.num = num;
 
876
 
 
877
                int major, minor;
 
878
                cuDeviceComputeCapability(&major, &minor, num);
 
879
                info.advanced_shading = (major >= 2);
 
880
 
 
881
                /* if device has a kernel timeout, assume it is used for display */
 
882
                if(cuDeviceGetAttribute(&attr, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, num) == CUDA_SUCCESS && attr == 1) {
 
883
                        info.display_device = true;
 
884
                        display_devices.push_back(info);
 
885
                }
 
886
                else
 
887
                        devices.push_back(info);
 
888
        }
 
889
 
 
890
        if(!display_devices.empty())
 
891
                devices.insert(devices.end(), display_devices.begin(), display_devices.end());
 
892
}
 
893
 
 
894
CCL_NAMESPACE_END
 
895