/*
 * Copyright 2011, Blender Foundation.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "device.h"
#include "device_intern.h"

#include "util_cuda.h"
#include "util_debug.h"
#include "util_map.h"
#include "util_opengl.h"
#include "util_path.h"
#include "util_string.h"
#include "util_system.h"
#include "util_types.h"
#include "util_time.h"
class CUDADevice : public Device

map<device_ptr, bool> tex_interp_map;

CUgraphicsResource cuPBOresource;

map<device_ptr, PixelMem> pixel_mem_map;

CUdeviceptr cuda_device_ptr(device_ptr mem)
return (CUdeviceptr)mem;
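/* human readable messages for CUresult error codes */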
const char *cuda_error_string(CUresult result)
case CUDA_SUCCESS: return "No errors";
case CUDA_ERROR_INVALID_VALUE: return "Invalid value";
case CUDA_ERROR_OUT_OF_MEMORY: return "Out of memory";
case CUDA_ERROR_NOT_INITIALIZED: return "Driver not initialized";
case CUDA_ERROR_DEINITIALIZED: return "Driver deinitialized";
case CUDA_ERROR_NO_DEVICE: return "No CUDA-capable device available";
case CUDA_ERROR_INVALID_DEVICE: return "Invalid device";
case CUDA_ERROR_INVALID_IMAGE: return "Invalid kernel image";
case CUDA_ERROR_INVALID_CONTEXT: return "Invalid context";
case CUDA_ERROR_CONTEXT_ALREADY_CURRENT: return "Context already current";
case CUDA_ERROR_MAP_FAILED: return "Map failed";
case CUDA_ERROR_UNMAP_FAILED: return "Unmap failed";
case CUDA_ERROR_ARRAY_IS_MAPPED: return "Array is mapped";
case CUDA_ERROR_ALREADY_MAPPED: return "Already mapped";
case CUDA_ERROR_NO_BINARY_FOR_GPU: return "No binary for GPU";
case CUDA_ERROR_ALREADY_ACQUIRED: return "Already acquired";
case CUDA_ERROR_NOT_MAPPED: return "Not mapped";
case CUDA_ERROR_NOT_MAPPED_AS_ARRAY: return "Mapped resource not available for access as an array";
case CUDA_ERROR_NOT_MAPPED_AS_POINTER: return "Mapped resource not available for access as a pointer";
case CUDA_ERROR_ECC_UNCORRECTABLE: return "Uncorrectable ECC error detected";
case CUDA_ERROR_UNSUPPORTED_LIMIT: return "CUlimit not supported by device";
case CUDA_ERROR_INVALID_SOURCE: return "Invalid source";
case CUDA_ERROR_FILE_NOT_FOUND: return "File not found";
case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: return "Link to a shared object failed to resolve";
case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED: return "Shared object initialization failed";
case CUDA_ERROR_INVALID_HANDLE: return "Invalid handle";
case CUDA_ERROR_NOT_FOUND: return "Not found";
case CUDA_ERROR_NOT_READY: return "CUDA not ready";
case CUDA_ERROR_LAUNCH_FAILED: return "Launch failed";
case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: return "Launch exceeded resources";
case CUDA_ERROR_LAUNCH_TIMEOUT: return "Launch exceeded timeout";
case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: return "Launch with incompatible texturing";
case CUDA_ERROR_UNKNOWN: return "Unknown error";
default: return "Unknown CUDA error value";
#define cuda_abort() abort()

#define cuda_assert(stmt) \
CUresult result = stmt; \
if(result != CUDA_SUCCESS) { \
string message = string_printf("CUDA error: %s in %s", cuda_error_string(result), #stmt); \
if(error_msg == "") \
error_msg = message; \
fprintf(stderr, "%s\n", message.c_str()); \

bool cuda_error(CUresult result)
if(result == CUDA_SUCCESS)

string message = string_printf("CUDA error: %s", cuda_error_string(result));
fprintf(stderr, "%s\n", message.c_str());

void cuda_error(const string& message)
fprintf(stderr, "%s\n", message.c_str());

void cuda_push_context()
cuda_assert(cuCtxSetCurrent(cuContext))

void cuda_pop_context()
cuda_assert(cuCtxSetCurrent(NULL));
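/* constructor: initialize the driver API and create a context on the selected
 * device; in background mode a plain context is created, otherwise a GL
 * interop context is tried first with a plain context as fallback */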
CUDADevice(DeviceInfo& info, bool background_)
background = background_;

if(cuda_error(cuInit(0)))

/* setup device and context */
if(cuda_error(cuDeviceGet(&cuDevice, cuDevId)))

result = cuCtxCreate(&cuContext, 0, cuDevice);

result = cuGLCtxCreate(&cuContext, 0, cuDevice);

if(result != CUDA_SUCCESS) {
result = cuCtxCreate(&cuContext, 0, cuDevice);

if(cuda_error(result))

cuda_assert(cuCtxDetach(cuContext))

bool support_device(bool experimental)
cuDeviceComputeCapability(&major, &minor, cuDevId);

if(major <= 1 && minor <= 2) {
cuda_error(string_printf("CUDA device supported only with compute capability 1.3 or up, found %d.%d.", major, minor));
string compile_kernel()
/* compute cubin name */
cuDeviceComputeCapability(&major, &minor, cuDevId);

/* attempt to use kernel provided with Blender */
string cubin = path_get(string_printf("lib/kernel_sm_%d%d.cubin", major, minor));
if(path_exists(cubin))

/* not found, try to use locally compiled kernel */
string kernel_path = path_get("kernel");
string md5 = path_files_md5_hash(kernel_path);

cubin = string_printf("cycles_kernel_sm%d%d_%s.cubin", major, minor, md5.c_str());
cubin = path_user_get(path_join("cache", cubin));

/* if it exists already, use it */
if(path_exists(cubin))

#if defined(WITH_CUDA_BINARIES) && defined(_WIN32)
if(major <= 1 && minor <= 2)
cuda_error(string_printf("CUDA device supported only with compute capability 1.3 or up, found %d.%d.", major, minor));

cuda_error(string_printf("CUDA binary kernel for this graphics card compute capability (%d.%d) not found.", major, minor));

/* if not, find CUDA compiler */
string nvcc = cuCompilerPath();

cuda_error("CUDA nvcc compiler not found. Install CUDA toolkit in default location.");

string kernel = path_join(kernel_path, "kernel.cu");
string include = kernel_path;
const int machine = system_cpu_bits();
const int maxreg = 24;

double starttime = time_dt();
printf("Compiling CUDA kernel ...\n");

path_create_directories(cubin);

string command = string_printf("\"%s\" -arch=sm_%d%d -m%d --cubin \"%s\" --use_fast_math "
"-o \"%s\" --ptxas-options=\"-v\" --maxrregcount=%d --opencc-options -OPT:Olimit=0 -I\"%s\" -DNVCC",
nvcc.c_str(), major, minor, machine, kernel.c_str(), cubin.c_str(), maxreg, include.c_str());

if(system(command.c_str()) == -1) {
cuda_error("Failed to execute compilation command, see console for details.");

/* verify if compilation succeeded */
if(!path_exists(cubin)) {
cuda_error("CUDA kernel compilation failed, see console for details.");

printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime);
bool load_kernels(bool experimental)
/* check if cuda init succeeded */

if(!support_device(experimental))

string cubin = compile_kernel();

CUresult result = cuModuleLoad(&cuModule, cubin.c_str());
if(cuda_error(result))
cuda_error(string_printf("Failed loading CUDA kernel %s.", cubin.c_str()));

return (result == CUDA_SUCCESS);
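/* device memory: plain linear allocations addressed through opaque
 * device_ptr handles (the CUdeviceptr cast to an integer) */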
void mem_alloc(device_memory& mem, MemoryType type)
CUdeviceptr device_pointer;
cuda_assert(cuMemAlloc(&device_pointer, mem.memory_size()))
mem.device_pointer = (device_ptr)device_pointer;

void mem_copy_to(device_memory& mem)
cuda_assert(cuMemcpyHtoD(cuda_device_ptr(mem.device_pointer), (void*)mem.data_pointer, mem.memory_size()))

void mem_copy_from(device_memory& mem, int y, int w, int h, int elem)
size_t offset = elem*y*w;
size_t size = elem*w*h;

cuda_assert(cuMemcpyDtoH((uchar*)mem.data_pointer + offset,
(CUdeviceptr)((uchar*)mem.device_pointer + offset), size))

void mem_zero(device_memory& mem)
memset((void*)mem.data_pointer, 0, mem.memory_size());

cuda_assert(cuMemsetD8(cuda_device_ptr(mem.device_pointer), 0, mem.memory_size()))

void mem_free(device_memory& mem)
if(mem.device_pointer) {
cuda_assert(cuMemFree(cuda_device_ptr(mem.device_pointer)))

mem.device_pointer = 0;

void const_copy_to(const char *name, void *host, size_t size)
cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name))
//assert(bytes == size);
cuda_assert(cuMemcpyHtoD(mem, host, size))
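/* textures: data with interpolation goes into a CUDA array bound to the
 * kernel's texture reference (linear filtering, normalized coordinates);
 * everything else is uploaded as linear memory and read as integers */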
void tex_alloc(const char *name, device_memory& mem, bool interpolation, bool periodic)
/* determine format */
CUarray_format_enum format;
size_t dsize = datatype_size(mem.data_type);
size_t size = mem.memory_size();

switch(mem.data_type) {
case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; break;
case TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; break;
case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; break;
case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break;
default: assert(0); return;

cuda_assert(cuModuleGetTexRef(&texref, cuModule, name))

CUDA_ARRAY_DESCRIPTOR desc;
desc.Width = mem.data_width;
desc.Height = mem.data_height;
desc.Format = format;
desc.NumChannels = mem.data_elements;

cuda_assert(cuArrayCreate(&handle, &desc))

if(mem.data_height > 1) {
memset(&param, 0, sizeof(param));
param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
param.dstArray = handle;
param.srcMemoryType = CU_MEMORYTYPE_HOST;
param.srcHost = (void*)mem.data_pointer;
param.srcPitch = mem.data_width*dsize*mem.data_elements;
param.WidthInBytes = param.srcPitch;
param.Height = mem.data_height;

cuda_assert(cuMemcpy2D(&param))

cuda_assert(cuMemcpyHtoA(handle, 0, (void*)mem.data_pointer, size))

cuda_assert(cuTexRefSetArray(texref, handle, CU_TRSA_OVERRIDE_FORMAT))

cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR))
cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES))

mem.device_pointer = (device_ptr)handle;

mem_alloc(mem, MEM_READ_ONLY);

cuda_assert(cuTexRefSetAddress(NULL, texref, cuda_device_ptr(mem.device_pointer), size))
cuda_assert(cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_POINT))
cuda_assert(cuTexRefSetFlags(texref, CU_TRSF_READ_AS_INTEGER))

cuda_assert(cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_WRAP))
cuda_assert(cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_WRAP))

cuda_assert(cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_CLAMP))
cuda_assert(cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_CLAMP))

cuda_assert(cuTexRefSetFormat(texref, format, mem.data_elements))

tex_interp_map[mem.device_pointer] = interpolation;

void tex_free(device_memory& mem)
if(mem.device_pointer) {
if(tex_interp_map[mem.device_pointer]) {
cuArrayDestroy((CUarray)mem.device_pointer);

tex_interp_map.erase(tex_interp_map.find(mem.device_pointer));
mem.device_pointer = 0;

tex_interp_map.erase(tex_interp_map.find(mem.device_pointer));
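/* kernel launches use the pre-CUDA 4.0 execution control API: each argument
 * is written into the parameter block at a manually aligned offset with
 * cuParamSetv/cuParamSeti before cuLaunchGrid */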
void path_trace(DeviceTask& task)
CUfunction cuPathTrace;
CUdeviceptr d_buffer = cuda_device_ptr(task.buffer);
CUdeviceptr d_rng_state = cuda_device_ptr(task.rng_state);

/* get kernel function */
cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace"))

/* pass in parameters */
cuda_assert(cuParamSetv(cuPathTrace, offset, &d_buffer, sizeof(d_buffer)))
offset += sizeof(d_buffer);

cuda_assert(cuParamSetv(cuPathTrace, offset, &d_rng_state, sizeof(d_rng_state)))
offset += sizeof(d_rng_state);

int sample = task.sample;
offset = align_up(offset, __alignof(sample));

cuda_assert(cuParamSeti(cuPathTrace, offset, task.sample))
offset += sizeof(task.sample);

cuda_assert(cuParamSeti(cuPathTrace, offset, task.x))
offset += sizeof(task.x);

cuda_assert(cuParamSeti(cuPathTrace, offset, task.y))
offset += sizeof(task.y);

cuda_assert(cuParamSeti(cuPathTrace, offset, task.w))
offset += sizeof(task.w);

cuda_assert(cuParamSeti(cuPathTrace, offset, task.h))
offset += sizeof(task.h);

cuda_assert(cuParamSeti(cuPathTrace, offset, task.offset))
offset += sizeof(task.offset);

cuda_assert(cuParamSeti(cuPathTrace, offset, task.stride))
offset += sizeof(task.stride);

cuda_assert(cuParamSetSize(cuPathTrace, offset))

/* launch kernel: todo find optimal size, cache config for fermi */
int xblocks = (task.w + xthreads - 1)/xthreads;
int yblocks = (task.h + ythreads - 1)/ythreads;

cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1))
cuda_assert(cuFuncSetBlockShape(cuPathTrace, xthreads, ythreads, 1))
cuda_assert(cuLaunchGrid(cuPathTrace, xblocks, yblocks))
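/* tonemap/film convert: runs kernel_cuda_tonemap to turn the accumulated
 * render buffer into display RGBA pixels, written into the mapped pixel buffer */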
void tonemap(DeviceTask& task)
CUfunction cuFilmConvert;
CUdeviceptr d_rgba = map_pixels(task.rgba);
CUdeviceptr d_buffer = cuda_device_ptr(task.buffer);

/* get kernel function */
cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_tonemap"))

/* pass in parameters */
cuda_assert(cuParamSetv(cuFilmConvert, offset, &d_rgba, sizeof(d_rgba)))
offset += sizeof(d_rgba);

cuda_assert(cuParamSetv(cuFilmConvert, offset, &d_buffer, sizeof(d_buffer)))
offset += sizeof(d_buffer);

int sample = task.sample;
offset = align_up(offset, __alignof(sample));

cuda_assert(cuParamSeti(cuFilmConvert, offset, task.sample))
offset += sizeof(task.sample);

cuda_assert(cuParamSeti(cuFilmConvert, offset, task.resolution))
offset += sizeof(task.resolution);

cuda_assert(cuParamSeti(cuFilmConvert, offset, task.x))
offset += sizeof(task.x);

cuda_assert(cuParamSeti(cuFilmConvert, offset, task.y))
offset += sizeof(task.y);

cuda_assert(cuParamSeti(cuFilmConvert, offset, task.w))
offset += sizeof(task.w);

cuda_assert(cuParamSeti(cuFilmConvert, offset, task.h))
offset += sizeof(task.h);

cuda_assert(cuParamSeti(cuFilmConvert, offset, task.offset))
offset += sizeof(task.offset);

cuda_assert(cuParamSeti(cuFilmConvert, offset, task.stride))
offset += sizeof(task.stride);

cuda_assert(cuParamSetSize(cuFilmConvert, offset))

/* launch kernel: todo find optimal size, cache config for fermi */
int xblocks = (task.w + xthreads - 1)/xthreads;
int yblocks = (task.h + ythreads - 1)/ythreads;

cuda_assert(cuFuncSetCacheConfig(cuFilmConvert, CU_FUNC_CACHE_PREFER_L1))
cuda_assert(cuFuncSetBlockShape(cuFilmConvert, xthreads, ythreads, 1))
cuda_assert(cuLaunchGrid(cuFilmConvert, xblocks, yblocks))

unmap_pixels(task.rgba);
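/* shader evaluation kernel (kernel_cuda_shader), used e.g. for displacement:
 * reads shader_input and writes results to shader_output in a 1D launch */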
void shader(DeviceTask& task)
CUfunction cuDisplace;
CUdeviceptr d_input = cuda_device_ptr(task.shader_input);
CUdeviceptr d_offset = cuda_device_ptr(task.shader_output);

/* get kernel function */
cuda_assert(cuModuleGetFunction(&cuDisplace, cuModule, "kernel_cuda_shader"))

/* pass in parameters */
cuda_assert(cuParamSetv(cuDisplace, offset, &d_input, sizeof(d_input)))
offset += sizeof(d_input);

cuda_assert(cuParamSetv(cuDisplace, offset, &d_offset, sizeof(d_offset)))
offset += sizeof(d_offset);

int shader_eval_type = task.shader_eval_type;
offset = align_up(offset, __alignof(shader_eval_type));

cuda_assert(cuParamSeti(cuDisplace, offset, task.shader_eval_type))
offset += sizeof(task.shader_eval_type);

cuda_assert(cuParamSeti(cuDisplace, offset, task.shader_x))
offset += sizeof(task.shader_x);

cuda_assert(cuParamSetSize(cuDisplace, offset))

/* launch kernel: todo find optimal size, cache config for fermi */
int xblocks = (task.shader_w + xthreads - 1)/xthreads;

cuda_assert(cuFuncSetCacheConfig(cuDisplace, CU_FUNC_CACHE_PREFER_L1))
cuda_assert(cuFuncSetBlockShape(cuDisplace, xthreads, 1, 1))
cuda_assert(cuLaunchGrid(cuDisplace, xblocks, 1))
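/* GL interop pixel buffer: map/unmap the registered PBO so CUDA kernels can
 * write display pixels directly into it */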
CUdeviceptr map_pixels(device_ptr mem)
PixelMem pmem = pixel_mem_map[mem];

cuda_assert(cuGraphicsMapResources(1, &pmem.cuPBOresource, 0))
cuda_assert(cuGraphicsResourceGetMappedPointer(&buffer, &bytes, pmem.cuPBOresource))

return cuda_device_ptr(mem);

void unmap_pixels(device_ptr mem)
PixelMem pmem = pixel_mem_map[mem];

cuda_assert(cuGraphicsUnmapResources(1, &pmem.cuPBOresource, 0))
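/* allocate display pixels as an OpenGL PBO plus texture and register the PBO
 * with CUDA; if registration fails, fall back to regular device memory via
 * Device::pixels_alloc */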
void pixels_alloc(device_memory& mem)
pmem.w = mem.data_width;
pmem.h = mem.data_height;

glGenBuffers(1, &pmem.cuPBO);
glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
glBufferData(GL_PIXEL_UNPACK_BUFFER, pmem.w*pmem.h*sizeof(GLfloat)*3, NULL, GL_DYNAMIC_DRAW);

glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);

glGenTextures(1, &pmem.cuTexId);
glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, pmem.w, pmem.h, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
glBindTexture(GL_TEXTURE_2D, 0);

CUresult result = cuGraphicsGLRegisterBuffer(&pmem.cuPBOresource, pmem.cuPBO, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);

if(!cuda_error(result)) {
mem.device_pointer = pmem.cuTexId;
pixel_mem_map[mem.device_pointer] = pmem;

/* failed to register buffer, fallback to no interop */
glDeleteBuffers(1, &pmem.cuPBO);
glDeleteTextures(1, &pmem.cuTexId);

Device::pixels_alloc(mem);
void pixels_copy_from(device_memory& mem, int y, int w, int h)
PixelMem pmem = pixel_mem_map[mem.device_pointer];

glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
uchar *pixels = (uchar*)glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_READ_ONLY);
size_t offset = sizeof(uchar)*4*y*w;
memcpy((uchar*)mem.data_pointer + offset, pixels + offset, sizeof(uchar)*4*w*h);
glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);

Device::pixels_copy_from(mem, y, w, h);

void pixels_free(device_memory& mem)
if(mem.device_pointer) {
PixelMem pmem = pixel_mem_map[mem.device_pointer];

cuda_assert(cuGraphicsUnregisterResource(pmem.cuPBOresource))
glDeleteBuffers(1, &pmem.cuPBO);
glDeleteTextures(1, &pmem.cuTexId);

pixel_mem_map.erase(pixel_mem_map.find(mem.device_pointer));
mem.device_pointer = 0;

Device::pixels_free(mem);
void draw_pixels(device_memory& mem, int y, int w, int h, int dy, int width, int height, bool transparent)
PixelMem pmem = pixel_mem_map[mem.device_pointer];

/* for multi devices, this assumes the inefficient method where we allocate
   all pixels on the device even though we only render to a subset */
size_t offset = sizeof(uint8_t)*4*y*w;

glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, pmem.cuPBO);
glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_UNSIGNED_BYTE, (void*)offset);
glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_ARB, 0);

glEnable(GL_TEXTURE_2D);

glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA);

glColor3f(1.0f, 1.0f, 1.0f);

glTranslatef(0.0f, (float)dy, 0.0f);

glTexCoord2f(0.0f, 0.0f);
glVertex2f(0.0f, 0.0f);
glTexCoord2f((float)w/(float)pmem.w, 0.0f);
glVertex2f((float)width, 0.0f);
glTexCoord2f((float)w/(float)pmem.w, (float)h/(float)pmem.h);
glVertex2f((float)width, (float)height);
glTexCoord2f(0.0f, (float)h/(float)pmem.h);
glVertex2f(0.0f, (float)height);

glBindTexture(GL_TEXTURE_2D, 0);
glDisable(GL_TEXTURE_2D);

Device::draw_pixels(mem, y, w, h, dy, width, height, transparent);
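/* dispatch a device task to the corresponding handler above */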
void task_add(DeviceTask& task)
if(task.type == DeviceTask::TONEMAP)

else if(task.type == DeviceTask::PATH_TRACE)

else if(task.type == DeviceTask::SHADER)

cuda_assert(cuCtxSynchronize())
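/* entry points used by the device registry: create a CUDADevice instance and
 * enumerate available CUDA devices; devices with a kernel watchdog timeout
 * are assumed to drive a display and are listed last */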
Device *device_cuda_create(DeviceInfo& info, bool background)
return new CUDADevice(info, background);

void device_cuda_info(vector<DeviceInfo>& devices)
if(cuInit(0) != CUDA_SUCCESS)

if(cuDeviceGetCount(&count) != CUDA_SUCCESS)

vector<DeviceInfo> display_devices;

for(int num = 0; num < count; num++) {
if(cuDeviceGetName(name, 256, num) != CUDA_SUCCESS)

info.type = DEVICE_CUDA;
info.description = string(name);
info.id = string_printf("CUDA_%d", num);

cuDeviceComputeCapability(&major, &minor, num);
info.advanced_shading = (major >= 2);

/* if device has a kernel timeout, assume it is used for display */
if(cuDeviceGetAttribute(&attr, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, num) == CUDA_SUCCESS && attr == 1) {
info.display_device = true;
display_devices.push_back(info);

devices.push_back(info);

if(!display_devices.empty())
devices.insert(devices.end(), display_devices.begin(), display_devices.end());