	void mem_free(device_memory& mem)
	{
		mem.device_pointer = 0;

		stats.mem_free(mem.memory_size());
	}
	void const_copy_to(const char *name, void *host, size_t size)
	{
		kernel_const_copy(&kernel_globals, name, host, size);
	}
	void tex_alloc(const char *name, device_memory& mem, bool interpolation, bool periodic)
	{
		kernel_tex_copy(&kernel_globals, name, mem.data_pointer, mem.data_width, mem.data_height);
		mem.device_pointer = mem.data_pointer;

		stats.mem_alloc(mem.memory_size());
	}
	void tex_free(device_memory& mem)
	{
		mem.device_pointer = 0;

		stats.mem_free(mem.memory_size());
	}
	void *osl_memory()
	{
#ifdef WITH_OSL
		return &osl_globals;
#else
		return NULL;
#endif
	}
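	/* Entry point for tasks pushed to the task pool; runs on a worker
	 * thread and dispatches on the requested task type. */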
	void thread_run(DeviceTask *task)
	{
		if(task->type == DeviceTask::PATH_TRACE)
			thread_path_trace(*task);
		else if(task->type == DeviceTask::TONEMAP)
			thread_tonemap(*task);
		else if(task->type == DeviceTask::SHADER)
			thread_shader(*task);
	}
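	/* DeviceTask wrapper that binds its run callback back to the owning
	 * CPUDevice, so plain tasks can be pushed onto the task pool. */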
	class CPUDeviceTask : public DeviceTask {
	public:
		CPUDeviceTask(CPUDevice *device, DeviceTask& task)
		: DeviceTask(task)
		{
			run = function_bind(&CPUDevice::thread_run, device, this);
		}
	};
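	/* Keep acquiring tiles from the task and path trace them sample by
	 * sample. A cancel request only takes effect immediately when the task
	 * does not still need its finish queue processed. */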
	void thread_path_trace(DeviceTask& task)
	{
		if(task_pool.cancelled()) {
			if(task.need_finish_queue == false)
				return;
		}

		KernelGlobals kg = kernel_globals;

#ifdef WITH_OSL
		OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
#endif

		RenderTile tile;

		while(task.acquire_tile(this, tile)) {
			float *render_buffer = (float*)tile.buffer;
			uint *rng_state = (uint*)tile.rng_state;
			int start_sample = tile.start_sample;
			int end_sample = tile.start_sample + tile.num_samples;

#ifdef WITH_OPTIMIZED_KERNEL
			/* pick the fastest kernel variant the CPU supports at runtime */
			if(system_cpu_support_sse3()) {
				for(int sample = start_sample; sample < end_sample; sample++) {
					if(task.get_cancel() || task_pool.cancelled()) {
						if(task.need_finish_queue == false)
							break;
					}

					for(int y = tile.y; y < tile.y + tile.h; y++) {
						for(int x = tile.x; x < tile.x + tile.w; x++) {
							kernel_cpu_sse3_path_trace(&kg, render_buffer, rng_state,
								sample, x, y, tile.offset, tile.stride);
						}
					}

					tile.sample = sample + 1;

					task.update_progress(tile);
				}
			}
			else if(system_cpu_support_sse2()) {
				for(int sample = start_sample; sample < end_sample; sample++) {
					if(task.get_cancel() || task_pool.cancelled()) {
						if(task.need_finish_queue == false)
							break;
					}

					for(int y = tile.y; y < tile.y + tile.h; y++) {
						for(int x = tile.x; x < tile.x + tile.w; x++) {
							kernel_cpu_sse2_path_trace(&kg, render_buffer, rng_state,
								sample, x, y, tile.offset, tile.stride);
						}
					}

					tile.sample = sample + 1;

					task.update_progress(tile);
				}
			}
			else
#endif
			{
				for(int sample = start_sample; sample < end_sample; sample++) {
					if(task.get_cancel() || task_pool.cancelled()) {
						if(task.need_finish_queue == false)
							break;
					}

					for(int y = tile.y; y < tile.y + tile.h; y++) {
						for(int x = tile.x; x < tile.x + tile.w; x++) {
							kernel_cpu_path_trace(&kg, render_buffer, rng_state,
								sample, x, y, tile.offset, tile.stride);
						}
					}

					tile.sample = sample + 1;

					task.update_progress(tile);
				}
			}

			task.release_tile(tile);

			if(task_pool.cancelled()) {
				if(task.need_finish_queue == false)
					break;
			}
		}

#ifdef WITH_OSL
		OSLShader::thread_free(&kg);
#endif
	}
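	/* Tonemap the float render buffer into 8-bit RGBA for display, again
	 * selecting the fastest kernel variant the CPU supports at runtime. */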
	void thread_tonemap(DeviceTask& task)
	{
#ifdef WITH_OPTIMIZED_KERNEL
		if(system_cpu_support_sse3()) {
			for(int y = task.y; y < task.y + task.h; y++)
				for(int x = task.x; x < task.x + task.w; x++)
					kernel_cpu_sse3_tonemap(&kernel_globals, (uchar4*)task.rgba, (float*)task.buffer,
						task.sample, task.resolution, x, y, task.offset, task.stride);
		}
		else if(system_cpu_support_sse2()) {
			for(int y = task.y; y < task.y + task.h; y++)
				for(int x = task.x; x < task.x + task.w; x++)
					kernel_cpu_sse2_tonemap(&kernel_globals, (uchar4*)task.rgba, (float*)task.buffer,
						task.sample, task.resolution, x, y, task.offset, task.stride);
		}
		else
#endif
		{
			for(int y = task.y; y < task.y + task.h; y++)
				for(int x = task.x; x < task.x + task.w; x++)
					kernel_cpu_tonemap(&kernel_globals, (uchar4*)task.rgba, (float*)task.buffer,
						task.sample, task.resolution, x, y, task.offset, task.stride);
		}
	}
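	/* Evaluate the shader kernel over the task's range of input points,
	 * writing one float4 output per uint4 input. */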
	void thread_shader(DeviceTask& task)
	{
		KernelGlobals kg = kernel_globals;

#ifdef WITH_OSL
		OSLShader::thread_init(&kg, &kernel_globals, &osl_globals);
#endif

#ifdef WITH_OPTIMIZED_KERNEL
		if(system_cpu_support_sse3()) {
			for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
				kernel_cpu_sse3_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);

				if(task_pool.cancelled())
					break;
			}
		}
		else if(system_cpu_support_sse2()) {
			for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
				kernel_cpu_sse2_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);

				if(task_pool.cancelled())
					break;
			}
		}
		else
#endif
		{
			for(int x = task.shader_x; x < task.shader_x + task.shader_w; x++) {
				kernel_cpu_shader(&kg, (uint4*)task.shader_input, (float4*)task.shader_output, task.shader_eval_type, x);

				if(task_pool.cancelled())
					break;
			}
		}

#ifdef WITH_OSL
		OSLShader::thread_free(&kg);
#endif
	}
	void task_add(DeviceTask& task)
	{
		/* split task into smaller ones, more than number of threads for uneven
		 * workloads where some parts of the image render slower than others */
		list<DeviceTask> tasks;

		task.split(tasks, TaskScheduler::num_threads());

		foreach(DeviceTask& task, tasks)
			task_pool.push(new CPUDeviceTask(this, task));
	}
	void task_wait()
	{
		task_pool.wait_work();
	}

	void task_cancel()
	{
		task_pool.cancel();
	}
};
Device *device_cpu_create(DeviceInfo& info, Stats &stats)
{
	return new CPUDevice(stats);
}

void device_cpu_info(vector<DeviceInfo>& devices)