/*
 *  Copyright 2008-2011 NVIDIA Corporation
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

/*! \brief A robust scan for general types.
 */
#include <thrust/detail/config.h>

// do not attempt to compile this file with any other compiler
#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC

#include <thrust/iterator/iterator_traits.h>

#include <thrust/detail/util/blocking.h>

#include <thrust/detail/raw_buffer.h>
#include <thrust/detail/device/dereference.h>

#include <thrust/detail/device/cuda/extern_shared_ptr.h>
#include <thrust/detail/device/cuda/synchronize.h>

// to configure launch parameters
#include <thrust/detail/device/cuda/arch.h>

__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN

namespace thrust
{
namespace detail
{

// forward declaration of raw_cuda_device_buffer
template<typename> class raw_cuda_device_buffer;

namespace device
{
namespace cuda
{
namespace detail
{
namespace safe_scan
{
// Inclusive scan of one value per thread across a full thread block
// (Kogge-Stone style: each round folds in the neighbor 1, 2, 4, ... 512
// positions to the left, so block sizes up to 1024 threads are supported).
//
// array     shared-memory scratch with at least blockDim.x elements
// val       this thread's input value
// binary_op associative combining operation
//
// Every thread of the block must call this function (it executes
// __syncthreads() barriers). Returns the inclusive result for this thread;
// on exit array[i] also holds thread i's result, which callers use to fetch
// the block total from array[blockDim.x - 1].
template <typename SharedArray,
          typename T,
          typename BinaryFunction>
__device__
T scan_block(SharedArray array, T val, BinaryFunction binary_op)
{
    array[threadIdx.x] = val;

    __syncthreads();

    // copy to temporary so val and tmp have the same memory space
    if (blockDim.x > 1) { if(threadIdx.x >= 1) { T tmp = array[threadIdx.x - 1]; val = binary_op(tmp, val); } __syncthreads(); array[threadIdx.x] = val; __syncthreads(); }
    if (blockDim.x > 2) { if(threadIdx.x >= 2) { T tmp = array[threadIdx.x - 2]; val = binary_op(tmp, val); } __syncthreads(); array[threadIdx.x] = val; __syncthreads(); }
    if (blockDim.x > 4) { if(threadIdx.x >= 4) { T tmp = array[threadIdx.x - 4]; val = binary_op(tmp, val); } __syncthreads(); array[threadIdx.x] = val; __syncthreads(); }
    if (blockDim.x > 8) { if(threadIdx.x >= 8) { T tmp = array[threadIdx.x - 8]; val = binary_op(tmp, val); } __syncthreads(); array[threadIdx.x] = val; __syncthreads(); }
    if (blockDim.x > 16) { if(threadIdx.x >= 16) { T tmp = array[threadIdx.x - 16]; val = binary_op(tmp, val); } __syncthreads(); array[threadIdx.x] = val; __syncthreads(); }
    if (blockDim.x > 32) { if(threadIdx.x >= 32) { T tmp = array[threadIdx.x - 32]; val = binary_op(tmp, val); } __syncthreads(); array[threadIdx.x] = val; __syncthreads(); }
    if (blockDim.x > 64) { if(threadIdx.x >= 64) { T tmp = array[threadIdx.x - 64]; val = binary_op(tmp, val); } __syncthreads(); array[threadIdx.x] = val; __syncthreads(); }
    if (blockDim.x > 128) { if(threadIdx.x >= 128) { T tmp = array[threadIdx.x - 128]; val = binary_op(tmp, val); } __syncthreads(); array[threadIdx.x] = val; __syncthreads(); }
    if (blockDim.x > 256) { if(threadIdx.x >= 256) { T tmp = array[threadIdx.x - 256]; val = binary_op(tmp, val); } __syncthreads(); array[threadIdx.x] = val; __syncthreads(); }
    if (blockDim.x > 512) { if(threadIdx.x >= 512) { T tmp = array[threadIdx.x - 512]; val = binary_op(tmp, val); } __syncthreads(); array[threadIdx.x] = val; __syncthreads(); }

    return val;
}
// Inclusive scan of the first n values of the block (same Kogge-Stone
// scheme as scan_block, with each combining step additionally guarded by
// threadIdx.x < n). Used for the partially-full tile at the end of an
// interval.
//
// array     shared-memory scratch with at least blockDim.x elements
// n         number of valid leading values (n <= blockDim.x)
// val       this thread's input value
// binary_op associative combining operation
//
// Every thread of the block must call this function (barriers inside).
// The returned value is meaningful only for threads with threadIdx.x < n;
// lanes >= n pass their input through unchanged, and since a combining
// lane only ever reads indices below its own, garbage stored by lanes
// >= n is never folded into a valid result.
template <typename SharedArray,
          typename T,
          typename BinaryFunction>
__device__
T scan_block_n(SharedArray array, const unsigned int n, T val, BinaryFunction binary_op)
{
    array[threadIdx.x] = val;

    __syncthreads();

    // copy to temporary so val and tmp have the same memory space
    if (blockDim.x > 1) { if(threadIdx.x < n && threadIdx.x >= 1) { T tmp = array[threadIdx.x - 1]; val = binary_op(tmp, val); } __syncthreads(); array[threadIdx.x] = val; __syncthreads(); }
    if (blockDim.x > 2) { if(threadIdx.x < n && threadIdx.x >= 2) { T tmp = array[threadIdx.x - 2]; val = binary_op(tmp, val); } __syncthreads(); array[threadIdx.x] = val; __syncthreads(); }
    if (blockDim.x > 4) { if(threadIdx.x < n && threadIdx.x >= 4) { T tmp = array[threadIdx.x - 4]; val = binary_op(tmp, val); } __syncthreads(); array[threadIdx.x] = val; __syncthreads(); }
    if (blockDim.x > 8) { if(threadIdx.x < n && threadIdx.x >= 8) { T tmp = array[threadIdx.x - 8]; val = binary_op(tmp, val); } __syncthreads(); array[threadIdx.x] = val; __syncthreads(); }
    if (blockDim.x > 16) { if(threadIdx.x < n && threadIdx.x >= 16) { T tmp = array[threadIdx.x - 16]; val = binary_op(tmp, val); } __syncthreads(); array[threadIdx.x] = val; __syncthreads(); }
    if (blockDim.x > 32) { if(threadIdx.x < n && threadIdx.x >= 32) { T tmp = array[threadIdx.x - 32]; val = binary_op(tmp, val); } __syncthreads(); array[threadIdx.x] = val; __syncthreads(); }
    if (blockDim.x > 64) { if(threadIdx.x < n && threadIdx.x >= 64) { T tmp = array[threadIdx.x - 64]; val = binary_op(tmp, val); } __syncthreads(); array[threadIdx.x] = val; __syncthreads(); }
    if (blockDim.x > 128) { if(threadIdx.x < n && threadIdx.x >= 128) { T tmp = array[threadIdx.x - 128]; val = binary_op(tmp, val); } __syncthreads(); array[threadIdx.x] = val; __syncthreads(); }
    if (blockDim.x > 256) { if(threadIdx.x < n && threadIdx.x >= 256) { T tmp = array[threadIdx.x - 256]; val = binary_op(tmp, val); } __syncthreads(); array[threadIdx.x] = val; __syncthreads(); }
    if (blockDim.x > 512) { if(threadIdx.x < n && threadIdx.x >= 512) { T tmp = array[threadIdx.x - 512]; val = binary_op(tmp, val); } __syncthreads(); array[threadIdx.x] = val; __syncthreads(); }

    return val;
}
// Kernel: inclusive scan of contiguous, independent intervals of the input.
//
// Each block scans one interval [interval_size * blockIdx.x,
// min(interval_begin + interval_size, N)) tile by tile (one tile =
// blockDim.x elements), carrying the running total between tiles through
// the last slot of shared memory. On exit, output holds the interval-local
// inclusive scan and block_results[blockIdx.x] holds the interval's last
// element, i.e. the interval total.
//
// Launch: one block per interval; requires blockDim.x * sizeof(OutputType)
// bytes of dynamic shared memory.
template <typename InputIterator,
          typename OutputIterator,
          typename BinaryFunction>
__global__
void scan_intervals(InputIterator input,
                    const unsigned int N,
                    const unsigned int interval_size,
                    OutputIterator output,
                    typename thrust::iterator_value<OutputIterator>::type * block_results,
                    BinaryFunction binary_op)
{
    typedef typename thrust::iterator_value<OutputIterator>::type OutputType;

    thrust::detail::device::cuda::extern_shared_ptr<OutputType> sdata;

    const unsigned int interval_begin = interval_size * blockIdx.x;
    const unsigned int interval_end   = min(interval_begin + interval_size, N);

    unsigned int base = interval_begin;

    OutputType val;

    // process full blocks
    for(; base + blockDim.x <= interval_end; base += blockDim.x)
    {
        // read data
        {
            InputIterator temp = input + (base + threadIdx.x);
            val = thrust::detail::device::dereference(temp);
        }

        // carry in: thread 0 folds in the previous tile's total, which is
        // still resident in the last slot of shared memory
        if (threadIdx.x == 0 && base != interval_begin)
        {
            OutputType tmp = sdata[blockDim.x - 1];
            val = binary_op(tmp, val);
        }

        // ensure the carry-in read completes before scan_block overwrites sdata
        __syncthreads();

        // scan the tile
        val = scan_block(sdata, val, binary_op);

        // write data
        {
            OutputIterator temp = output + (base + threadIdx.x);
            thrust::detail::device::dereference(temp) = val;
        }
    }

    // process partially full block at end of input (if necessary)
    if (base < interval_end)
    {
        // read data
        if (base + threadIdx.x < interval_end)
        {
            InputIterator temp = input + (base + threadIdx.x);
            val = thrust::detail::device::dereference(temp);
        }

        // carry in
        if (threadIdx.x == 0 && base != interval_begin)
        {
            OutputType tmp = sdata[blockDim.x - 1];
            val = binary_op(tmp, val);
        }

        __syncthreads();

        // scan the tile; only the first (interval_end - base) lanes are valid
        val = scan_block_n(sdata, interval_end - base, val, binary_op);

        // write data
        if (base + threadIdx.x < interval_end)
        {
            OutputIterator temp = output + (base + threadIdx.x);
            thrust::detail::device::dereference(temp) = val;
        }
    }

    __syncthreads();

    // write interval sum (the last element of this interval's scan)
    if (threadIdx.x == 0)
    {
        OutputIterator temp = output + (interval_end - 1);
        block_results[blockIdx.x] = thrust::detail::device::dereference(temp);
    }
}
// Kernel: fold the scanned per-interval totals back into the output.
//
// Combines block_results[blockIdx.x - 1] — the running total of all
// preceding intervals, produced by the second-level scan — into every
// element of this block's interval. Block 0 has no preceding interval and
// returns immediately (this also keeps it from indexing block_results[-1]).
//
// Launch: one block per interval; no dynamic shared memory required.
template <typename OutputIterator,
          typename OutputType,
          typename BinaryFunction>
__global__
void inclusive_update(OutputIterator output,
                      const unsigned int N,
                      const unsigned int interval_size,
                      OutputType * block_results,
                      BinaryFunction binary_op)
{
    const unsigned int interval_begin = interval_size * blockIdx.x;
    const unsigned int interval_end   = min(interval_begin + interval_size, N);

    // the first interval is already complete
    if (blockIdx.x == 0)
        return;

    // value to add to this segment
    OutputType sum = block_results[blockIdx.x - 1];

    // advance result iterator
    output += interval_begin + threadIdx.x;

    for(unsigned int base = interval_begin; base < interval_end; base += blockDim.x, output += blockDim.x)
    {
        const unsigned int i = base + threadIdx.x;

        // guard the ragged tail of the interval
        if (i < interval_end)
        {
            OutputType tmp = thrust::detail::device::dereference(output);
            thrust::detail::device::dereference(output) = binary_op(sum, tmp);
        }
    }
}
// Kernel: convert the per-interval inclusive scans produced by
// scan_intervals into the final exclusive scan.
//
// carry is the user-supplied initial value (staged by the host in
// block_results[gridDim.x]) combined with the scanned total of all
// preceding intervals. Within the interval, each element's exclusive
// result is the carried inclusive value of its left neighbor, exchanged
// through shared memory one tile at a time.
//
// Launch: one block per interval; requires blockDim.x * sizeof(OutputType)
// bytes of dynamic shared memory.
template <typename OutputIterator,
          typename OutputType,
          typename BinaryFunction>
__global__
void exclusive_update(OutputIterator output,
                      const unsigned int N,
                      const unsigned int interval_size,
                      OutputType * block_results,
                      BinaryFunction binary_op)
{
    thrust::detail::device::cuda::extern_shared_ptr<OutputType> sdata;

    const unsigned int interval_begin = interval_size * blockIdx.x;
    const unsigned int interval_end   = min(interval_begin + interval_size, N);

    // value to prepend to this segment: init, plus the total of all
    // preceding intervals (block 0 has no preceding interval)
    OutputType carry = block_results[gridDim.x]; // init
    if (blockIdx.x != 0)
    {
        OutputType tmp = block_results[blockIdx.x - 1];
        carry = binary_op(carry, tmp);
    }

    OutputType val = carry;

    // advance result iterator
    output += interval_begin + threadIdx.x;

    for(unsigned int base = interval_begin; base < interval_end; base += blockDim.x, output += blockDim.x)
    {
        const unsigned int i = base + threadIdx.x;

        if (i < interval_end)
        {
            // inclusive value of element i with the carry applied
            OutputType tmp = thrust::detail::device::dereference(output);
            sdata[threadIdx.x] = binary_op(carry, tmp);
        }

        __syncthreads();

        // exclusive result of element i is the inclusive value of i - 1;
        // thread 0 keeps val from the previous tile (initially the carry)
        if (threadIdx.x != 0)
            val = sdata[threadIdx.x - 1];

        if (i < interval_end)
            thrust::detail::device::dereference(output) = val;

        // thread 0's value for the next tile is this tile's last element
        if (threadIdx.x == 0)
            val = sdata[blockDim.x - 1];

        // keep the sdata reads above ahead of next iteration's overwrite
        __syncthreads();
    }
}
// Host entry point: inclusive scan of [first, last) into output under
// binary_op, in three passes:
//   1) scan_intervals — partition the input into intervals (one block
//      each); produce interval-local scans plus per-interval totals.
//   2) scan_intervals on the totals themselves (a single block).
//   3) inclusive_update — fold the scanned totals back into each interval.
//
// Returns output + (last - first). Launch parameters are derived from
// occupancy queries; each kernel receives blockDim.x * sizeof(OutputType)
// bytes of dynamic shared memory where needed.
template <typename InputIterator,
          typename OutputIterator,
          typename BinaryFunction>
OutputIterator inclusive_scan(InputIterator first,
                              InputIterator last,
                              OutputIterator output,
                              BinaryFunction binary_op)
{
    // empty range: nothing to scan (also avoids a zero divisor below)
    if (first == last)
        return output;

    typedef typename thrust::iterator_value<OutputIterator>::type OutputType;

    const unsigned int N = last - first;

    // determine maximal launch parameters
    const unsigned int smem_per_thread = sizeof(OutputType);
    const unsigned int block_size = thrust::detail::device::cuda::arch::max_blocksize_with_highest_occupancy(scan_intervals<InputIterator,OutputIterator,BinaryFunction>, smem_per_thread);
    const unsigned int smem_size  = block_size * smem_per_thread;
    const unsigned int max_blocks = thrust::detail::device::cuda::arch::max_active_blocks(scan_intervals<InputIterator,OutputIterator,BinaryFunction>, block_size, smem_size);

    // determine final launch parameters
    const unsigned int unit_size     = block_size;
    const unsigned int num_units     = thrust::detail::util::divide_ri(N, unit_size);
    const unsigned int num_blocks    = (std::min)(max_blocks, num_units);
    const unsigned int num_iters     = thrust::detail::util::divide_ri(num_units, num_blocks);
    const unsigned int interval_size = unit_size * num_iters;

    // per-interval totals; the extra slot receives the grand total in pass 2
    thrust::detail::raw_cuda_device_buffer<OutputType> block_results(num_blocks + 1);

    // first level scan of intervals (one interval per block)
    scan_intervals<<<num_blocks, block_size, smem_size>>>
        (first,
         N,
         interval_size,
         output,
         thrust::raw_pointer_cast(&block_results[0]),
         binary_op);
    synchronize_if_enabled("scan_intervals");

    // second level inclusive scan of per-block results (single block)
    const unsigned int block_size_pass2 = thrust::detail::device::cuda::arch::max_blocksize(scan_intervals<OutputType *, OutputType *, BinaryFunction>, smem_per_thread);
    const unsigned int smem_size_pass2  = smem_per_thread * block_size_pass2;

    scan_intervals<<<         1, block_size_pass2, smem_size_pass2>>>
        (thrust::raw_pointer_cast(&block_results[0]),
         num_blocks,
         num_blocks,
         thrust::raw_pointer_cast(&block_results[0]),
         thrust::raw_pointer_cast(&block_results[0]) + num_blocks,
         binary_op);
    synchronize_if_enabled("scan_intervals");

    // update intervals with result of second level scan
    const unsigned int block_size_pass3 = thrust::detail::device::cuda::arch::max_blocksize_with_highest_occupancy(inclusive_update<OutputIterator,OutputType,BinaryFunction>, 0);

    inclusive_update<<<num_blocks, block_size_pass3>>>
        (output,
         N,
         interval_size,
         thrust::raw_pointer_cast(&block_results[0]),
         binary_op);
    synchronize_if_enabled("inclusive_update");

    return output + N;
}
// Host entry point: exclusive scan of [first, last) into output with
// initial value init, under binary_op.
//
// Same three-pass structure as inclusive_scan, except the final pass
// (exclusive_update) shifts each interval one position to the right and
// prepends init, which is staged on the device in the extra slot of
// block_results (read by the kernel as block_results[gridDim.x]).
//
// Returns output + (last - first).
template <typename InputIterator,
          typename OutputIterator,
          typename T,
          typename BinaryFunction>
OutputIterator exclusive_scan(InputIterator first,
                              InputIterator last,
                              OutputIterator output,
                              const T init,
                              BinaryFunction binary_op)
{
    // empty range: nothing to scan (also avoids a zero divisor below)
    if (first == last)
        return output;

    typedef typename thrust::iterator_value<OutputIterator>::type OutputType;

    const unsigned int N = last - first;

    // determine maximal launch parameters
    const unsigned int smem_per_thread = sizeof(OutputType);
    const unsigned int block_size = thrust::detail::device::cuda::arch::max_blocksize_with_highest_occupancy(scan_intervals<InputIterator,OutputIterator,BinaryFunction>, smem_per_thread);
    const unsigned int smem_size  = block_size * smem_per_thread;
    const unsigned int max_blocks = thrust::detail::device::cuda::arch::max_active_blocks(scan_intervals<InputIterator,OutputIterator,BinaryFunction>, block_size, smem_size);

    // determine final launch parameters
    const unsigned int unit_size     = block_size;
    const unsigned int num_units     = thrust::detail::util::divide_ri(N, unit_size);
    const unsigned int num_blocks    = (std::min)(max_blocks, num_units);
    const unsigned int num_iters     = thrust::detail::util::divide_ri(num_units, num_blocks);
    const unsigned int interval_size = unit_size * num_iters;

    // per-interval totals; the extra slot later receives init
    thrust::detail::raw_cuda_device_buffer<OutputType> block_results(num_blocks + 1);

    // first level scan of intervals (one interval per block)
    scan_intervals<<<num_blocks, block_size, smem_size>>>
        (first,
         N,
         interval_size,
         output,
         thrust::raw_pointer_cast(&block_results[0]),
         binary_op);
    synchronize_if_enabled("scan_intervals");

    // second level inclusive scan of per-block results (single block)
    const unsigned int block_size_pass2 = thrust::detail::device::cuda::arch::max_blocksize(scan_intervals<OutputType *, OutputType *, BinaryFunction>, smem_per_thread);
    const unsigned int smem_size_pass2  = smem_per_thread * block_size_pass2;

    scan_intervals<<<         1, block_size_pass2, smem_size_pass2>>>
        (thrust::raw_pointer_cast(&block_results[0]),
         num_blocks,
         num_blocks,
         thrust::raw_pointer_cast(&block_results[0]),
         thrust::raw_pointer_cast(&block_results[0]) + num_blocks,
         binary_op);
    synchronize_if_enabled("scan_intervals");

    // copy the initial value to the device
    block_results[num_blocks] = init;

    // update intervals with result of second level scan
    const unsigned int block_size_pass3 = thrust::detail::device::cuda::arch::max_blocksize_with_highest_occupancy(exclusive_update<OutputIterator,OutputType,BinaryFunction>, smem_per_thread);
    const unsigned int smem_size_pass3  = smem_per_thread * block_size_pass3;

    exclusive_update<<<num_blocks, block_size_pass3, smem_size_pass3>>>
        (output,
         N,
         interval_size,
         thrust::raw_pointer_cast(&block_results[0]),
         binary_op);
    synchronize_if_enabled("exclusive_update");

    return output + N;
}
} // end namespace safe_scan
} // end namespace detail
} // end namespace cuda
} // end namespace device
} // end namespace detail
} // end namespace thrust

__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END

#endif // THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC