~ubuntu-branches/ubuntu/trusty/libthrust/trusty

« back to all changes in this revision

Viewing changes to detail/device/cuda/detail/safe_scan.inl

  • Committer: Bazaar Package Importer
  • Author(s): Andreas Beckmann
  • Date: 2011-05-28 09:32:48 UTC
  • Revision ID: james.westby@ubuntu.com-20110528093248-np3euv5sj7fw3nyv
Tags: upstream-1.4.0
Import upstream version 1.4.0

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
/*
 
2
 *  Copyright 2008-2011 NVIDIA Corporation
 
3
 *
 
4
 *  Licensed under the Apache License, Version 2.0 (the "License");
 
5
 *  you may not use this file except in compliance with the License.
 
6
 *  You may obtain a copy of the License at
 
7
 *
 
8
 *      http://www.apache.org/licenses/LICENSE-2.0
 
9
 *
 
10
 *  Unless required by applicable law or agreed to in writing, software
 
11
 *  distributed under the License is distributed on an "AS IS" BASIS,
 
12
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 
13
 *  See the License for the specific language governing permissions and
 
14
 *  limitations under the License.
 
15
 */
 
16
 
 
17
 
 
18
/*! \file safe_scan.h
 
19
 *  \brief A robust scan for general types.
 
20
 */
 
21
 
 
22
#pragma once
 
23
 
 
24
#include <thrust/detail/config.h>
 
25
 
 
26
// do not attempt to compile this file with any other compiler
 
27
#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
 
28
 
 
29
#include <thrust/iterator/iterator_traits.h>
 
30
 
 
31
#include <thrust/detail/util/blocking.h>
 
32
 
 
33
#include <thrust/detail/raw_buffer.h>
 
34
#include <thrust/detail/device/dereference.h>
 
35
 
 
36
#include <thrust/detail/device/cuda/extern_shared_ptr.h>
 
37
#include <thrust/detail/device/cuda/synchronize.h>
 
38
 
 
39
// to configure launch parameters
 
40
#include <thrust/detail/device/cuda/arch.h>
 
41
 
 
42
 
 
43
__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN
 
44
 
 
45
 
 
46
namespace thrust
 
47
{
 
48
namespace detail
 
49
{
 
50
 
 
51
// forward declaration of raw_cuda_device_buffer
 
52
template<typename> class raw_cuda_device_buffer;
 
53
 
 
54
namespace device
 
55
{
 
56
namespace cuda
 
57
{
 
58
namespace detail
 
59
{
 
60
namespace safe_scan
 
61
{
 
62
 
 
63
 
 
64
template <typename SharedArray,
          typename T,
          typename BinaryFunction>
          __device__
T scan_block(SharedArray array, T val, BinaryFunction binary_op)
{
    // Inclusive scan of one value per thread across the whole block, using
    // the doubling (Hillis/Steele-style) schedule over shared memory.
    // On return, thread i holds binary_op over elements 0..i, and
    // array[blockDim.x - 1] holds the block-wide total.
    array[threadIdx.x] = val;

    __syncthreads();

    // Each round folds in the partial result 'offset' slots to the left.
    // blockDim.x is uniform across the block, so every thread evaluates the
    // guard identically and all threads reach the barriers together.
    for (unsigned int offset = 1; offset < 1024; offset <<= 1)
    {
        if (blockDim.x > offset)
        {
            if (threadIdx.x >= offset)
            {
                // copy to a temporary so val and tmp share a memory space
                T tmp = array[threadIdx.x - offset];
                val = binary_op(tmp, val);
            }
            __syncthreads();
            array[threadIdx.x] = val;
            __syncthreads();
        }
    }

    return val;
}
 
88
 
 
89
template <typename SharedArray,
          typename T,
          typename BinaryFunction>
          __device__
T scan_block_n(SharedArray array, const unsigned int n, T val, BinaryFunction binary_op)
{
    // Variant of scan_block for a partially-filled block: only the first n
    // threads carry meaningful values.  Threads with threadIdx.x >= n still
    // participate in the stores and barriers (so the barriers stay
    // non-divergent) but never update their val.
    array[threadIdx.x] = val;

    __syncthreads();

    for (unsigned int offset = 1; offset < 1024; offset <<= 1)
    {
        if (blockDim.x > offset)
        {
            // only in-range threads accumulate; reads at tid - offset touch
            // slots written by lower-numbered (hence in-range) threads
            if (threadIdx.x < n && threadIdx.x >= offset)
            {
                T tmp = array[threadIdx.x - offset];
                val = binary_op(tmp, val);
            }
            __syncthreads();
            array[threadIdx.x] = val;
            __syncthreads();
        }
    }

    return val;
}
 
112
 
 
113
template <typename InputIterator,
          typename OutputIterator,
          typename BinaryFunction>
__global__
void scan_intervals(InputIterator input,
                    const unsigned int N,
                    const unsigned int interval_size,
                    OutputIterator output,
                    typename thrust::iterator_value<OutputIterator>::type * block_results,
                    BinaryFunction binary_op)
{
    // Pass 1 of the scan: each block performs an independent inclusive scan
    // of its interval [interval_size * blockIdx.x, ... + interval_size) and
    // records the interval total in block_results[blockIdx.x].
    // Requires blockDim.x * sizeof(OutputType) bytes of dynamic shared memory.
    typedef typename thrust::iterator_value<OutputIterator>::type OutputType;

    thrust::detail::device::cuda::extern_shared_ptr<OutputType> sdata;

    const unsigned int interval_begin = interval_size * blockIdx.x;
    const unsigned int interval_end   = min(interval_begin + interval_size, N);

    unsigned int base = interval_begin;

    OutputType val;

    // full tiles of blockDim.x elements
    for(; base + blockDim.x <= interval_end; base += blockDim.x)
    {
        // one element per thread
        {
            InputIterator temp = input + (base + threadIdx.x);
            val = thrust::detail::device::dereference(temp);
        }

        // thread 0 folds in the running total of all previous tiles, which
        // the preceding scan_block left in the last shared-memory slot
        if (threadIdx.x == 0 && base != interval_begin)
        {
            OutputType tmp = sdata[blockDim.x - 1];
            val = binary_op(tmp, val);
        }

        __syncthreads();

        val = scan_block(sdata, val, binary_op);

        // store the scanned value
        {
            OutputIterator temp = output + (base + threadIdx.x);
            thrust::detail::device::dereference(temp) = val;
        }
    }

    // ragged tile at the end of the interval (if any)
    if (base < interval_end)
    {
        if (base + threadIdx.x < interval_end)
        {
            InputIterator temp = input + (base + threadIdx.x);
            val = thrust::detail::device::dereference(temp);
        }

        // carry in, as above
        if (threadIdx.x == 0 && base != interval_begin)
        {
            OutputType tmp = sdata[blockDim.x - 1];
            val = binary_op(tmp, val);
        }
        __syncthreads();

        val = scan_block_n(sdata, interval_end - base, val, binary_op);

        if (base + threadIdx.x < interval_end)
        {
            OutputIterator temp = output + (base + threadIdx.x);
            thrust::detail::device::dereference(temp) = val;
        }
    }

    __syncthreads();

    // publish this interval's total: the last scanned element just written
    if (threadIdx.x == 0)
    {
        OutputIterator temp = output + (interval_end - 1);
        block_results[blockIdx.x] = thrust::detail::device::dereference(temp);
    }
}
 
201
 
 
202
 
 
203
template <typename OutputIterator,
          typename OutputType,
          typename BinaryFunction>
__global__
void inclusive_update(OutputIterator output,
                      const unsigned int N,
                      const unsigned int interval_size,
                      OutputType *   block_results,
                      BinaryFunction binary_op)
{
    // Pass 3 (inclusive case): fold the scanned prefix of all preceding
    // intervals into every element of this block's interval.
    const unsigned int interval_begin = interval_size * blockIdx.x;
    const unsigned int interval_end   = min(interval_begin + interval_size, N);

    // the first interval has no preceding prefix and is already final
    if (blockIdx.x == 0)
        return;

    // inclusive sum of all intervals before this one
    OutputType carry = block_results[blockIdx.x - 1];

    // position the iterator at this thread's first element
    output += interval_begin + threadIdx.x;

    for(unsigned int base = interval_begin; base < interval_end; base += blockDim.x, output += blockDim.x)
    {
        const unsigned int i = base + threadIdx.x;

        if(i < interval_end)
        {
            OutputType tmp = thrust::detail::device::dereference(output);
            thrust::detail::device::dereference(output) = binary_op(carry, tmp);
        }

        __syncthreads();
    }
}
 
238
 
 
239
template <typename OutputIterator,
          typename OutputType,
          typename BinaryFunction>
__global__
void exclusive_update(OutputIterator output,
                      const unsigned int N,
                      const unsigned int interval_size,
                      OutputType * block_results,
                      BinaryFunction binary_op)
{
    // Pass 3 (exclusive case): the interval currently holds its local
    // inclusive scan.  Fold in the global prefix for this interval, then
    // shift everything one slot to the right so each element receives the
    // sum of all values strictly before it.
    // Requires blockDim.x * sizeof(OutputType) bytes of dynamic shared memory.
    thrust::detail::device::cuda::extern_shared_ptr<OutputType> sdata;

    const unsigned int interval_begin = interval_size * blockIdx.x;
    const unsigned int interval_end   = min(interval_begin + interval_size, N);

    // prefix to fold into every element: the user-supplied init (stashed at
    // block_results[gridDim.x] by the host) combined with the scanned total
    // of all preceding intervals
    OutputType carry = block_results[gridDim.x]; // init
    if (blockIdx.x != 0)
    {
        OutputType tmp = block_results[blockIdx.x - 1];
        carry = binary_op(carry, tmp);
    }

    // value this thread writes; thread 0 of the first tile emits the
    // interval's exclusive prefix itself
    OutputType val = carry;

    // position the iterator at this thread's first element
    output += interval_begin + threadIdx.x;

    for(unsigned int base = interval_begin; base < interval_end; base += blockDim.x, output += blockDim.x)
    {
        const unsigned int i = base + threadIdx.x;

        if(i < interval_end)
        {
            OutputType tmp = thrust::detail::device::dereference(output);
            sdata[threadIdx.x] = binary_op(carry, tmp);
        }
        __syncthreads();

        // shift right: each thread emits its left neighbor's updated value
        if (threadIdx.x != 0)
            val = sdata[threadIdx.x - 1];

        if (i < interval_end)
            thrust::detail::device::dereference(output) = val;

        // thread 0 of the next tile needs the last value of this tile
        if(threadIdx.x == 0)
            val = sdata[blockDim.x - 1];

        __syncthreads();
    }
}
 
290
 
 
291
 
 
292
template <typename InputIterator,
          typename OutputIterator,
          typename BinaryFunction>
OutputIterator inclusive_scan(InputIterator first,
                              InputIterator last,
                              OutputIterator output,
                              BinaryFunction binary_op)
{
    // Host driver for a robust (general-type) inclusive scan:
    //   1. scan each interval independently, recording per-interval totals
    //   2. scan the per-interval totals with a single block
    //   3. fold each interval's prefix back into its elements
    // Returns the end of the output range.
    if (first == last)
        return output;

    typedef typename thrust::iterator_value<OutputIterator>::type OutputType;

    const unsigned int N = last - first;

    // launch configuration: one OutputType of dynamic smem per thread
    const unsigned int smem_per_thread = sizeof(OutputType);
    const unsigned int block_size = thrust::detail::device::cuda::arch::max_blocksize_with_highest_occupancy(scan_intervals<InputIterator,OutputIterator,BinaryFunction>, smem_per_thread);
    const unsigned int smem_size  = block_size * smem_per_thread;
    const unsigned int max_blocks = thrust::detail::device::cuda::arch::max_active_blocks(scan_intervals<InputIterator,OutputIterator,BinaryFunction>, block_size, smem_size);

    // partition the input into equal intervals, one per block
    const unsigned int unit_size     = block_size;
    const unsigned int num_units     = thrust::detail::util::divide_ri(N, unit_size);
    const unsigned int num_blocks    = (std::min)(max_blocks, num_units);
    const unsigned int num_iters     = thrust::detail::util::divide_ri(num_units, num_blocks);
    const unsigned int interval_size = unit_size * num_iters;

    // one partial result per block plus one spare slot (used by pass 2)
    thrust::detail::raw_cuda_device_buffer<OutputType> block_results(num_blocks + 1);

    // pass 1: independent scan of each interval
    {
        scan_intervals<<<num_blocks, block_size, smem_size>>>
            (first,
             N,
             interval_size,
             output,
             thrust::raw_pointer_cast(&block_results[0]),
             binary_op);
        synchronize_if_enabled("scan_intervals");
    }

    // pass 2: scan the per-interval totals in place with a single block
    {
        const unsigned int block_size_pass2 = thrust::detail::device::cuda::arch::max_blocksize(scan_intervals<OutputType *, OutputType *, BinaryFunction>, smem_per_thread);
        const unsigned int smem_size_pass2  = smem_per_thread * block_size_pass2;

        scan_intervals<<<         1, block_size_pass2, smem_size_pass2>>>
            (thrust::raw_pointer_cast(&block_results[0]),
             num_blocks,
             interval_size,
             thrust::raw_pointer_cast(&block_results[0]),
             thrust::raw_pointer_cast(&block_results[0]) + num_blocks,
             binary_op);
        synchronize_if_enabled("scan_intervals");
    }

    // pass 3: add each interval's scanned prefix to its elements
    {
        const unsigned int block_size_pass3 = thrust::detail::device::cuda::arch::max_blocksize_with_highest_occupancy(inclusive_update<OutputIterator,OutputType,BinaryFunction>, 0);

        inclusive_update<<<num_blocks, block_size_pass3>>>
            (output,
             N,
             interval_size,
             thrust::raw_pointer_cast(&block_results[0]),
             binary_op);
        synchronize_if_enabled("inclusive_update");
    }

    return output + N;
}
 
371
 
 
372
 
 
373
template <typename InputIterator,
          typename OutputIterator,
          typename T,
          typename BinaryFunction>
OutputIterator exclusive_scan(InputIterator first,
                              InputIterator last,
                              OutputIterator output,
                              const T init,
                              BinaryFunction binary_op)
{
    // Host driver for a robust (general-type) exclusive scan.  Same
    // three-pass structure as inclusive_scan, except the final pass shifts
    // values right and seeds the result with 'init' (staged on the device in
    // the spare slot block_results[num_blocks]).
    // Returns the end of the output range.
    if (first == last)
        return output;

    typedef typename thrust::iterator_value<OutputIterator>::type OutputType;

    const unsigned int N = last - first;

    // launch configuration: one OutputType of dynamic smem per thread
    const unsigned int smem_per_thread = sizeof(OutputType);
    const unsigned int block_size = thrust::detail::device::cuda::arch::max_blocksize_with_highest_occupancy(scan_intervals<InputIterator,OutputIterator,BinaryFunction>, smem_per_thread);
    const unsigned int smem_size  = block_size * smem_per_thread;
    const unsigned int max_blocks = thrust::detail::device::cuda::arch::max_active_blocks(scan_intervals<InputIterator,OutputIterator,BinaryFunction>, block_size, smem_size);

    // partition the input into equal intervals, one per block
    const unsigned int unit_size     = block_size;
    const unsigned int num_units     = thrust::detail::util::divide_ri(N, unit_size);
    const unsigned int num_blocks    = (std::min)(max_blocks, num_units);
    const unsigned int num_iters     = thrust::detail::util::divide_ri(num_units, num_blocks);
    const unsigned int interval_size = unit_size * num_iters;

    // one partial result per block plus one slot for 'init'
    thrust::detail::raw_cuda_device_buffer<OutputType> block_results(num_blocks + 1);

    // pass 1: independent inclusive scan of each interval
    {
        scan_intervals<<<num_blocks, block_size, smem_size>>>
            (first,
             N,
             interval_size,
             output,
             thrust::raw_pointer_cast(&block_results[0]),
             binary_op);
        synchronize_if_enabled("scan_intervals");
    }

    // pass 2: scan the per-interval totals in place with a single block
    {
        const unsigned int block_size_pass2 = thrust::detail::device::cuda::arch::max_blocksize(scan_intervals<OutputType *, OutputType *, BinaryFunction>, smem_per_thread);
        const unsigned int smem_size_pass2  = smem_per_thread * block_size_pass2;

        scan_intervals<<<         1, block_size_pass2, smem_size_pass2>>>
            (thrust::raw_pointer_cast(&block_results[0]),
             num_blocks,
             interval_size,
             thrust::raw_pointer_cast(&block_results[0]),
             thrust::raw_pointer_cast(&block_results[0]) + num_blocks,
             binary_op);
        synchronize_if_enabled("scan_intervals");
    }

    // stage the initial value on the device for exclusive_update to read
    block_results[num_blocks] = init;

    // pass 3: fold in init + preceding prefix, shifting each interval right
    {
        const unsigned int block_size_pass3 = thrust::detail::device::cuda::arch::max_blocksize_with_highest_occupancy(exclusive_update<OutputIterator,OutputType,BinaryFunction>, smem_per_thread);
        const unsigned int smem_size_pass3  = smem_per_thread * block_size_pass3;

        exclusive_update<<<num_blocks, block_size_pass3, smem_size_pass3>>>
            (output,
             N,
             interval_size,
             thrust::raw_pointer_cast(&block_results[0]),
             binary_op);
        synchronize_if_enabled("exclusive_update");
    }

    return output + N;
}
 
459
 
 
460
} // end namespace safe_scan
 
461
} // end namespace detail
 
462
} // end namespace cuda
 
463
} // end namespace device
 
464
} // end namespace detail
 
465
} // end namespace thrust
 
466
 
 
467
__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END
 
468
 
 
469
#endif // THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
 
470