//@HEADER
// ************************************************************************
//
//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
//              Copyright (2012) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
#ifndef KOKKOS_CUDA_REDUCESCAN_HPP
#define KOKKOS_CUDA_REDUCESCAN_HPP

#if defined( __CUDACC__ )

#include <Kokkos_Parallel.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp>
#include <impl/Kokkos_Error.hpp>
#include <Cuda/Kokkos_Cuda_Vectorization.hpp>

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

// NOTE(review): the closing brace at the end of this file is
// "} // namespace Kokkos"; these openings were dropped by the extraction.
namespace Kokkos {
namespace Impl {
//Shfl based reductions
65
* Algorithmic constraints:
66
* (a) threads with same threadIdx.y have same value
67
* (b) blockDim.x == power of two
71
template< class ValueType , class JoinOp>
73
inline void cuda_intra_warp_reduction( ValueType& result,
75
const int max_active_thread = blockDim.y) {
77
unsigned int shift = 1;
79
//Reduce over values from threads with different threadIdx.y
80
while(blockDim.x * shift < 32 ) {
81
const ValueType tmp = shfl_down(result, blockDim.x*shift,32u);
82
//Only join if upper thread is active (this allows non power of two for blockDim.y
83
if(threadIdx.y + shift < max_active_thread)
88
result = shfl(result,0,32);
91
template< class ValueType , class JoinOp>
93
inline void cuda_inter_warp_reduction( ValueType& value,
95
const int max_active_thread = blockDim.y) {
98
__shared__ char sh_result[sizeof(ValueType)*STEP_WIDTH];
99
ValueType* result = (ValueType*) & sh_result;
100
const unsigned step = 32 / blockDim.x;
101
unsigned shift = STEP_WIDTH;
102
const int id = threadIdx.y%step==0?threadIdx.y/step:65000;
103
if(id < STEP_WIDTH ) {
107
while (shift<=max_active_thread/step) {
108
if(shift<=id && shift+STEP_WIDTH>id && threadIdx.x==0) {
109
join(result[id%STEP_WIDTH],value);
117
for(int i = 1; (i*step<=max_active_thread) && i<STEP_WIDTH; i++)
118
join(value,result[i]);
121
template< class ValueType , class JoinOp>
123
inline void cuda_intra_block_reduction( ValueType& value,
125
const int max_active_thread = blockDim.y) {
126
cuda_intra_warp_reduction(value,join,max_active_thread);
127
cuda_inter_warp_reduction(value,join,max_active_thread);
130
template< class FunctorType , class JoinOp>
132
bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , void >::reference_type value,
134
Cuda::size_type * const m_scratch_space,
135
typename FunctorValueTraits< FunctorType , void >::pointer_type const result,
136
Cuda::size_type * const m_scratch_flags,
137
const int max_active_thread = blockDim.y) {
138
typedef typename FunctorValueTraits< FunctorType , void >::pointer_type pointer_type;
139
typedef typename FunctorValueTraits< FunctorType , void >::value_type value_type;
141
//Do the intra-block reduction with shfl operations and static shared memory
142
cuda_intra_block_reduction(value,join,max_active_thread);
144
const unsigned id = threadIdx.y*blockDim.x + threadIdx.x;
146
//One thread in the block writes block result to global scratch_memory
148
pointer_type global = ((pointer_type) m_scratch_space) + blockIdx.x;
152
//One warp of last block performs inter block reduction through loading the block values from global scratch_memory
153
bool last_block = false;
157
Cuda::size_type count;
159
//Figure out whether this is the last block
161
count = Kokkos::atomic_fetch_add(m_scratch_flags,1);
162
count = Kokkos::shfl(count,0,32);
164
//Last block does the inter block reduction
165
if( count == gridDim.x - 1) {
166
//set flag back to zero
168
*m_scratch_flags = 0;
172
pointer_type const volatile global = (pointer_type) m_scratch_space ;
174
//Reduce all global values with splitting work over threads in one warp
175
const int step_size = blockDim.x*blockDim.y < 32 ? blockDim.x*blockDim.y : 32;
176
for(int i=id; i<gridDim.x; i+=step_size) {
177
value_type tmp = global[i];
181
//Perform shfl reductions within the warp only join if contribution is valid (allows gridDim.x non power of two and <32)
182
if (blockDim.x*blockDim.y > 1) {
183
value_type tmp = Kokkos::shfl_down(value, 1,32);
184
if( id + 1 < gridDim.x )
187
if (blockDim.x*blockDim.y > 2) {
188
value_type tmp = Kokkos::shfl_down(value, 2,32);
189
if( id + 2 < gridDim.x )
192
if (blockDim.x*blockDim.y > 4) {
193
value_type tmp = Kokkos::shfl_down(value, 4,32);
194
if( id + 4 < gridDim.x )
197
if (blockDim.x*blockDim.y > 8) {
198
value_type tmp = Kokkos::shfl_down(value, 8,32);
199
if( id + 8 < gridDim.x )
202
if (blockDim.x*blockDim.y > 16) {
203
value_type tmp = Kokkos::shfl_down(value, 16,32);
204
if( id + 16 < gridDim.x )
210
//The last block has in its thread=0 the global reduction value through "value"
214
//----------------------------------------------------------------------------
// See section B.17 of Cuda C Programming Guide Version 3.2
// for discussion of the
// __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
// function qualifier which could be used to improve performance.
//----------------------------------------------------------------------------
// Maximize shared memory and minimize L1 cache:
//   cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferShared );
// For 2.0 capability: 48 KB shared and 16 KB L1
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
/*
 *  Algorithmic constraints:
 *   (a) blockDim.y is a power of two
 *   (b) blockDim.y <= 512
 *   (c) blockDim.x == blockDim.z == 1
 */
template< bool DoScan , class FunctorType , class ArgTag >
234
void cuda_intra_block_reduce_scan( const FunctorType & functor ,
235
const typename FunctorValueTraits< FunctorType , ArgTag >::pointer_type base_data )
237
typedef FunctorValueTraits< FunctorType , ArgTag > ValueTraits ;
238
typedef FunctorValueJoin< FunctorType , ArgTag > ValueJoin ;
240
typedef typename ValueTraits::pointer_type pointer_type ;
242
const unsigned value_count = ValueTraits::value_count( functor );
243
const unsigned BlockSizeMask = blockDim.y - 1 ;
245
// Must have power of two thread count
247
if ( BlockSizeMask & blockDim.y ) { Kokkos::abort("Cuda::cuda_intra_block_scan requires power-of-two blockDim"); }
249
#define BLOCK_REDUCE_STEP( R , TD , S ) \
250
if ( ! ( R & ((1<<(S+1))-1) ) ) { ValueJoin::join( functor , TD , (TD - (value_count<<S)) ); }
252
#define BLOCK_SCAN_STEP( TD , N , S ) \
253
if ( N == (1<<S) ) { ValueJoin::join( functor , TD , (TD - (value_count<<S))); }
255
const unsigned rtid_intra = threadIdx.y ^ BlockSizeMask ;
256
const pointer_type tdata_intra = base_data + value_count * threadIdx.y ;
258
{ // Intra-warp reduction:
259
BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,0)
260
BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,1)
261
BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,2)
262
BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,3)
263
BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,4)
266
__syncthreads(); // Wait for all warps to reduce
268
{ // Inter-warp reduce-scan by a single warp to avoid extra synchronizations
269
const unsigned rtid_inter = ( threadIdx.y ^ BlockSizeMask ) << CudaTraits::WarpIndexShift ;
271
if ( rtid_inter < blockDim.y ) {
273
const pointer_type tdata_inter = base_data + value_count * ( rtid_inter ^ BlockSizeMask );
275
if ( (1<<5) < BlockSizeMask ) { BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,5) }
276
if ( (1<<6) < BlockSizeMask ) { __threadfence_block(); BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,6) }
277
if ( (1<<7) < BlockSizeMask ) { __threadfence_block(); BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,7) }
278
if ( (1<<8) < BlockSizeMask ) { __threadfence_block(); BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,8) }
282
int n = ( rtid_inter & 32 ) ? 32 : (
283
( rtid_inter & 64 ) ? 64 : (
284
( rtid_inter & 128 ) ? 128 : (
285
( rtid_inter & 256 ) ? 256 : 0 )));
287
if ( ! ( rtid_inter + n < blockDim.y ) ) n = 0 ;
289
BLOCK_SCAN_STEP(tdata_inter,n,8)
290
BLOCK_SCAN_STEP(tdata_inter,n,7)
291
BLOCK_SCAN_STEP(tdata_inter,n,6)
292
BLOCK_SCAN_STEP(tdata_inter,n,5)
297
__syncthreads(); // Wait for inter-warp reduce-scan to complete
300
int n = ( rtid_intra & 1 ) ? 1 : (
301
( rtid_intra & 2 ) ? 2 : (
302
( rtid_intra & 4 ) ? 4 : (
303
( rtid_intra & 8 ) ? 8 : (
304
( rtid_intra & 16 ) ? 16 : 0 ))));
306
if ( ! ( rtid_intra + n < blockDim.y ) ) n = 0 ;
308
BLOCK_SCAN_STEP(tdata_intra,n,4) __threadfence_block();
309
BLOCK_SCAN_STEP(tdata_intra,n,3) __threadfence_block();
310
BLOCK_SCAN_STEP(tdata_intra,n,2) __threadfence_block();
311
BLOCK_SCAN_STEP(tdata_intra,n,1) __threadfence_block();
312
BLOCK_SCAN_STEP(tdata_intra,n,0)
315
#undef BLOCK_SCAN_STEP
316
#undef BLOCK_REDUCE_STEP
319
//----------------------------------------------------------------------------
320
/**\brief Input value-per-thread starting at 'shared_data'.
321
* Reduction value at last thread's location.
323
* If 'DoScan' then write blocks' scan values and block-groups' scan values.
325
* Global reduce result is in the last threads' 'shared_data' location.
327
template< bool DoScan , class FunctorType , class ArgTag >
329
bool cuda_single_inter_block_reduce_scan( const FunctorType & functor ,
330
const Cuda::size_type block_id ,
331
const Cuda::size_type block_count ,
332
Cuda::size_type * const shared_data ,
333
Cuda::size_type * const global_data ,
334
Cuda::size_type * const global_flags )
336
typedef Cuda::size_type size_type ;
337
typedef FunctorValueTraits< FunctorType , ArgTag > ValueTraits ;
338
typedef FunctorValueJoin< FunctorType , ArgTag > ValueJoin ;
339
typedef FunctorValueInit< FunctorType , ArgTag > ValueInit ;
340
typedef FunctorValueOps< FunctorType , ArgTag > ValueOps ;
342
typedef typename ValueTraits::pointer_type pointer_type ;
343
typedef typename ValueTraits::reference_type reference_type ;
345
const unsigned BlockSizeMask = blockDim.y - 1 ;
346
const unsigned BlockSizeShift = power_of_two_if_valid( blockDim.y );
348
// Must have power of two thread count
349
if ( BlockSizeMask & blockDim.y ) { Kokkos::abort("Cuda::cuda_single_inter_block_reduce_scan requires power-of-two blockDim"); }
351
const integral_nonzero_constant< size_type , ValueTraits::StaticValueSize / sizeof(size_type) >
352
word_count( ValueTraits::value_size( functor ) / sizeof(size_type) );
354
// Reduce the accumulation for the entire block.
355
cuda_intra_block_reduce_scan<false,FunctorType,ArgTag>( functor , pointer_type(shared_data) );
358
// Write accumulation total to global scratch space.
359
// Accumulation total is the last thread's data.
360
size_type * const shared = shared_data + word_count.value * BlockSizeMask ;
361
size_type * const global = global_data + word_count.value * block_id ;
363
for ( size_type i = threadIdx.y ; i < word_count.value ; i += blockDim.y ) { global[i] = shared[i] ; }
366
// Contributing blocks note that their contribution has been completed via an atomic-increment flag
367
// If this block is not the last block to contribute to this group then the block is done.
368
const bool is_last_block =
369
! __syncthreads_or( threadIdx.y ? 0 : ( 1 + atomicInc( global_flags , block_count - 1 ) < block_count ) );
371
if ( is_last_block ) {
373
const size_type b = ( long(block_count) * long(threadIdx.y) ) >> BlockSizeShift ;
374
const size_type e = ( long(block_count) * long( threadIdx.y + 1 ) ) >> BlockSizeShift ;
377
void * const shared_ptr = shared_data + word_count.value * threadIdx.y ;
378
reference_type shared_value = ValueInit::init( functor , shared_ptr );
380
for ( size_type i = b ; i < e ; ++i ) {
381
ValueJoin::join( functor , shared_ptr , global_data + word_count.value * i );
385
cuda_intra_block_reduce_scan<DoScan,FunctorType,ArgTag>( functor , pointer_type(shared_data) );
389
size_type * const shared_value = shared_data + word_count.value * ( threadIdx.y ? threadIdx.y - 1 : blockDim.y );
391
if ( ! threadIdx.y ) { ValueInit::init( functor , shared_value ); }
393
// Join previous inclusive scan value to each member
394
for ( size_type i = b ; i < e ; ++i ) {
395
size_type * const global_value = global_data + word_count.value * i ;
396
ValueJoin::join( functor , shared_value , global_value );
397
ValueOps ::copy( functor , global_value , shared_value );
402
return is_last_block ;
405
// Size in bytes required for inter block reduce or scan
406
template< bool DoScan , class FunctorType , class ArgTag >
408
unsigned cuda_single_inter_block_reduce_scan_shmem( const FunctorType & functor , const unsigned BlockSize )
410
return ( BlockSize + 2 ) * Impl::FunctorValueTraits< FunctorType , ArgTag >::value_size( functor );
} // namespace Impl
} // namespace Kokkos

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

#endif /* #if defined( __CUDACC__ ) */
#endif /* KOKKOS_CUDA_REDUCESCAN_HPP */