//---------------------------------------------------------------------------
//
// Project: OpenWalnut ( http://www.openwalnut.org )
//
// Copyright 2009 OpenWalnut Community, BSV@Uni-Leipzig and CNCF@MPI-CBS, Copyright 2010 RRZK University of Cologne
// For more information see http://www.openwalnut.org/copying
//
// This file is part of OpenWalnut.
//
// OpenWalnut is free software: you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// OpenWalnut is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with OpenWalnut. If not, see <http://www.gnu.org/licenses/>.
//
//---------------------------------------------------------------------------

// NOTE: This is based on code from projects/reduction/reduction_kernel.cu from CUDA SDK 2.0

#ifndef WCUDAREDUCE_CU
#define WCUDAREDUCE_CU
/**
 * Performs parallel reduction of elements in \e shared memory, the reduction operation is defined by \c Op::eval.
 *
 * Result of the reduction is stored in s[0].
 *
 * NOTE(review): \c Op and \c do_apply are expected to come from an enclosing template
 * scope that is not visible in this chunk — confirm against the full file. The bodies
 * of the nthreads dispatch branches below were lost in this copy and have been
 * reconstructed as do_apply< N >( s ) calls; verify against the original.
 *
 * \param s shared memory array to reduce (volatile — required by the warp-synchronous phase)
 * \param n number of valid elements in s
 */
static inline __device__ void apply( volatile float *s, unsigned int n )
{
    const unsigned int i = threadIdx.x;

    // Number of threads taking part in the tree reduction: the largest power of two
    // that is <= min( blockDim.x, n ), capped at 512. __clz counts leading zeros,
    // so 1 << ( 31 - __clz( x ) ) is the highest power of two not exceeding x.
    const int nthreads = min( 512, 1<<(31-__clz(min(blockDim.x, n))) );

    // Sequentially fold every element beyond the first nthreads slots into the
    // first nthreads slots, stride nthreads.
    // NOTE(review): threads with i >= nthreads (possible when blockDim.x is not a
    // power of two) also execute this loop and may race with readers of s[i] —
    // confirm whether the original guarded this with i < nthreads.
    for( unsigned int j = nthreads+i; j<n; j += nthreads )
    {
        s[i] = Op::eval( s[i], s[j] );
    }

    // Make the folded partial results visible to all threads before the tree phase.
    __syncthreads();

    // Tree reduction over the remaining nthreads elements; nthreads is a power of
    // two by construction, so exactly one branch matches.
    if( nthreads == 512 )
        do_apply< 512 >( s );
    else if( nthreads == 256 )
        do_apply< 256 >( s );
    else if( nthreads == 128 )
        do_apply< 128 >( s );
    else if( nthreads == 64 )
        do_apply< 64 >( s );
    else if( nthreads == 32 )
        do_apply< 32 >( s );
    else if( nthreads == 16 )
        do_apply< 16 >( s );
    else if( nthreads == 8 )
        do_apply< 8 >( s );
    else if( nthreads == 4 )
        do_apply< 4 >( s );
    else if( nthreads == 2 )
        do_apply< 2 >( s );
}
// This routine actually performs the tree reduction in shared memory.
// Result of the reduction is stored in s[0].
//
// The template parameter threads is the number of cooperating threads; because it
// is a compile-time constant, every if( threads > ... ) below is uniform across
// the block, so the embedded __syncthreads() calls are not in divergent code.
// The i + offset < threads guards keep lanes from reading past the active range.
template< unsigned int threads >
static inline __device__ void do_apply( volatile float *s )
{
    const unsigned int i = threadIdx.x;

    // Cross-warp steps: a barrier is required after each halving.
    if( threads > 512 ) { if( i + 512 < threads ) s[i] = Op::eval( s[i], s[i + 512] ); __syncthreads(); }
    if( threads > 256 ) { if( i + 256 < threads ) s[i] = Op::eval( s[i], s[i + 256] ); __syncthreads(); }
    if( threads > 128 ) { if( i + 128 < threads ) s[i] = Op::eval( s[i], s[i + 128] ); __syncthreads(); }
    if( threads > 64 ) { if( i + 64 < threads ) s[i] = Op::eval( s[i], s[i + 64] ); __syncthreads(); }

    // All of the following happens in one warp: no synchronization needed, as s is
    // declared volatile (classic warp-synchronous reduction from the SDK sample).
    // NOTE(review): this relies on implicit warp synchrony, which no longer holds
    // under independent thread scheduling (Volta, SM70+) — those targets need
    // __syncwarp() between steps; confirm the intended architectures before reuse.
    if( threads > 32 ) if( i + 32 < threads ) s[i] = Op::eval( s[i], s[i + 32] );
    if( threads > 16 ) if( i + 16 < threads ) s[i] = Op::eval( s[i], s[i + 16] );
    if( threads > 8 ) if( i + 8 < threads ) s[i] = Op::eval( s[i], s[i + 8] );
    if( threads > 4 ) if( i + 4 < threads ) s[i] = Op::eval( s[i], s[i + 4] );
    if( threads > 2 ) if( i + 2 < threads ) s[i] = Op::eval( s[i], s[i + 2] );
    if( threads > 1 ) if( i + 1 < threads ) s[i] = Op::eval( s[i], s[i + 1] );
}
#endif // WCUDAREDUCE_CU