// ************************************************************************
//
//   Kokkos: Manycore Performance-Portable Multidimensional Arrays
//              Copyright (2012) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
44
#ifndef KOKKOS_THREADSEXEC_HPP
45
#define KOKKOS_THREADSEXEC_HPP
50
#include <impl/Kokkos_spinwait.hpp>
51
#include <impl/Kokkos_FunctorAdapter.hpp>
53
#include <Kokkos_Atomic.hpp>
55
//----------------------------------------------------------------------------
60
//----------------------------------------------------------------------------
62
template< class > struct ThreadsExecAdapter ;
64
//----------------------------------------------------------------------------
66
class ThreadsExecTeamMember ;
71
// Fan array has log_2(NT) reduction threads plus 2 scan threads
72
// Currently limited to 16k threads.
73
enum { MAX_FAN_COUNT = 16 };
74
enum { MAX_THREAD_COUNT = 1 << ( MAX_FAN_COUNT - 2 ) };
75
enum { VECTOR_LENGTH = 8 };
77
/** \brief States of a worker thread */
78
enum { Terminating ///< Termination in progress
79
, Inactive ///< Exists, waiting for work
80
, Active ///< Exists, performing work
81
, Rendezvous ///< Exists, waiting in a barrier or reduce
90
friend class ThreadsExecTeamMember ;
91
friend class ThreadsExecTeamVectorMember ;
92
friend class Kokkos::Threads ;
94
// Fan-in operations' root is the highest ranking thread
95
// to place the 'scan' reduction intermediate values on
96
// the threads that need them.
97
// For a simple reduction the thread location is arbitrary.
99
/** \brief Reduction memory reserved for team reductions */
100
enum { REDUCE_TEAM_BASE = 512 };
102
ThreadsExec * const * m_pool_base ; ///< Base for pool fan-in
105
int m_scratch_reduce_end ;
106
int m_scratch_thread_end ;
109
int m_pool_fan_size ;
110
int volatile m_pool_state ; ///< State for global synchronizations
113
static void global_lock();
114
static void global_unlock();
117
static void execute_resize_scratch( ThreadsExec & , const void * );
118
static void execute_sleep( ThreadsExec & , const void * );
119
static void execute_get_binding( ThreadsExec & , const void * );
121
ThreadsExec( const ThreadsExec & );
122
ThreadsExec & operator = ( const ThreadsExec & );
124
static void execute_serial( void (*)( ThreadsExec & , const void * ) );
128
KOKKOS_INLINE_FUNCTION int pool_size() const { return m_pool_size ; }
129
KOKKOS_INLINE_FUNCTION int pool_rank() const { return m_pool_rank ; }
131
static int get_thread_count();
132
static ThreadsExec * get_thread( const int init_thread_rank );
134
inline void * reduce_memory() const { return ((unsigned char *) m_scratch ); }
135
KOKKOS_INLINE_FUNCTION void * scratch_memory() const { return ((unsigned char *) m_scratch ) + m_scratch_reduce_end ; }
137
static void driver(void);
142
static void * resize_scratch( size_t reduce_size , size_t thread_size );
144
static void * root_reduce_scratch();
146
static bool is_process();
148
static void verify_is_process( const std::string & , const bool initialized );
150
static int is_initialized();
152
static void initialize( unsigned thread_count ,
153
unsigned use_numa_count ,
154
unsigned use_cores_per_numa ,
155
bool allow_asynchronous_threadpool );
157
static void finalize();
159
/* Given a requested team size, return valid team size */
160
static unsigned team_size_valid( unsigned );
162
static void print_configuration( std::ostream & , const bool detail = false );
164
//------------------------------------
166
static void wait_yield( volatile int & , const int );
168
//------------------------------------
169
// All-thread functions:
171
template< class FunctorType , class ArgTag >
173
void fan_in_reduce( const FunctorType & f ) const
175
typedef Kokkos::Impl::FunctorValueJoin< FunctorType , ArgTag > Join ;
176
typedef Kokkos::Impl::FunctorFinal< FunctorType , ArgTag > Final ;
178
const int rev_rank = m_pool_size - ( m_pool_rank + 1 );
180
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
182
ThreadsExec & fan = *m_pool_base[ rev_rank + ( 1 << i ) ] ;
184
Impl::spinwait( fan.m_pool_state , ThreadsExec::Active );
186
Join::join( f , reduce_memory() , fan.reduce_memory() );
190
Final::final( f , reduce_memory() );
197
const int rev_rank = m_pool_size - ( m_pool_rank + 1 );
199
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
200
Impl::spinwait( m_pool_base[rev_rank+(1<<i)]->m_pool_state , ThreadsExec::Active );
204
template< class FunctorType , class ArgTag >
206
void scan_large( const FunctorType & f )
208
// Sequence of states:
209
// 0) Active : entry and exit state
210
// 1) ReductionAvailable : reduction value available
211
// 2) ScanAvailable : inclusive scan value available
212
// 3) Rendezvous : All threads inclusive scan value are available
213
// 4) ScanCompleted : exclusive scan value copied
215
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , ArgTag > Traits ;
216
typedef Kokkos::Impl::FunctorValueJoin< FunctorType , ArgTag > Join ;
217
typedef Kokkos::Impl::FunctorValueInit< FunctorType , ArgTag > Init ;
219
typedef typename Traits::value_type scalar_type ;
221
const int rev_rank = m_pool_size - ( m_pool_rank + 1 );
222
const unsigned count = Traits::value_count( f );
224
scalar_type * const work_value = (scalar_type *) reduce_memory();
226
//--------------------------------
227
// Fan-in reduction with highest ranking thread as the root
228
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
229
ThreadsExec & fan = *m_pool_base[ rev_rank + (1<<i) ];
231
// Wait: Active -> ReductionAvailable (or ScanAvailable)
232
Impl::spinwait( fan.m_pool_state , ThreadsExec::Active );
233
Join::join( f , work_value , fan.reduce_memory() );
236
// Copy reduction value to scan value before releasing from this phase.
237
for ( unsigned i = 0 ; i < count ; ++i ) { work_value[i+count] = work_value[i] ; }
241
// Set: Active -> ReductionAvailable
242
m_pool_state = ThreadsExec::ReductionAvailable ;
244
// Wait for contributing threads' scan value to be available.
245
if ( ( 1 << m_pool_fan_size ) < ( m_pool_rank + 1 ) ) {
246
ThreadsExec & th = *m_pool_base[ rev_rank + ( 1 << m_pool_fan_size ) ] ;
248
// Wait: Active -> ReductionAvailable
249
// Wait: ReductionAvailable -> ScanAvailable
250
Impl::spinwait( th.m_pool_state , ThreadsExec::Active );
251
Impl::spinwait( th.m_pool_state , ThreadsExec::ReductionAvailable );
253
Join::join( f , work_value + count , ((scalar_type *)th.reduce_memory()) + count );
256
// This thread has completed inclusive scan
257
// Set: ReductionAvailable -> ScanAvailable
258
m_pool_state = ThreadsExec::ScanAvailable ;
260
// Wait for all threads to complete inclusive scan
261
// Wait: ScanAvailable -> Rendezvous
262
Impl::spinwait( m_pool_state , ThreadsExec::ScanAvailable );
265
//--------------------------------
267
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
268
ThreadsExec & fan = *m_pool_base[ rev_rank + (1<<i) ];
269
// Wait: ReductionAvailable -> ScanAvailable
270
Impl::spinwait( fan.m_pool_state , ThreadsExec::ReductionAvailable );
271
// Set: ScanAvailable -> Rendezvous
272
fan.m_pool_state = ThreadsExec::Rendezvous ;
275
// All threads have completed the inclusive scan.
276
// All non-root threads are in the Rendezvous state.
277
// Threads are free to overwrite their reduction value.
278
//--------------------------------
280
if ( ( rev_rank + 1 ) < m_pool_size ) {
281
// Exclusive scan: copy the previous thread's inclusive scan value
283
ThreadsExec & th = *m_pool_base[ rev_rank + 1 ] ; // Not the root thread
285
const scalar_type * const src_value = ((scalar_type *)th.reduce_memory()) + count ;
287
for ( unsigned j = 0 ; j < count ; ++j ) { work_value[j] = src_value[j]; }
290
(void) Init::init( f , work_value );
293
//--------------------------------
294
// Wait for all threads to copy previous thread's inclusive scan value
295
// Wait for all threads: Rendezvous -> ScanCompleted
296
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
297
Impl::spinwait( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Rendezvous );
300
// Set: ScanAvailable -> ScanCompleted
301
m_pool_state = ThreadsExec::ScanCompleted ;
302
// Wait: ScanCompleted -> Active
303
Impl::spinwait( m_pool_state , ThreadsExec::ScanCompleted );
305
// Set: ScanCompleted -> Active
306
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
307
m_pool_base[ rev_rank + (1<<i) ]->m_pool_state = ThreadsExec::Active ;
311
template< class FunctorType , class ArgTag >
313
void scan_small( const FunctorType & f )
315
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , ArgTag > Traits ;
316
typedef Kokkos::Impl::FunctorValueJoin< FunctorType , ArgTag > Join ;
317
typedef Kokkos::Impl::FunctorValueInit< FunctorType , ArgTag > Init ;
319
typedef typename Traits::value_type scalar_type ;
321
const int rev_rank = m_pool_size - ( m_pool_rank + 1 );
322
const unsigned count = Traits::value_count( f );
324
scalar_type * const work_value = (scalar_type *) reduce_memory();
326
//--------------------------------
327
// Fan-in reduction with highest ranking thread as the root
328
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
329
// Wait: Active -> Rendezvous
330
Impl::spinwait( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
333
for ( unsigned i = 0 ; i < count ; ++i ) { work_value[i+count] = work_value[i]; }
336
m_pool_state = ThreadsExec::Rendezvous ;
337
// Wait: Rendezvous -> Active
338
Impl::spinwait( m_pool_state , ThreadsExec::Rendezvous );
341
// Root thread does the thread-scan before releasing threads
343
scalar_type * ptr_prev = 0 ;
345
for ( int rank = 0 ; rank < m_pool_size ; ++rank ) {
346
scalar_type * const ptr = (scalar_type *) get_thread( rank )->reduce_memory();
348
for ( unsigned i = 0 ; i < count ; ++i ) { ptr[i] = ptr_prev[ i + count ]; }
349
Join::join( f , ptr + count , ptr );
352
(void) Init::init( f , ptr );
358
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
359
m_pool_base[ rev_rank + (1<<i) ]->m_pool_state = ThreadsExec::Active ;
363
//------------------------------------
364
/** \brief Wait for previous asynchronous functor to
365
* complete and release the Threads device.
366
* Acquire the Threads device and start this functor.
368
static void start( void (*)( ThreadsExec & , const void * ) , const void * );
370
static int in_parallel();
376
//----------------------------------------------------------------------------
377
//----------------------------------------------------------------------------
379
class ThreadsExecTeamMember {
382
enum { TEAM_REDUCE_SIZE = 512 };
384
typedef Kokkos::Threads execution_space ;
385
typedef execution_space::scratch_memory_space space ;
387
Impl::ThreadsExec & m_exec ;
388
space m_team_shared ;
389
ThreadsExec * const * m_team_base ; ///< Base for team fan-in
390
int m_team_shared_size ;
393
int m_team_rank_rev ;
399
void set_team_shared()
400
{ new( & m_team_shared ) space( ((char *) (*m_team_base)->scratch_memory()) + TEAM_REDUCE_SIZE , m_team_shared_size ); }
402
// Fan-in and wait until the matching fan-out is called.
403
// The root thread which does not wait will return true.
404
// All other threads will return false during the fan-out.
405
KOKKOS_INLINE_FUNCTION bool team_fan_in() const
409
// Wait for fan-in threads
410
for ( n = 1 ; ( ! ( m_team_rank_rev & n ) ) && ( ( j = m_team_rank_rev + n ) < m_team_size ) ; n <<= 1 ) {
411
Impl::spinwait( m_team_base[j]->m_pool_state , ThreadsExec::Active );
414
// If not root then wait for release
415
if ( m_team_rank_rev ) {
416
m_exec.m_pool_state = ThreadsExec::Rendezvous ;
417
Impl::spinwait( m_exec.m_pool_state , ThreadsExec::Rendezvous );
420
return ! m_team_rank_rev ;
423
KOKKOS_INLINE_FUNCTION void team_fan_out() const
426
for ( n = 1 ; ( ! ( m_team_rank_rev & n ) ) && ( ( j = m_team_rank_rev + n ) < m_team_size ) ; n <<= 1 ) {
427
m_team_base[j]->m_pool_state = ThreadsExec::Active ;
433
KOKKOS_INLINE_FUNCTION static int team_reduce_size() { return TEAM_REDUCE_SIZE ; }
435
KOKKOS_INLINE_FUNCTION
436
const execution_space::scratch_memory_space & team_shmem() const
437
{ return m_team_shared ; }
439
KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; }
440
KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; }
441
KOKKOS_INLINE_FUNCTION int team_rank() const { return m_team_rank ; }
442
KOKKOS_INLINE_FUNCTION int team_size() const { return m_team_size ; }
444
KOKKOS_INLINE_FUNCTION void team_barrier() const
450
template<class ValueType>
451
KOKKOS_INLINE_FUNCTION
452
void team_broadcast(ValueType& value, const int& thread_id) const
454
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
457
// Make sure there is enough scratch space:
458
typedef typename if_c< sizeof(ValueType) < TEAM_REDUCE_SIZE
459
, ValueType , void >::type type ;
461
type * const local_value = ((type*) m_exec.scratch_memory());
462
if(team_rank() == thread_id)
463
*local_value = value;
466
value = *local_value;
470
template< typename Type >
471
KOKKOS_INLINE_FUNCTION Type team_reduce( const Type & value ) const
472
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
476
// Make sure there is enough scratch space:
477
typedef typename if_c< sizeof(Type) < ThreadsExec::REDUCE_TEAM_BASE , Type , void >::type type ;
479
*((volatile type*) m_exec.scratch_memory() ) = value ;
483
type & accum = *((type *) m_team_base[0]->scratch_memory() );
485
if ( team_fan_in() ) {
486
for ( int i = 1 ; i < m_team_size ; ++i ) {
487
accum += *((type *) m_team_base[i]->scratch_memory() );
498
#ifdef KOKKOS_HAVE_CXX11
499
template< class ValueType, class JoinOp >
500
KOKKOS_INLINE_FUNCTION ValueType
501
team_reduce( const ValueType & value
502
, const JoinOp & op_in ) const
503
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
504
{ return ValueType(); }
507
typedef ValueType value_type;
508
const JoinLambdaAdapter<value_type,JoinOp> op(op_in);
510
#else // KOKKOS_HAVE_CXX11
511
template< class JoinOp >
512
KOKKOS_INLINE_FUNCTION typename JoinOp::value_type
513
team_reduce( const typename JoinOp::value_type & value
514
, const JoinOp & op ) const
515
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
516
{ return typename JoinOp::value_type(); }
519
typedef typename JoinOp::value_type value_type;
521
#endif // KOKKOS_HAVE_CXX11
522
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
523
// Make sure there is enough scratch space:
524
typedef typename if_c< sizeof(value_type) < ThreadsExec::REDUCE_TEAM_BASE
525
, value_type , void >::type type ;
527
type * const local_value = ((type*) m_exec.scratch_memory());
529
// Set this thread's contribution
530
*local_value = value ;
532
// Fence to make sure the base team member has access:
535
if ( team_fan_in() ) {
536
// The last thread to synchronize returns true, all other threads wait for team_fan_out()
537
type * const team_value = ((type*) m_team_base[0]->scratch_memory());
539
// Join to the team value:
540
for ( int i = 1 ; i < m_team_size ; ++i ) {
541
op.join( *team_value , *((type*) m_team_base[i]->scratch_memory()) );
544
// Team base thread may "lap" member threads so copy out to their local value.
545
for ( int i = 1 ; i < m_team_size ; ++i ) {
546
*((type*) m_team_base[i]->scratch_memory()) = *team_value ;
549
// Fence to make sure all team members have access
555
// Value was changed by the team base
556
return *((type volatile const *) local_value);
560
/** \brief Intra-team exclusive prefix sum with team_rank() ordering
561
* with intra-team non-deterministic ordering accumulation.
563
* The global inter-team accumulation value will, at the end of the
564
* league's parallel execution, be the scan's total.
565
* Parallel execution ordering of the league's teams is non-deterministic.
566
* As such the base value for each team's scan operation is similarly
569
template< typename ArgType >
570
KOKKOS_INLINE_FUNCTION ArgType team_scan( const ArgType & value , ArgType * const global_accum ) const
571
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
572
{ return ArgType(); }
575
// Make sure there is enough scratch space:
576
typedef typename if_c< sizeof(ArgType) < ThreadsExec::REDUCE_TEAM_BASE , ArgType , void >::type type ;
578
volatile type * const work_value = ((type*) m_exec.scratch_memory());
580
*work_value = value ;
584
if ( team_fan_in() ) {
585
// The last thread to synchronize returns true, all other threads wait for team_fan_out()
586
// m_team_base[0] == highest ranking team member
587
// m_team_base[ m_team_size - 1 ] == lowest ranking team member
589
// 1) copy from lower to higher rank, initialize lowest rank to zero
590
// 2) prefix sum from lowest to highest rank, skipping lowest rank
594
if ( global_accum ) {
595
for ( int i = m_team_size ; i-- ; ) {
596
type & val = *((type*) m_team_base[i]->scratch_memory());
599
accum = atomic_fetch_add( global_accum , accum );
602
for ( int i = m_team_size ; i-- ; ) {
603
type & val = *((type*) m_team_base[i]->scratch_memory());
604
const type offset = accum ;
618
/** \brief Intra-team exclusive prefix sum with team_rank() ordering.
620
* The highest rank thread can compute the reduction total as
621
* reduction_total = dev.team_scan( value ) + value ;
623
template< typename ArgType >
624
KOKKOS_INLINE_FUNCTION ArgType team_scan( const ArgType & value ) const
625
{ return this-> template team_scan<ArgType>( value , 0 ); }
627
#ifdef KOKKOS_HAVE_CXX11
629
/** \brief Inter-thread parallel for. Executes op(iType i) for each i=0..N-1.
631
* The range i=0..N-1 is mapped to all threads of the the calling thread team.
632
* This functionality requires C++11 support.*/
633
template< typename iType, class Operation>
634
KOKKOS_INLINE_FUNCTION void team_par_for(const iType n, const Operation & op) const {
635
const int chunk = ((n+m_team_size-1)/m_team_size);
636
const int start = chunk*m_team_rank;
637
const int end = start+chunk<n?start+chunk:n;
638
for(int i=start; i<end ; i++) {
643
//----------------------------------------
644
// Private for the driver
646
template< class Arg0 , class Arg1 >
647
ThreadsExecTeamMember( Impl::ThreadsExec & exec
648
, const TeamPolicy< Arg0 , Arg1 , Kokkos::Threads > & team
649
, const int shared_size )
653
, m_team_shared_size( shared_size )
661
if ( team.league_size() ) {
662
// Execution is using device-team interface:
664
const int pool_rank_rev = exec.pool_size() - ( exec.pool_rank() + 1 );
665
const int team_rank_rev = pool_rank_rev % team.team_alloc();
667
// May be using fewer threads per team than a multiple of threads per core,
668
// some threads will idle.
670
if ( team_rank_rev < team.team_size() ) {
671
const size_t pool_league_size = exec.pool_size() / team.team_alloc() ;
672
const size_t pool_league_rank_rev = pool_rank_rev / team.team_alloc() ;
673
const size_t pool_league_rank = pool_league_size - ( pool_league_rank_rev + 1 );
675
m_team_base = exec.m_pool_base + team.team_alloc() * pool_league_rank_rev ;
676
m_team_size = team.team_size() ;
677
m_team_rank = team.team_size() - ( team_rank_rev + 1 );
678
m_team_rank_rev = team_rank_rev ;
679
m_league_size = team.league_size();
680
m_league_rank = ( team.league_size() * pool_league_rank ) / pool_league_size ;
681
m_league_end = ( team.league_size() * (pool_league_rank+1) ) / pool_league_size ;
689
{ return m_league_rank < m_league_end ; }
693
if ( ++m_league_rank < m_league_end ) {
699
} /* namespace Impl */
700
} /* namespace Kokkos */
702
//----------------------------------------------------------------------------
703
//----------------------------------------------------------------------------
707
inline int Threads::in_parallel()
708
{ return Impl::ThreadsExec::in_parallel(); }
710
inline int Threads::is_initialized()
711
{ return Impl::ThreadsExec::is_initialized(); }
713
inline void Threads::initialize(
714
unsigned threads_count ,
715
unsigned use_numa_count ,
716
unsigned use_cores_per_numa ,
717
bool allow_asynchronous_threadpool )
719
Impl::ThreadsExec::initialize( threads_count , use_numa_count , use_cores_per_numa , allow_asynchronous_threadpool );
722
inline void Threads::finalize()
724
Impl::ThreadsExec::finalize();
727
inline void Threads::print_configuration( std::ostream & s , const bool detail )
729
Impl::ThreadsExec::print_configuration( s , detail );
732
inline bool Threads::sleep()
733
{ return Impl::ThreadsExec::sleep() ; }
735
inline bool Threads::wake()
736
{ return Impl::ThreadsExec::wake() ; }
738
inline void Threads::fence()
739
{ Impl::ThreadsExec::fence() ; }
741
} /* namespace Kokkos */
743
//----------------------------------------------------------------------------
744
//----------------------------------------------------------------------------
748
template< class Arg0 , class Arg1 >
749
class TeamPolicy< Arg0 , Arg1 , Kokkos::Threads >
758
void init( const int league_size_request
759
, const int team_size_request )
761
const int pool_size = execution_space::thread_pool_size(0);
762
const int team_max = execution_space::thread_pool_size(1);
763
const int team_grain = execution_space::thread_pool_size(2);
765
m_league_size = league_size_request ;
767
m_team_size = team_size_request < team_max ?
768
team_size_request : team_max ;
770
// Round team size up to a multiple of 'team_gain'
771
const int team_size_grain = team_grain * ( ( m_team_size + team_grain - 1 ) / team_grain );
772
const int team_count = pool_size / team_size_grain ;
774
// Constraint : pool_size = m_team_alloc * team_count
775
m_team_alloc = pool_size / team_count ;
781
//! Tag this class as a kokkos execution policy
782
typedef TeamPolicy execution_policy ;
783
typedef Kokkos::Threads execution_space ;
786
Impl::if_c< ! Impl::is_same< Kokkos::Threads , Arg0 >::value , Arg0 , Arg1 >::type
789
//----------------------------------------
791
template< class FunctorType >
793
int team_size_max( const FunctorType & )
794
{ return execution_space::thread_pool_size(1); }
796
template< class FunctorType >
797
static int team_size_recommended( const FunctorType & )
798
{ return execution_space::thread_pool_size(2); }
800
//----------------------------------------
802
inline int team_size() const { return m_team_size ; }
803
inline int team_alloc() const { return m_team_alloc ; }
804
inline int league_size() const { return m_league_size ; }
806
/** \brief Specify league size, request team size */
807
TeamPolicy( execution_space & , int league_size_request , int team_size_request , int vector_length_request = 1 )
811
{ init(league_size_request,team_size_request); (void) vector_length_request; }
813
TeamPolicy( int league_size_request , int team_size_request , int vector_length_request = 1 )
817
{ init(league_size_request,team_size_request); (void) vector_length_request; }
819
typedef Impl::ThreadsExecTeamMember member_type ;
821
friend class Impl::ThreadsExecTeamMember ;
825
} /* namespace Kokkos */
828
#ifdef KOKKOS_HAVE_CXX11
832
template<typename iType>
833
KOKKOS_INLINE_FUNCTION
834
Impl::TeamThreadLoopBoundariesStruct<iType,Impl::ThreadsExecTeamMember>
835
TeamThreadLoop(const Impl::ThreadsExecTeamMember& thread, const iType& count) {
836
return Impl::TeamThreadLoopBoundariesStruct<iType,Impl::ThreadsExecTeamMember>(thread,count);
839
template<typename iType>
840
KOKKOS_INLINE_FUNCTION
841
Impl::ThreadVectorLoopBoundariesStruct<iType,Impl::ThreadsExecTeamMember >
842
ThreadVectorLoop(const Impl::ThreadsExecTeamMember& thread, const iType& count) {
843
return Impl::ThreadVectorLoopBoundariesStruct<iType,Impl::ThreadsExecTeamMember >(thread,count);
847
KOKKOS_INLINE_FUNCTION
848
Impl::ThreadSingleStruct<Impl::ThreadsExecTeamMember> PerTeam(const Impl::ThreadsExecTeamMember& thread) {
849
return Impl::ThreadSingleStruct<Impl::ThreadsExecTeamMember>(thread);
852
KOKKOS_INLINE_FUNCTION
853
Impl::VectorSingleStruct<Impl::ThreadsExecTeamMember> PerThread(const Impl::ThreadsExecTeamMember& thread) {
854
return Impl::VectorSingleStruct<Impl::ThreadsExecTeamMember>(thread);
856
} // namespace Kokkos
860
/** \brief Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
862
* The range i=0..N-1 is mapped to all threads of the the calling thread team.
863
* This functionality requires C++11 support.*/
864
template<typename iType, class Lambda>
865
KOKKOS_INLINE_FUNCTION
866
void parallel_for(const Impl::TeamThreadLoopBoundariesStruct<iType,Impl::ThreadsExecTeamMember>& loop_boundaries, const Lambda& lambda) {
867
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
871
/** \brief Inter-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
873
* The range i=0..N-1 is mapped to all threads of the the calling thread team and a summation of
874
* val is performed and put into result. This functionality requires C++11 support.*/
875
template< typename iType, class Lambda, typename ValueType >
876
KOKKOS_INLINE_FUNCTION
877
void parallel_reduce(const Impl::TeamThreadLoopBoundariesStruct<iType,Impl::ThreadsExecTeamMember>& loop_boundaries,
878
const Lambda & lambda, ValueType& result) {
880
result = ValueType();
882
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
883
ValueType tmp = ValueType();
888
result = loop_boundaries.thread.team_reduce(result,Impl::JoinAdd<ValueType>());
891
/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
893
* The range i=0..N-1 is mapped to all vector lanes of the the calling thread and a reduction of
894
* val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
895
* The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
896
* the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
897
* '1 for *'). This functionality requires C++11 support.*/
898
template< typename iType, class Lambda, typename ValueType, class JoinType >
899
KOKKOS_INLINE_FUNCTION
900
void parallel_reduce(const Impl::TeamThreadLoopBoundariesStruct<iType,Impl::ThreadsExecTeamMember>& loop_boundaries,
901
const Lambda & lambda, const JoinType& join, ValueType& init_result) {
903
ValueType result = init_result;
905
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
906
ValueType tmp = ValueType();
911
init_result = loop_boundaries.thread.team_reduce(result,Impl::JoinLambdaAdapter<ValueType,JoinType>(join));
918
/** \brief Intra-thread vector parallel_for. Executes lambda(iType i) for each i=0..N-1.
920
* The range i=0..N-1 is mapped to all vector lanes of the the calling thread.
921
* This functionality requires C++11 support.*/
922
template<typename iType, class Lambda>
923
KOKKOS_INLINE_FUNCTION
924
void parallel_for(const Impl::ThreadVectorLoopBoundariesStruct<iType,Impl::ThreadsExecTeamMember >&
925
loop_boundaries, const Lambda& lambda) {
926
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
929
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
933
/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
935
* The range i=0..N-1 is mapped to all vector lanes of the the calling thread and a summation of
936
* val is performed and put into result. This functionality requires C++11 support.*/
937
template< typename iType, class Lambda, typename ValueType >
938
KOKKOS_INLINE_FUNCTION
939
void parallel_reduce(const Impl::ThreadVectorLoopBoundariesStruct<iType,Impl::ThreadsExecTeamMember >&
940
loop_boundaries, const Lambda & lambda, ValueType& result) {
941
result = ValueType();
942
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
945
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
946
ValueType tmp = ValueType();
952
/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
954
* The range i=0..N-1 is mapped to all vector lanes of the the calling thread and a reduction of
955
* val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
956
* The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
957
* the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
958
* '1 for *'). This functionality requires C++11 support.*/
959
template< typename iType, class Lambda, typename ValueType, class JoinType >
960
KOKKOS_INLINE_FUNCTION
961
void parallel_reduce(const Impl::ThreadVectorLoopBoundariesStruct<iType,Impl::ThreadsExecTeamMember >&
962
loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& init_result) {
964
ValueType result = init_result;
965
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
968
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
969
ValueType tmp = ValueType();
973
init_result = result;
976
/** \brief Intra-thread vector parallel exclusive prefix sum. Executes lambda(iType i, ValueType & val, bool final)
979
* The range i=0..N-1 is mapped to all vector lanes in the thread and a scan operation is performed.
980
* Depending on the target execution space the operator might be called twice: once with final=false
981
* and once with final=true. When final==true val contains the prefix sum value. The contribution of this
982
* "i" needs to be added to val no matter whether final==true or not. In a serial execution
983
* (i.e. team_size==1) the operator is only called once with final==true. Scan_val will be set
984
* to the final sum value over all vector lanes.
985
* This functionality requires C++11 support.*/
986
template< typename iType, class FunctorType >
987
KOKKOS_INLINE_FUNCTION
988
void parallel_scan(const Impl::ThreadVectorLoopBoundariesStruct<iType,Impl::ThreadsExecTeamMember >&
989
loop_boundaries, const FunctorType & lambda) {
991
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ;
992
typedef typename ValueTraits::value_type value_type ;
994
value_type scan_val = value_type();
996
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
999
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
1000
lambda(i,scan_val,true);
1004
} // namespace Kokkos
1008
template<class FunctorType>
1009
KOKKOS_INLINE_FUNCTION
1010
void single(const Impl::VectorSingleStruct<Impl::ThreadsExecTeamMember>& single_struct, const FunctorType& lambda) {
1014
template<class FunctorType>
1015
KOKKOS_INLINE_FUNCTION
1016
void single(const Impl::ThreadSingleStruct<Impl::ThreadsExecTeamMember>& single_struct, const FunctorType& lambda) {
1017
if(single_struct.team_member.team_rank()==0) lambda();
1020
template<class FunctorType, class ValueType>
1021
KOKKOS_INLINE_FUNCTION
1022
void single(const Impl::VectorSingleStruct<Impl::ThreadsExecTeamMember>& single_struct, const FunctorType& lambda, ValueType& val) {
1026
template<class FunctorType, class ValueType>
1027
KOKKOS_INLINE_FUNCTION
1028
void single(const Impl::ThreadSingleStruct<Impl::ThreadsExecTeamMember>& single_struct, const FunctorType& lambda, ValueType& val) {
1029
if(single_struct.team_member.team_rank()==0) {
1032
single_struct.team_member.team_broadcast(val,0);
1035
#endif // KOKKOS_HAVE_CXX11
1037
//----------------------------------------------------------------------------
1038
//----------------------------------------------------------------------------
1040
#endif /* #define KOKKOS_THREADSEXEC_HPP */