83
75
//----------------------------------------------------------------------
85
77
namespace phdmesh {
80
// Pointer-to-function type for a reduction callback: returns void and
// takes two untyped buffers (named 'inv' and 'outv'), an int pointer and
// a ParallelDatatype pointer.
// NOTE(review): presumably mirrors the MPI user-function signature
// (input buffer, in/out buffer, count, datatype) -- confirm.
typedef void (*ParallelReduceOp)
81
( void * inv , void * outv , int * , ParallelDatatype * );
84
void all_reduce_internal( ParallelMachine arg_comm ,
85
ParallelReduceOp arg_op ,
87
92
// Blank namespace so that this class produces local symbols,
88
93
// avoiding linker complaints about multiply-defined symbols.
92
// No-op: takes a WorkType by reference and ignores it.
// NOTE(review): presumably the end-of-chain case for the
// m_next.copyin( ... ) calls made by Reduce; the enclosing type is not
// visible in this view -- confirm.
void copyin( WorkType & ) const {}
93
// No-op: takes a WorkType by reference and ignores it.
// NOTE(review): presumably the end-of-chain case for the
// m_next.copyout( ... ) calls made by Reduce -- confirm.
void copyout( WorkType & ) const {}
94
// No-op: takes two WorkType references and ignores both.
// NOTE(review): presumably the end-of-chain case for the
// Next::op( ... ) calls made by Reduce -- confirm.
static void op( WorkType & , WorkType & ) {}
97
// No-op: takes a BufferType by reference and ignores it.
// NOTE(review): presumably the end-of-chain case for the
// m_next.copyin( ... ) calls made by Reduce; the enclosing type is not
// visible in this view -- confirm.
void copyin( BufferType & ) const {}
98
// No-op: takes a BufferType by reference and ignores it.
// NOTE(review): presumably the end-of-chain case for the
// m_next.copyout( ... ) calls made by Reduce -- confirm.
void copyout( BufferType & ) const {}
99
// No-op: takes two BufferType references and ignores both.
// NOTE(review): presumably the end-of-chain case for the
// Next::op( ... ) calls made by Reduce -- confirm.
static void op( BufferType & , BufferType & ) {}
97
102
// Workhorse class for aggregating reduction operations.
99
template <class Op, typename T, class Next>
104
template < class Oper , class Next = ReduceEnd >
106
typename Next::WorkType m_next ;
106
typedef typename Oper::type Type ;
107
enum { N = Oper::N };
111
typename Next::BufferType m_next ;
113
// Copy values into buffer:
// Calls Copy<N>( w.m_value , m_value ) for this node, then hands
// w.m_next to m_next.copyin so the remaining nodes fill their own slots.
// NOTE(review): assumes Copy<N>( dst , src ) copies N elements from src
// to dst; Copy is defined outside this view -- confirm argument order.
114
void copyin( WorkType & w ) const
115
{ Copy<N>( w.m_value , m_value ); m_next.copyin( w.m_next ); }
117
// Copy value out from buffer:
// Calls Copy<N>( m_value , w.m_value ) for this node (the arguments are
// swapped relative to copyin), then hands w.m_next to m_next.copyout.
// NOTE(review): assumes Copy<N>( dst , src ) -- confirm argument order.
118
void copyout( WorkType & w ) const
119
{ Copy<N>( m_value , w.m_value ); m_next.copyout( w.m_next ); }
121
// Reduction function
// Static, so it needs no Reduce instance. Constructs a temporary Op from
// ( out.m_value , in.m_value ), then recurses into the rest of the chain
// through Next::op on the two m_next members.
// NOTE(review): presumably Op's two-argument constructor performs the
// reduction of 'in' into 'out' -- Op is defined outside this view.
122
static void op( WorkType & out , WorkType & in )
123
{ Op( out.m_value , in.m_value ); Next::op( out.m_next , in.m_next ); }
125
// Aggregate reduction operations, use '&' for left-to-right evaluation
// Takes a single-operation node 'rhs' (its Next is ReduceEnd) and returns
// a new Reduce whose operation and value type come from rhs and whose
// Next is this object's own type; the new node is built from
// ( rhs , *this ), i.e. this chain becomes the tail of the result.
126
template<class OpB, typename TB>
127
Reduce<OpB, TB, Reduce<Op,T,Next> >
128
operator & ( const Reduce<OpB,TB,ReduceEnd> & rhs )
129
{ return Reduce<OpB, TB, Reduce<Op,T,Next> >( rhs , *this ); }
131
// Constructor for aggregation:
// Builds a chained node: m_value is taken from the single-operation node
// 'arg_val' and m_next is copy-initialized from the existing chain
// 'arg_next'. Used by operator & to prepend a node to a chain.
132
Reduce( const Reduce<Op,T, ReduceEnd> & arg_val , const Next & arg_next )
133
: m_next( arg_next ), m_value( arg_val.m_value ) {}
135
// Constructor for aggregate member:
// Builds a node with a value-initialized m_next and m_value initialized
// from 'arg_value'. Declared explicit so a bare Type* is never silently
// converted into a Reduce.
136
explicit Reduce( Type * arg_value )
137
: m_next(), m_value( arg_value ) {}
117
// Stores arg.ptr in m_ptr and returns a reference to m_next, so calls
// can be chained as work.set( op1 ).set( op2 )... with each call binding
// one operation and moving on to the next node.
Next & set( const Oper & arg ) { m_ptr = arg.ptr ; return m_next ; }
119
// Declaration only; the definition is supplied out of line as
// Reduce<Oper,Next>::reduce( ParallelMachine ) const.
void reduce( ParallelMachine comm ) const ;
121
// Calls Copy<N>( b.m_value , m_ptr ) for this node, then hands b.m_next
// to m_next.copyin so the remaining nodes fill their own slots.
// NOTE(review): assumes Copy<N>( dst , src ) copies N elements from src
// to dst; Copy is defined outside this view -- confirm argument order.
void copyin( BufferType & b ) const
122
{ Copy<N>( b.m_value , m_ptr ); m_next.copyin( b.m_next ); }
124
// Calls Copy<N>( m_ptr , b.m_value ) for this node (the arguments are
// swapped relative to copyin), then hands b.m_next to m_next.copyout.
// NOTE(review): assumes Copy<N>( dst , src ) -- confirm argument order.
void copyout( BufferType & b ) const
125
{ Copy<N>( m_ptr , b.m_value ); m_next.copyout( b.m_next ); }
127
// Static, so it needs no Reduce instance. Applies
// Oper::op( dst.m_value , src.m_value ) for this node, then recurses into
// the rest of the chain through Next::op on the two m_next members.
// NOTE(review): presumably Oper::op reduces 'src' into 'dst' -- Oper is
// defined outside this view.
static void op( BufferType & dst , BufferType & src )
128
{ Oper::op(dst.m_value,src.m_value); Next::op(dst.m_next,src.m_next); }
139
130
// Declaration of the untyped reduction entry point; its parameter list
// matches the ParallelReduceOp function-pointer type. The out-of-line
// definitions cast 'inoutv' and 'inv' to the buffer type and call
// op( inoutv , inv ); the int* and ParallelDatatype* arguments are unnamed.
static void void_op( void*inv, void*inoutv, int*, ParallelDatatype*);
142
template <class Op, typename T, class Next>
143
void Reduce<Op,T,Next>::void_op( void*inv, void*inoutv,int*,ParallelDatatype*)
145
op( * reinterpret_cast<WorkType*>( inoutv ) ,
146
* reinterpret_cast<WorkType*>( inv ) );
133
template <class Oper, class Next>
134
void Reduce<Oper,Next>::void_op( void*inv, void*inoutv,int*,ParallelDatatype*)
136
op( * reinterpret_cast<BufferType*>( inoutv ) ,
137
* reinterpret_cast<BufferType*>( inv ) );
140
template <class Oper, class Next>
141
void Reduce<Oper,Next>::reduce( ParallelMachine comm ) const
143
ParallelReduceOp f = reinterpret_cast<ParallelReduceOp>( & void_op );
144
BufferType inbuf , outbuf ;
146
all_reduce_internal( comm , f , & inbuf , & outbuf , sizeof(BufferType) );
151
} // namespace phdmesh
152
153
//----------------------------------------------------------------------
153
154
//----------------------------------------------------------------------
155
156
namespace phdmesh {
157
// Factory: wraps 'value' in a single-operation Reduce (Next = ReduceEnd)
// whose operation is Sum<N>. N cannot be deduced from the argument and
// must be given explicitly, e.g. ReduceSum<5>( A ); T is deduced.
// NOTE(review): N is presumably the number of T elements at 'value' --
// confirm against Sum<N>.
template<unsigned N, typename T>
159
Reduce< Sum<N> , T, ReduceEnd> ReduceSum( T * value )
160
{ return Reduce< Sum<N>, T, ReduceEnd >( value ); }
162
// Factory: wraps 'value' in a single-operation Reduce (Next = ReduceEnd)
// whose operation is Prod<N>. N must be given explicitly at the call
// site; T is deduced from the pointer argument.
// NOTE(review): N is presumably the number of T elements at 'value' --
// confirm against Prod<N>.
template<unsigned N, typename T>
164
Reduce< Prod<N>, T, ReduceEnd > ReduceProd( T * value )
165
{ return Reduce< Prod<N>, T, ReduceEnd >( value ); }
167
// Factory: wraps 'value' in a single-operation Reduce (Next = ReduceEnd)
// whose operation is Max<N>. N must be given explicitly, e.g.
// ReduceMax<3>( B ); T is deduced from the pointer argument.
// NOTE(review): N is presumably the number of T elements at 'value' --
// confirm against Max<N>.
template<unsigned N, typename T>
169
Reduce< Max<N>, T, ReduceEnd> ReduceMax( T * value )
170
{ return Reduce< Max<N>, T, ReduceEnd>( value ); }
172
// Factory: wraps 'value' in a single-operation Reduce (Next = ReduceEnd)
// whose operation is Min<N>. N must be given explicitly at the call
// site; T is deduced from the pointer argument.
// NOTE(review): N is presumably the number of T elements at 'value' --
// confirm against Min<N>.
template<unsigned N, typename T>
174
Reduce< Min<N>, T, ReduceEnd> ReduceMin( T * value )
175
{ return Reduce<Min<N>, T, ReduceEnd>( value ); }
177
// Factory: wraps 'value' in a single-operation Reduce (Next = ReduceEnd)
// whose operation is BitOr<N>. N must be given explicitly at the call
// site; T is deduced from the pointer argument.
// NOTE(review): N is presumably the number of T elements at 'value' --
// confirm against BitOr<N>.
template<unsigned N, typename T>
179
Reduce< BitOr<N>, T, ReduceEnd> ReduceBitOr( T * value )
180
{ return Reduce< BitOr<N>, T, ReduceEnd>( value ); }
182
// Factory: wraps 'value' in a single-operation Reduce (Next = ReduceEnd)
// whose operation is BitAnd<N>. N must be given explicitly at the call
// site; T is deduced from the pointer argument.
// NOTE(review): N is presumably the number of T elements at 'value' --
// confirm against BitAnd<N>.
template<unsigned N, typename T>
184
Reduce< BitAnd<N>, T, ReduceEnd> ReduceBitAnd( T * value )
185
{ return Reduce< BitAnd<N>, T, ReduceEnd>( value ); }
187
//----------------------------------------------------------------------
188
// all_reduce( comm , ReduceSum<5>( A ) & ReduceMax<3>( B ) );
191
// Pointer-to-function type for a reduction callback: returns void and
// takes two untyped buffers (named 'inv' and 'outv'), an int pointer and
// a ParallelDatatype pointer.
// NOTE(review): presumably mirrors the MPI user-function signature
// (input buffer, in/out buffer, count, datatype) -- confirm.
typedef void (*ParallelReduceOp)
192
( void * inv , void * outv , int * , ParallelDatatype * );
195
void all_reduce( ParallelMachine arg_comm ,
196
ParallelReduceOp arg_op ,
203
template < class ReduceOp >
204
void all_reduce_driver( ParallelMachine comm , const ReduceOp & op )
206
typedef typename ReduceOp::WorkType WorkType ;
208
WorkType inbuf , outbuf ;
211
reinterpret_cast<ParallelReduceOp>( & ReduceOp::void_op );
213
all_reduce( comm , f , & inbuf, & outbuf, sizeof(WorkType) );
214
op.copyout( outbuf );
219
// Entry point taking an aggregated reduction object, e.g.
//   all_reduce( comm , ReduceSum<5>( A ) & ReduceMax<3>( B ) );
// Forwards 'comm' and 'op' unchanged to all_reduce_driver<ReduceOp>.
template < class ReduceOp >
221
void all_reduce( ParallelMachine comm , const ReduceOp & op )
222
{ all_reduce_driver<ReduceOp>( comm , op ); }
158
template < class Op1 >
160
void all_reduce( ParallelMachine comm , const Op1 & op1 )
167
template < class Op1 , class Op2 >
169
void all_reduce( ParallelMachine comm , const Op1 & op1 ,
173
Reduce< Op2 > > work ;
174
work.set( op1 ).set( op2 );
178
template < class Op1 , class Op2 , class Op3 >
180
void all_reduce( ParallelMachine comm , const Op1 & op1 ,
186
Reduce< Op3 > > > work ;
187
work.set( op1 ).set( op2 ).set( op3 );
191
template < class Op1 , class Op2 , class Op3 , class Op4 >
193
void all_reduce( ParallelMachine comm , const Op1 & op1 ,
201
Reduce< Op4 > > > > work ;
202
work.set( op1 ).set( op2 ).set( op3 ).set( op4 );
206
template < class Op1 , class Op2 , class Op3 , class Op4 ,
209
void all_reduce( ParallelMachine comm , const Op1 & op1 ,
219
Reduce< Op5 > > > > > work ;
220
work.set( op1 ).set( op2 ).set( op3 ).set( op4 ).set( op5 );
224
template < class Op1 , class Op2 , class Op3 , class Op4 ,
225
class Op5 , class Op6 >
227
void all_reduce( ParallelMachine comm , const Op1 & op1 ,
239
Reduce< Op6 > > > > > > work ;
240
work.set( op1 ).set( op2 ).set( op3 ).set( op4 ).set( op5 ).set( op6 );