5
/* $Id: message.c,v 1.58.6.4 2007-04-24 10:08:26 vinod Exp $ */
27
#ifdef _POSIX_PRIORITY_SCHEDULING
36
#if defined(SYSV) || defined(MMAP) ||defined (WIN32)
40
/* global operations use a buffer of BUF_SIZE doubles */
41
#define BUF_SIZE (4*2048)
42
/* Size in bytes of a buffer of BUF_SIZE doubles, less one double.
 * The element size has to be sizeof(double): sizeof(BUF_SIZE) is the size of
 * the int constant expression (4*2048), which undercounts the buffer. */
#define INFO_BUF_SIZE (BUF_SIZE*sizeof(double) - sizeof(double))
47
static double *work=NULL;
48
static long *lwork = NULL;
49
static long long *llwork = NULL;
50
static int *iwork = NULL;
51
static float *fwork = NULL;
52
static int _armci_gop_init=0; /* tells us if we have a buffers allocated */
53
static int _armci_gop_shmem =0; /* tells us to use shared memory for gops */
54
extern void armci_util_wait_int(volatile int *, int , int );
55
static int empty=EMPTY,full=FULL;
56
#if !defined(SGIALTIX) && defined(SYSV) || defined(MMAP) || defined(WIN32)
57
static void **ptr_arr=NULL;
69
double array[BUF_SIZE];
72
static bufstruct *_gop_buffer;
74
/* GOP_BUF(p): address of the _gop_buffer slot for index p, i.e. the element
 * located (p - armci_master) entries past the start of _gop_buffer. */
#define GOP_BUF(p) (&_gop_buffer[(p)-armci_master])
76
/*\ macro to set a flag; includes a memory barrier to ensure that the flag is not set
77
* before any outstanding writes complete
81
/* Variant that sets the flag through _clear_lock().  No trailing ';' inside
 * the macro: callers write `SET_SHM_FLAG(...);`, and an embedded semicolon
 * would break `if (...) SET_SHM_FLAG(...); else ...`. */
# define SET_SHM_FLAG(_flg,_val) _clear_lock((int *)(_flg),(_val))
83
/* Variant that issues MEM_FENCE and then stores _val into *_flg.  The two
 * statements are wrapped in do { } while(0) so they stay together when the
 * macro is the whole body of an unbraced if/for, e.g.
 * `for(...) SET_SHM_FLAG(...);` -- otherwise only the fence would be looped. */
# define SET_SHM_FLAG(_flg,_val) do { MEM_FENCE; *(_flg)=(_val); } while(0)
84
# elif defined(__ia64)
85
# if defined(__GNUC__) && !defined (__INTEL_COMPILER)
86
# define SET_SHM_FLAG(_flg,_val)\
87
__asm__ __volatile__ ("mf" ::: "memory"); *(_flg)=(_val)
88
# else /* Intel Compiler */
89
extern void _armci_ia64_mb();
90
# define SET_SHM_FLAG(_flg,_val)\
91
_armci_ia64_mb(); *(_flg)=(_val);
94
# if defined(__GNUC__)
95
# define SET_SHM_FLAG(_flg,_val)\
96
*(_flg)=(_val);__asm__ __volatile__ ("isync" ::: "memory")
102
/* Plain-store variant: writes _val into *_flg with no barrier instruction
 * (presumably the default when no platform-specific variant applies --
 * confirm).  _val is parenthesized and the macro carries no trailing ';',
 * so `if (...) SET_SHM_FLAG(...); else ...` parses as intended. */
# define SET_SHM_FLAG(_flg,_val) *(_flg)=(_val)
108
* Variables/structures for use in Barrier and for Binomial tree
114
static int LnB=0,powof2nodes,Lp2;
120
barrier_struct *_bar_buff;
121
/* BAR_BUF(p): address of entry p of the _bar_buff array. */
#define BAR_BUF(p) (&_bar_buff[(p)])
122
void **barr_snd_ptr,**barr_rcv_ptr;
123
int _armci_barrier_init=0;
124
int _armci_barrier_shmem=0;
128
* Tree generation code
130
static void _dfs_bintree_parse(int *idlist, int index, int max, int *result)
132
int left = (int)2*index+1;
133
int right = (int) 2*index+2;
137
result[pos++]=idlist[index];
139
_dfs_bintree_parse(idlist,left,max,result);
143
_dfs_bintree_parse(idlist,right,max,result);
145
result[pos++]=idlist[index];
147
static int tree_unique_id=0;
148
int armci_msg_generate_tree(int *idlist,int idlen,int *id_tree,int TREE)
151
/*for now everything is binary tree*/
152
_dfs_bintree_parse(idlist,0,idlen,id_tree);
153
return tree_unique_id++;
157
* *************************************************************
160
char *mp_group_name = (char *)NULL;
162
char *mp_group_name = "mp_working_group";
166
static void _allocate_mem_for_work(){
167
work = (double *)malloc(sizeof(double)*BUF_SIZE);
168
if(!work)armci_die("malloc in _allocate_mem_for_work failed",0);
169
lwork = (long *)work; iwork = (int *)work; fwork = (float *)work;
170
llwork = (long long *)work;
174
static void _deallocate_mem_for_work(){
175
if (work) free(work);
176
work = NULL; lwork = NULL; iwork = NULL; fwork = NULL; llwork = NULL;
180
/*\ allocate and initialize buffers used for collective communication
182
void armci_msg_gop_init()
184
/*work was a static global array of doubles. It has been changed to get
185
memory from malloc because of a problem with cc on SV1
187
if(work==NULL)_allocate_mem_for_work();
188
#if !defined(SGIALTIX) && defined(SYSV) || defined(MMAP) || defined(WIN32)
189
if(ARMCI_Uses_shm()){
192
int size = sizeof(bufstruct);
193
int bytes = size * armci_clus_info[armci_clus_me].nslave;
195
void armci_msg_barr_init();
196
armci_msg_barr_init();
198
bytes += size*2; /* extra for brdcst */
200
ptr_arr = (void**)malloc(armci_nproc*sizeof(void*));
201
if(armci_me==armci_master) bytes += 128;
203
PARMCI_Malloc(ptr_arr, bytes);
204
tmp = (char*)ptr_arr[armci_master];
207
printf("%d: allocate gop buffer %p %d\n",armci_me,tmp,bytes);
211
if(!tmp) armci_die("armci_msg_init: shm malloc failed\n",size);
212
_gop_buffer = ( bufstruct *) tmp;
213
work = GOP_BUF(armci_me)->array; /* each process finds its place */
214
GOP_BUF(armci_me)->a.flag=EMPTY; /* initially buffer is empty */
215
GOP_BUF(armci_me)->b.flag=EMPTY; /* initially buffer is empty */
216
if(armci_me == armci_master ){
217
GOP_BUF(armci_clus_last+1)->a.flag=EMPTY;/*initially buffer is empty*/
218
GOP_BUF(armci_clus_last+2)->a.flag=EMPTY;/*initially buffer is empty*/
219
GOP_BUF(armci_clus_last+1)->b.flag=EMPTY;/*initially buffer is empty*/
220
GOP_BUF(armci_clus_last+2)->b.flag=EMPTY;/*initially buffer is empty*/
222
_armci_gop_shmem = 1;
225
/*stuff needed for barrier and binomial bcast/reduce*/
227
if(!_armci_barrier_shmem){
228
int size = 2*sizeof(int);
229
/*allocate memory to send/rcv data*/
230
barr_snd_ptr = (void **)malloc(sizeof(void *)*armci_nproc);
231
barr_rcv_ptr = (void **)malloc(sizeof(void *)*armci_nproc);
233
if(PARMCI_Malloc(barr_snd_ptr,size))armci_die("malloc barrinit failed",0);
234
if(PARMCI_Malloc(barr_rcv_ptr,size))armci_die("malloc barrinit failed",0);
235
if(barr_rcv_ptr[armci_me]==NULL || barr_snd_ptr[armci_me]==NULL)
236
armci_die("problems in malloc barr_init",0);
238
LnB = floor(log(armci_nclus)/log(2))+1;
239
if(pow(2,LnB-1)<armci_nclus){powof2nodes=0;}
240
/*Lp2 is the largest pow-of-2 less than or equal to nclus(num of nodes)*/
242
_armci_barrier_init = 1;
244
/****************************************************/
250
void armci_msg_gop_finalize()
252
#if !defined(SGIALTIX) && defined(SYSV) || defined(MMAP) || defined(WIN32)
253
if(ARMCI_Uses_shm()){
254
PARMCI_Free(ptr_arr[armci_me]);
258
_deallocate_mem_for_work();
264
#if defined(SYSV) || defined(MMAP) || defined(WIN32)
269
#elif defined(_POSIX_PRIORITY_SCHEDULING)
278
* n represents number of time delay units
279
* notused is useful to fool compiler by passing address of sensitive variable
281
#define DUMMY_INIT 1.0001
282
double _armci_dummy_work=DUMMY_INIT;
283
void armci_util_spin(int n, void *notused)
287
if(armci_msg_me()>-1) _armci_dummy_work *=DUMMY_INIT;
288
if(_armci_dummy_work>(double)armci_msg_nproc())_armci_dummy_work=DUMMY_INIT;
292
/***************************Barrier Code*************************************/
294
void armci_msg_barr_init(){
295
#if defined(SYSV) || defined(MMAP) || defined(WIN32)
296
int size=sizeof(barrier_struct)*armci_clus_info[armci_clus_me].nslave;
300
/*First allocate space for flags*/
302
ptr_arr = (void**)malloc(armci_nproc*sizeof(void*));
303
if(armci_me==armci_master) size = size+128;
305
PARMCI_Malloc(ptr_arr, size);
306
tmp = (char*)ptr_arr[armci_master];
309
if(!tmp)armci_die("allocate barr shm failed",0);
310
_bar_buff=(barrier_struct *)tmp;
312
SET_SHM_FLAG(&(BAR_BUF(armci_me-armci_master)->flag1),empty);
313
SET_SHM_FLAG(&(BAR_BUF(armci_me-armci_master)->flag2),empty);
315
/*allocate memory to send/rcv data*/
316
barr_snd_ptr = (void **)malloc(sizeof(void *)*armci_nproc);
317
barr_rcv_ptr = (void **)malloc(sizeof(void *)*armci_nproc);
319
if(PARMCI_Malloc(barr_snd_ptr,size))armci_die("malloc barr_init failed",0);
320
if(PARMCI_Malloc(barr_rcv_ptr,size))armci_die("malloc barr_init failed",0);
321
if(barr_rcv_ptr[armci_me]==NULL || barr_snd_ptr[armci_me]==NULL)
322
armci_die("problems in malloc barr_init",0);
324
/*we have to figure out whether we have a power-of-two number of nodes*/
326
LnB = (int)floor(log(armci_nclus)/log(2))+1;
327
if(pow(2,LnB-1)<armci_nclus){powof2nodes=0;}
328
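/*Lp2 is meant to be the largest pow-of-2 less than or equal to nclus(num of nodes); NOTE(review): with LnB=floor(log2(nclus))+1, 2^LnB as assigned below is the next pow-of-2 above nclus - confirm*/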
329
Lp2 = (int)pow(2,LnB);
330
_armci_barrier_shmem = 1;
332
_armci_barrier_init = 1;
336
*armci barrier: implemented as a recursive doubling based pairwise exchange
337
*algorithm with SMP barrier inside a node and msg_snd/rcv between the nodes.
338
*NOTE: code for power-of-two and non-power-of-two numbers of nodes can be combined.
341
static void _armci_msg_barrier(){
342
int next_node,next,i;
344
int nslave = armci_clus_info[armci_clus_me].nslave;
345
static int barr_count = 0;
346
int last, next_nodel=0;
347
void armci_util_wait_int(volatile int *,int,int);
348
/*if(barr_count==0)armci_msg_barr_init();*/
350
if(armci_me==armci_master){ /*only masters do the intenode barrier*/
351
for(i=1;i<nslave;i++){ /*wait for all smp procs to enter the barrier*/
352
armci_util_wait_int(&BAR_BUF(i)->flag1,FULL,100000);
353
SET_SHM_FLAG(&(BAR_BUF(i)->flag1),empty);
356
last = ((int)pow(2,(LnB-1)))^armci_clus_me;
357
if(last>=0 && last<armci_nclus)
358
next_nodel = armci_clus_info[last].master;
359
/*three step exchange if num of nodes is not pow of 2*/
360
/*divide _nodes_ into two sets, first set "pow2" will have a power of
361
*two nodes, the second set "not-pow2" will have the remaining.
362
*Each node in the not-pow2 set will have a pair node in the pow2 set.
363
*Step-1:each node in pow2 set with a pair in not-pow2 set first recvs
364
* :a message from its pair in not-pow2.
365
*step-2:All nodes in pow2 do a Recursive Doubling based pairwise exchange.
366
*step-3:Each node in pow2 with a pair in not-pow2 snds msg to its
368
*if num of nodes a pow of 2, only step 2 executed
370
if(last>armci_clus_me){ /*the pow2 set of procs*/
371
if(last<armci_nclus && !powof2nodes){ /*step 1*/
372
dstn = (char *)barr_rcv_ptr[next_nodel];
373
armci_msg_rcv(ARMCI_TAG, dstn,4,NULL,next_nodel);
374
armci_util_wait_int((volatile int *)dstn,barr_count,100000);
376
for(i=0;i<LnB-1;i++){/*step 2*/
377
next=((int)pow(2,i))^armci_clus_me;
378
/*printf("\n%d:next=%d \n",armci_me,next);fflush(stdout);*/
379
if(next>=0 && next<armci_nclus){
380
next_node = armci_clus_info[next].master;
381
/*printf("\n%d:node=%d -\n",armci_me,next_node);fflush(stdout);*/
382
srcp = (char *)barr_snd_ptr[next_node];
383
*(int *)srcp = barr_count;
384
dstn = (char *)barr_rcv_ptr[next_node];
385
if(next_node > armci_me){
386
armci_msg_snd(ARMCI_TAG, srcp,4,next_node);
387
armci_msg_rcv(ARMCI_TAG, dstn,4,NULL,next_node);
390
/*would we gain anything by doing a snd,rcv instead of rcv,snd*/
391
armci_msg_rcv(ARMCI_TAG, dstn,4,NULL,next_node);
392
armci_msg_snd(ARMCI_TAG, srcp,4,next_node);
394
armci_util_wait_int((volatile int *)dstn,barr_count,100000);
397
if(last<armci_nclus && !powof2nodes){ /*step 3*/
398
srcp = (char *)barr_snd_ptr[next_nodel];
399
*(int *)srcp = barr_count;
400
armci_msg_snd(ARMCI_TAG, srcp,4,next_nodel);
405
srcp = (char *)barr_snd_ptr[next_nodel];
406
*(int *)srcp = barr_count;
407
dstn = (char *)barr_rcv_ptr[next_nodel];
408
armci_msg_snd(ARMCI_TAG, srcp,4,next_nodel);
409
armci_msg_rcv(ARMCI_TAG, dstn,4,NULL,next_nodel);
410
armci_util_wait_int((volatile int *)dstn,barr_count,100000);
413
} /* paranthesis for if armci_nclus>1*/
414
for(i=1;i<nslave;i++) /*tell smp procs that internode barrier complete*/
415
SET_SHM_FLAG(&(BAR_BUF(i)->flag2),full);
417
else { /*if not master, partake in the smp barrier,only*/
418
i=armci_me-armci_master;
419
SET_SHM_FLAG(&(BAR_BUF(i)->flag1),full);
420
armci_util_wait_int(&BAR_BUF(i)->flag2,FULL,100000);
421
SET_SHM_FLAG(&(BAR_BUF(i)->flag2),empty);
425
#endif /*barrier enabled only for lapi*/
426
void parmci_msg_barrier()
429
bgml_barrier (3); /* this is always faster than MPI_Barrier() */
431
MPI_Barrier(ARMCI_COMM_WORLD);
433
pvm_barrier(mp_group_name, armci_nproc);
435
#if !defined(NEED_MEM_SYNC)
436
if(_armci_barrier_init)
437
_armci_msg_barrier();
441
tcg_synch(ARMCI_TAG);
445
tcg_synch(ARMCI_TAG);
449
/***********************End Barrier Code*************************************/
452
void armci_msg_init(int *argc, char ***argv)
456
tcg_pbegin(*argc,*argv);
462
MPI_Initialized(&flag);
464
# if defined(DCMF) || defined(MPI_MT)
466
MPI_Init_thread(argc, argv, MPI_THREAD_MULTIPLE, &provided);
468
MPI_Init(argc, argv);
471
if (!PARMCI_Initialized()) {
472
MPI_Comm_dup(MPI_COMM_WORLD, &ARMCI_COMM_WORLD);
481
return BGML_Messager_rank();
483
return DCMF_Messager_rank();
485
static int counter = 0;
488
MPI_Comm_rank(ARMCI_COMM_WORLD, &me);
495
return(pvm_getinst(mp_group_name,pvm_mytid()));
497
return (int)tcg_nodeid();
502
int armci_msg_nproc()
505
return BGML_Messager_size();
507
return DCMF_Messager_size();
509
static int counter = 0;
512
MPI_Comm_size(ARMCI_COMM_WORLD, &nproc);
518
return(pvm_gsize(mp_group_name));
520
return (int)tcg_nnodes();
525
#define BROKEN_MPI_ABORT
545
void armci_msg_abort(int code)
548
fprintf(stderr,"ARMCI aborting [%d]\n", code);
550
fprintf(stderr,"ARMCI aborting [%d]\n", code);
552
# ifndef BROKEN_MPI_ABORT
553
MPI_Abort(ARMCI_COMM_WORLD,code);
557
sprintf(error_msg, "ARMCI aborting [%d]", code);
560
tcg_error("ARMCI aborting",(long)code);
562
fprintf(stderr,"%d:aborting\n",armci_me);
563
/* trap for broken abort in message passing libs */
567
void armci_msg_finalize()
576
void armci_msg_bintree(int scope, int* Root, int *Up, int *Left, int *Right)
578
int root, up, left, right, index, nproc;
579
if(scope == SCOPE_NODE){
580
root = armci_clus_info[armci_clus_me].master;
581
nproc = armci_clus_info[armci_clus_me].nslave;
582
index = armci_me - root;
583
up = (index-1)/2 + root; if( up < root) up = -1;
584
left = 2*index + 1 + root; if(left >= root+nproc) left = -1;
585
right = 2*index + 2 + root; if(right >= root+nproc)right = -1;
586
}else if(scope ==SCOPE_MASTERS){
587
root = armci_clus_info[0].master;
589
if(armci_me != armci_master){up = -1; left = -1; right = -1; }
591
index = armci_clus_me - root;
592
up = (index-1)/2 + root;
593
up = ( up < root)? -1: armci_clus_info[up].master;
594
left = 2*index + 1 + root;
595
left = ( left >= root+nproc)? -1: armci_clus_info[left].master;
596
right = 2*index + 2 + root;
597
right =( right >= root+nproc)? -1: armci_clus_info[right].master;
602
index = armci_me - root;
603
up = (index-1)/2 + root; if( up < root) up = -1;
604
left = 2*index + 1 + root; if(left >= root+nproc) left = -1;
605
right = 2*index + 2 + root; if(right >= root+nproc)right = -1;
614
/*\ root broadcasts to everyone else
616
void armci_msg_bcast_scope(int scope, void *buf, int len, int root)
618
int up, left, right, Root;
620
if(!buf)armci_die("armci_msg_bcast: NULL pointer", len);
622
BGTr_Bcast(root, buf, len, 3);
624
armci_msg_bintree(scope, &Root, &up, &left, &right);
627
if(armci_me == root) armci_msg_snd(ARMCI_TAG, buf,len, Root);
628
if(armci_me ==Root) armci_msg_rcv(ARMCI_TAG, buf, len, NULL, root);
631
/* printf("%d: scope=%d left=%d right=%d up=%d\n",armci_me, scope,
634
if(armci_me != Root && up!=-1) armci_msg_rcv(ARMCI_TAG, buf, len, NULL, up);
635
if (left > -1) armci_msg_snd(ARMCI_TAG, buf, len, left);
636
if (right > -1) armci_msg_snd(ARMCI_TAG, buf, len, right);
643
/*\ shared memory based broadcast for a single SMP node
645
void armci_smp_bcast(void *x, int n , int root)
647
int ndo, len,i, bufsize = BUF_SIZE*sizeof(double);
650
if(armci_clus_info[armci_clus_me].nslave<2) return; /* nothing to do */
652
if(!x)armci_die("armci_msg_bcast: NULL pointer", n);
654
/* enable or balance pipeline for messages comparable to bufsize */
655
if((n>bufsize/2) && (n <(2*bufsize-64))){
656
bufsize = n/2; bufsize>>=3; bufsize<<=3;
659
while ((ndo = (n<=bufsize) ? n : bufsize)) {
664
/* wait for the flag protecting the buffer to clear */
665
armci_util_wait_int(&(GOP_BUF(armci_clus_last+bufid)->a.flag),EMPTY,100);
666
SET_SHM_FLAG(&(GOP_BUF(armci_clus_last+bufid)->a.flag),full);
668
for(i=armci_clus_first; i <= armci_clus_last; i++)
669
if(i!=root)armci_util_wait_int(&GOP_BUF(i)->b.flag, EMPTY, 100);
670
armci_copy(x,GOP_BUF(armci_clus_last+bufid+1)->array,len);
671
for(i=armci_clus_first; i <= armci_clus_last; i++)
672
if(i!=root) GOP_BUF(i)->b.flag=FULL;
674
armci_copy(x,GOP_BUF(armci_clus_last+bufid)->array,len);
675
for(i=armci_clus_first; i <= armci_clus_last; i++)
677
armci_util_wait_int(&GOP_BUF(i)->b.flag, EMPTY, 100);
678
SET_SHM_FLAG(&(GOP_BUF(i)->b.flag),full);
682
armci_util_wait_int(&GOP_BUF(armci_me)->b.flag, FULL, 100);
683
armci_copy(GOP_BUF(armci_clus_last+bufid)->array,x,len);
684
SET_SHM_FLAG(&(GOP_BUF(armci_me)->b.flag),empty);
690
bufid = (bufid)%2 +1;
692
/* since root waited for everybody to check in, the previous buffer is free */
694
SET_SHM_FLAG(&(GOP_BUF(armci_clus_last+bufid)->a.flag),empty);
701
/*\ shared memory based broadcast for a single SMP node out of shmem buffer
703
void armci_smp_buf_bcast(void *x, int n, int root, void *shmbuf )
705
int i, nslave = armci_clus_info[armci_clus_me].nslave;
708
armci_copy(shmbuf,x,n);
709
return; /* nothing to do */
711
if(!x)armci_die("armci_msg_bcast: NULL pointer", n);
712
if(!shmbuf)armci_die("armci_msg_bcast: NULL pointer", n);
715
/* notify others that the data in buffer is ready */
716
for(i=armci_clus_first; i <= armci_clus_last; i++)
718
armci_util_wait_int(&GOP_BUF(i)->b.flag, EMPTY, 100);
719
GOP_BUF(i)->b.flag=FULL;
721
/* root also needs to copy */
722
armci_copy(shmbuf,x,n);
723
/* wait until everybody is finished -- can reclaim buffer */
724
for(i=armci_clus_first; i <= armci_clus_last; i++)
725
if(i!=root)armci_util_wait_int(&GOP_BUF(i)->b.flag, EMPTY,100000);
728
/* spin until data in buffer is ready */
729
armci_util_wait_int(&GOP_BUF(armci_me)->b.flag , FULL, 100000);
730
armci_copy(shmbuf,x,n); /* copy data */
731
GOP_BUF(armci_me)->b.flag = EMPTY; /* indicate we are done */
735
void _armci_msg_binomial_bcast(void *buf, int len, int root){
736
int Root = armci_master;
737
int nslave = armci_clus_info[armci_clus_me].nslave;
738
int i,next_node,next;
739
/* int my_rank,root_rank,next_rank; */
740
/* inter-node operation between masters */
741
if(root !=armci_clus_info[0].master){
742
Root = armci_clus_info[0].master;
743
if(armci_me == root) armci_msg_snd(ARMCI_TAG, buf,len, Root);
744
if(armci_me ==Root) armci_msg_rcv(ARMCI_TAG, buf, len, NULL, root);
748
if(armci_nclus>1 &&armci_me==armci_master){/*the internode bcast, first*/
749
/*first do the recv*/
750
int rcv_proc=armci_clus_me,flag=1,diff=1;
752
while(!(rcv_proc & flag)){
756
rcv_proc = armci_clus_info[armci_clus_me-diff].master;
757
armci_msg_rcv(ARMCI_TAG, buf,len,NULL,rcv_proc);
758
/*printf("\n%d: recv from %d \n",armci_me,rcv_proc);fflush(stdout);*/
763
/*printf("\n%d: %d diff>>1 = %d\n",armci_me,Lp2,diff>>1);*/
764
for(i=diff>>1;i>=1;i=i>>1){
765
next=i^armci_clus_me;
766
if(next>=0 && next<armci_nclus){
767
next_node = armci_clus_info[next].master;
768
armci_msg_snd(ARMCI_TAG, buf,len,next_node);
769
/*printf("\n%d: send to %d \n",armci_me,next_node);fflush(stdout);*/
774
if(_armci_gop_shmem && nslave<33)
775
armci_smp_bcast(buf, len, Root);
777
armci_msg_bcast_scope(SCOPE_NODE, buf, len, Root);
782
#ifndef armci_msg_bcast
783
/*\ SMP-aware global broadcast routine
785
void armci_msg_bcast(void *buf, int len, int root)
787
int Root = armci_master;
788
int nslave = armci_clus_info[armci_clus_me].nslave;
790
#if defined(MPI_SPAWN) || defined(MPI_MT)
791
armci_msg_bcast_scope(SCOPE_ALL, (buf), (len), (root));
795
if(_armci_gop_init){_armci_msg_binomial_bcast(buf,len,root);return;}
797
/* inter-node operation between masters */
798
if(armci_nclus>1)armci_msg_bcast_scope(SCOPE_MASTERS, buf, len, root);
801
/* intra-node operation */
803
if(_armci_gop_shmem && nslave<33)
804
armci_smp_bcast(buf, len, Root);
807
armci_msg_bcast_scope(SCOPE_NODE, buf, len, Root);
813
void armci_msg_brdcst(void* buffer, int len, int root)
815
if(!buffer)armci_die("armci_msg_brdcast: NULL pointer", len);
818
BGTr_Bcast(root, buffer, len, PCLASS);
820
MPI_Bcast(buffer, len, MPI_CHAR, root, ARMCI_COMM_WORLD);
822
armci_msg_bcast(buffer, len, root);
825
long ttag=ARMCI_TAG, llen=len, rroot=root;
826
tcg_brdcst(ttag, buffer, llen, rroot);
832
void armci_msg_snd(int tag, void* buffer, int len, int to)
835
MPI_Send(buffer, len, MPI_CHAR, to, tag, ARMCI_COMM_WORLD);
837
pvm_psend(pvm_gettid(mp_group_name, to), tag, buffer, len, PVM_BYTE);
839
/* We don't actually use armci_msg_snd in ARMCI; we use optimized
841
* armci_msg_snd is used. If you build Global Arrays, the MPI flag is
845
armci_die("bgl shouldn't use armci_msg_snd", armci_me);
847
long ttag=tag, llen=len, tto=to, block=1;
848
tcg_snd(ttag, buffer, llen, tto, block);
853
/*\ receive message of specified tag from proc and get its len if msglen!=NULL
855
void armci_msg_rcv(int tag, void* buffer, int buflen, int *msglen, int from)
859
MPI_Recv(buffer, buflen, MPI_CHAR, from, tag, ARMCI_COMM_WORLD, &status);
860
if(msglen) MPI_Get_count(&status, MPI_CHAR, msglen);
863
pvm_precv(pvm_gettid(mp_group_name, from), tag, buffer, buflen, PVM_BYTE,
865
if(msglen)*msglen=mlen;
867
armci_die("bgl shouldn't use armci_msg_rcv", armci_me);
869
long ttag=tag, llen=buflen, mlen, ffrom=from, sender, block=1;
870
tcg_rcv(ttag, buffer, llen, &mlen, ffrom, &sender, block);
871
if(msglen)*msglen = (int)mlen;
876
int armci_msg_rcvany(int tag, void* buffer, int buflen, int *msglen)
882
ierr = MPI_Recv(buffer, buflen, MPI_CHAR, MPI_ANY_SOURCE, tag,
883
ARMCI_COMM_WORLD, &status);
884
if(ierr != MPI_SUCCESS) armci_die("armci_msg_rcvany: Recv failed ", tag);
886
if(msglen)if(MPI_SUCCESS!=MPI_Get_count(&status, MPI_CHAR, msglen))
887
armci_die("armci_msg_rcvany: count failed ", tag);
888
return (int)status.MPI_SOURCE;
891
pvm_precv(-1, tag, buffer, buflen, PVM_BYTE, &src, &rtag, &mlen);
892
if(msglen)*msglen=mlen;
893
return(pvm_getinst(mp_group_name,src));
894
# elif defined (BGML)
895
armci_die("bgl shouldn't use armci_msg_rcvany", armci_me);
897
long ttag=tag, llen=buflen, mlen, ffrom=-1, sender, block=1;
898
tcg_rcv(ttag, buffer, llen, &mlen, ffrom, &sender, block);
899
if(msglen)*msglen = (int)mlen;
905
/*\ cluster master broadcasts to everyone else in the same cluster
907
void armci_msg_clus_brdcst(void *buf, int len)
909
int root, up, left, right;
910
int tag=ARMCI_TAG, lenmes;
912
armci_msg_bintree(SCOPE_NODE, &root, &up, &left, &right);
913
if(armci_me != root) armci_msg_rcv(tag, buf, len, &lenmes, up);
914
if (left > -1) armci_msg_snd(tag, buf, len, left);
915
if (right > -1) armci_msg_snd(tag, buf, len, right);
919
/*\ reduce operation for long
921
static void ldoop(int n, char *op, long *x, long* work)
923
if (strncmp(op,"+",1) == 0)
926
else if (strncmp(op,"*",1) == 0)
929
else if (strncmp(op,"max",3) == 0)
931
*x = ARMCI_MAX(*x, *work);
934
else if (strncmp(op,"min",3) == 0)
936
*x = ARMCI_MIN(*x, *work);
939
else if (strncmp(op,"absmax",6) == 0)
941
register long x1 = ARMCI_ABS(*x), x2 = ARMCI_ABS(*work);
942
*x = ARMCI_MAX(x1, x2);
945
else if (strncmp(op,"absmin",6) == 0)
947
register long x1 = ARMCI_ABS(*x), x2 = ARMCI_ABS(*work);
948
*x = ARMCI_MIN(x1, x2);
951
else if (strncmp(op,"or",2) == 0)
957
armci_die("ldoop: unknown operation requested", n);
960
/*\ reduce operation for long x= op(work,work2)
962
static void ldoop2(int n, char *op, long *x, long* work, long* work2)
964
if (strncmp(op,"+",1) == 0)
966
*x++ = *work++ + *work2++;
967
else if (strncmp(op,"*",1) == 0)
969
*x++ = *work++ * *work2++;
970
else if (strncmp(op,"max",3) == 0)
972
*x = ARMCI_MAX(*work2, *work);
973
x++; work++; work2++;
975
else if (strncmp(op,"min",3) == 0)
977
*x = ARMCI_MIN(*work2, *work);
978
x++; work++; work2++;
980
else if (strncmp(op,"absmax",6) == 0)
982
register long x1 = ARMCI_ABS(*work), x2 = ARMCI_ABS(*work2);
983
*x = ARMCI_MAX(x1, x2);
984
x++; work++; work2++;
986
else if (strncmp(op,"absmin",6) == 0)
988
register long x1 = ARMCI_ABS(*work), x2 = ARMCI_ABS(*work2);
989
*x = ARMCI_MIN(x1, x2);
990
x++; work++; work2++;
992
else if (strncmp(op,"or",2) == 0)
995
x++; work++; work2++;
998
armci_die("ldoop2: unknown operation requested", n);
1001
/*\ reduce operation for long long
1003
static void lldoop(int n, char *op, long long *x, long long* work)
1005
if (strncmp(op,"+",1) == 0)
1008
else if (strncmp(op,"*",1) == 0)
1011
else if (strncmp(op,"max",3) == 0)
1013
*x = ARMCI_MAX(*x, *work);
1016
else if (strncmp(op,"min",3) == 0)
1018
*x = ARMCI_MIN(*x, *work);
1021
else if (strncmp(op,"absmax",6) == 0)
1023
register long long x1 = ARMCI_ABS(*x), x2 = ARMCI_ABS(*work);
1024
*x = ARMCI_MAX(x1, x2);
1027
else if (strncmp(op,"absmin",6) == 0)
1029
register long long x1 = ARMCI_ABS(*x), x2 = ARMCI_ABS(*work);
1030
*x = ARMCI_MIN(x1, x2);
1033
else if (strncmp(op,"or",2) == 0)
1039
armci_die("lldoop: unknown operation requested", n);
1042
/*\ reduce operation for long long x= op(work,work2)
1044
static void lldoop2(int n, char *op, long long *x, long long* work,
1047
if (strncmp(op,"+",1) == 0)
1049
*x++ = *work++ + *work2++;
1050
else if (strncmp(op,"*",1) == 0)
1052
*x++ = *work++ * *work2++;
1053
else if (strncmp(op,"max",3) == 0)
1055
*x = ARMCI_MAX(*work2, *work);
1056
x++; work++; work2++;
1058
else if (strncmp(op,"min",3) == 0)
1060
*x = ARMCI_MIN(*work2, *work);
1061
x++; work++; work2++;
1063
else if (strncmp(op,"absmax",6) == 0)
1065
register long long x1 = ARMCI_ABS(*work), x2 = ARMCI_ABS(*work2);
1066
*x = ARMCI_MAX(x1, x2);
1067
x++; work++; work2++;
1069
else if (strncmp(op,"absmin",6) == 0)
1071
register long long x1 = ARMCI_ABS(*work), x2 = ARMCI_ABS(*work2);
1072
*x = ARMCI_MIN(x1, x2);
1073
x++; work++; work2++;
1075
else if (strncmp(op,"or",2) == 0)
1077
*x = *work | *work2;
1078
x++; work++; work2++;
1081
armci_die("ldoop2: unknown operation requested", n);
1084
/*\ reduce operation for int
1086
static void idoop(int n, char *op, int *x, int* work)
1088
if (strncmp(op,"+",1) == 0)
1091
else if (strncmp(op,"*",1) == 0)
1094
else if (strncmp(op,"max",3) == 0)
1096
*x = ARMCI_MAX(*x, *work);
1099
else if (strncmp(op,"min",3) == 0)
1101
*x = ARMCI_MIN(*x, *work);
1104
else if (strncmp(op,"absmax",6) == 0)
1106
register int x1 = ARMCI_ABS(*x), x2 = ARMCI_ABS(*work);
1107
*x = ARMCI_MAX(x1, x2);
1110
else if (strncmp(op,"absmin",6) == 0)
1112
register int x1 = ARMCI_ABS(*x), x2 = ARMCI_ABS(*work);
1113
*x = ARMCI_MIN(x1, x2);
1116
else if (strncmp(op,"or",2) == 0)
1122
armci_die("idoop: unknown operation requested", n);
1125
/*\ reduce operation for int x= op(work,work2)
1127
static void idoop2(int n, char *op, int *x, int* work, int* work2)
1129
if (strncmp(op,"+",1) == 0)
1131
*x++ = *work++ + *work2++;
1132
else if (strncmp(op,"*",1) == 0)
1134
*x++ = *work++ * *work2++;
1135
else if (strncmp(op,"max",3) == 0)
1137
*x = ARMCI_MAX(*work2, *work);
1138
x++; work++; work2++;
1140
else if (strncmp(op,"min",3) == 0)
1142
*x = ARMCI_MIN(*work2, *work);
1143
x++; work++; work2++;
1145
else if (strncmp(op,"absmax",6) == 0)
1147
register int x1 = ARMCI_ABS(*work), x2 = ARMCI_ABS(*work2);
1148
*x = ARMCI_MAX(x1, x2);
1149
x++; work++; work2++;
1151
else if (strncmp(op,"absmin",6) == 0)
1153
register int x1 = ARMCI_ABS(*work), x2 = ARMCI_ABS(*work2);
1154
*x = ARMCI_MIN(x1, x2);
1155
x++; work++; work2++;
1157
else if (strncmp(op,"or",2) == 0)
1159
*x = *work | *work2;
1160
x++; work++; work2++;
1163
armci_die("idoop2: unknown operation requested", n);
1166
/*\ reduce operation for double
1168
static void ddoop(int n, char* op, double* x, double* work)
1170
if (strncmp(op,"+",1) == 0){
1171
if(n>63) FORT_DADD(&n,x,work);
1172
else while(n--) *x++ += *work++;
1173
}else if (strncmp(op,"*",1) == 0){
1174
if(n>63) FORT_DMULT(&n,x,work);
1175
else while(n--) *x++ *= *work++;
1176
}else if (strncmp(op,"max",3) == 0)
1178
*x = ARMCI_MAX(*x, *work);
1181
else if (strncmp(op,"min",3) == 0)
1183
*x = ARMCI_MIN(*x, *work);
1186
else if (strncmp(op,"absmax",6) == 0)
1188
register double x1 = ARMCI_ABS(*x), x2 = ARMCI_ABS(*work);
1189
*x = ARMCI_MAX(x1, x2);
1192
else if (strncmp(op,"absmin",6) == 0)
1194
register double x1 = ARMCI_ABS(*x), x2 = ARMCI_ABS(*work);
1195
*x = ARMCI_MIN(x1, x2);
1199
armci_die("ddoop: unknown operation requested", n);
1202
/*\ reduce operation for double x= op(work,work2)
1204
static void ddoop2(int n, char *op, double *x, double* work, double* work2)
1206
if (strncmp(op,"+",1) == 0){
1207
if(n>63) FORT_DADD2(&n,x,work,work2);
1208
else while(n--) *x++ = *work++ + *work2++;
1209
}else if (strncmp(op,"*",1) == 0){
1210
if(n>63) FORT_DMULT2(&n,x,work,work2);
1211
while(n--) *x++ = *work++ * *work2++;
1212
}else if (strncmp(op,"max",3) == 0)
1214
*x = ARMCI_MAX(*work2, *work);
1215
x++; work++; work2++;
1217
else if (strncmp(op,"min",3) == 0)
1219
*x = ARMCI_MIN(*work2, *work);
1220
x++; work++; work2++;
1222
else if (strncmp(op,"absmax",6) == 0)
1224
register double x1 = ARMCI_ABS(*work), x2 = ARMCI_ABS(*work2);
1225
*x = ARMCI_MAX(x1, x2);
1226
x++; work++; work2++;
1228
else if (strncmp(op,"absmin",6) == 0)
1230
register double x1 = ARMCI_ABS(*work), x2 = ARMCI_ABS(*work2);
1231
*x = ARMCI_MIN(x1, x2);
1232
x++; work++; work2++;
1235
armci_die("ddoop2: unknown operation requested", n);
1239
/*\ reduce operation for float
1241
static void fdoop(int n, char* op, float* x, float* work)
1243
if (strncmp(op,"+",1) == 0)
1246
else if (strncmp(op,"*",1) == 0)
1249
else if (strncmp(op,"max",3) == 0)
1251
*x = ARMCI_MAX(*x, *work);
1254
else if (strncmp(op,"min",3) == 0)
1256
*x = ARMCI_MIN(*x, *work);
1259
else if (strncmp(op,"absmax",6) == 0)
1261
register float x1 = ARMCI_ABS(*x), x2 = ARMCI_ABS(*work);
1262
*x = ARMCI_MAX(x1, x2);
1265
else if (strncmp(op,"absmin",6) == 0)
1267
register float x1 = ARMCI_ABS(*x), x2 = ARMCI_ABS(*work);
1268
*x = ARMCI_MIN(x1, x2);
1272
armci_die("fdoop: unknown operation requested", n);
1275
/*\ reduce operation for float x= op(work,work2)
1277
static void fdoop2(int n, char *op, float *x, float* work, float* work2)
1279
if (strncmp(op,"+",1) == 0)
1281
*x++ = *work++ + *work2++;
1282
else if (strncmp(op,"*",1) == 0)
1284
*x++ = *work++ * *work2++;
1285
else if (strncmp(op,"max",3) == 0)
1287
*x = ARMCI_MAX(*work2, *work);
1288
x++; work++; work2++;
1290
else if (strncmp(op,"min",3) == 0)
1292
*x = ARMCI_MIN(*work2, *work);
1293
x++; work++; work2++;
1295
else if (strncmp(op,"absmax",6) == 0)
1297
register float x1 = ARMCI_ABS(*work), x2 = ARMCI_ABS(*work2);
1298
*x = ARMCI_MAX(x1, x2);
1299
x++; work++; work2++;
1301
else if (strncmp(op,"absmin",6) == 0)
1303
register float x1 = ARMCI_ABS(*work), x2 = ARMCI_ABS(*work2);
1304
*x = ARMCI_MIN(x1, x2);
1305
x++; work++; work2++;
1308
armci_die("fdoop2: unknown operation requested", n);
1311
/*\ combine array of ints/longs/long longs/floats/doubles across all processes
1313
void armci_msg_gop_scope(int scope, void *x, int n, char* op, int type)
1315
int root, up, left, right, size;
1317
int ndo, len, lenmes, orign =n, ratio;
1319
if(!x)armci_die("armci_msg_gop: NULL pointer", n);
1320
if(work==NULL)_allocate_mem_for_work();
1325
if(n > 0 && (strncmp(op, "+", 1) == 0) && (type==ARMCI_INT || type==ARMCI_DOUBLE))
1330
else if(type==ARMCI_DOUBLE)
1332
BGTr_Allreduce(origx, x, n, dt, theop, -1, PCLASS);
1334
else if(n > 0 && (strncmp(op, "max", 3) == 0) && (type==ARMCI_INT || type==ARMCI_DOUBLE))
1339
else if(type==ARMCI_DOUBLE)
1341
BGTr_Allreduce(origx, x, n, dt, theop, -1, PCLASS);
1343
else if(n > 0 && (strncmp(op, "min", 3) == 0) && (type==ARMCI_INT || type==ARMCI_DOUBLE))
1348
else if(type==ARMCI_DOUBLE)
1350
BGTr_Allreduce(origx, x, n, dt, theop, -1, PCLASS);
1355
armci_msg_bintree(scope, &root, &up, &left, &right);
1357
if(type==ARMCI_INT) size = sizeof(int);
1358
else if(type==ARMCI_LONG) size = sizeof(long);
1359
else if(type==ARMCI_LONG_LONG) size = sizeof(long long);
1360
else if(type==ARMCI_FLOAT) size = sizeof(float);
1361
else size = sizeof(double);
1363
ratio = sizeof(double)/size;
1365
while ((ndo = (n<=BUF_SIZE*ratio) ? n : BUF_SIZE*ratio)) {
1366
len = lenmes = ndo*size;
1369
armci_msg_rcv(tag, lwork, len, &lenmes, left);
1370
if(type==ARMCI_INT) idoop(ndo, op, (int*)x, iwork);
1371
else if(type==ARMCI_LONG) ldoop(ndo, op, (long*)x, lwork);
1372
else if(type==ARMCI_LONG_LONG) lldoop(ndo, op,(long long*)x,llwork);
1373
else if(type==ARMCI_FLOAT) fdoop(ndo, op, (float*)x, fwork);
1374
else ddoop(ndo, op, (double*)x, work);
1378
armci_msg_rcv(tag, lwork, len, &lenmes, right);
1379
if(type==ARMCI_INT) idoop(ndo, op, (int*)x, iwork);
1380
else if(type==ARMCI_LONG) ldoop(ndo, op, (long*)x, lwork);
1381
else if(type==ARMCI_LONG_LONG) lldoop(ndo, op,(long long*)x,llwork);
1382
else if(type==ARMCI_FLOAT) fdoop(ndo, op, (float*)x, fwork);
1383
else ddoop(ndo, op, (double*)x, work);
1385
if (armci_me != root && up!=-1) armci_msg_snd(tag, x, len, up);
1391
/* Now, root broadcasts the result down the binary tree */
1393
armci_msg_bcast_scope(scope, origx, len, root);
1398
/*
 * Reduce n elements of x with operator op over the binary tree that
 * armci_msg_bintree() reports for the given scope.  The data is handled in
 * passes of at most BUF_SIZE*ratio elements: chunks received from the left
 * and right children are folded into x with the typed *doop helper, and a
 * non-root process that has a parent sends its partial result to `up`.
 * No broadcast of the result is visible in this function.
 * NOTE(review): this listing is a sparse extract; the braces, the
 * declaration of `tag`, the guards around the two receives and the
 * per-pass updates of n and x are not visible here.
 */
void armci_msg_reduce_scope(int scope, void *x, int n, char* op, int type)
1400
int root, up, left, right, size;
1402
int ndo, len, lenmes, ratio;
1403
/* NOTE(review): the message names armci_msg_gop although this is the reduce path */
if(!x)armci_die("armci_msg_gop: NULL pointer", n);
1404
if(work==NULL)_allocate_mem_for_work();
1406
armci_msg_bintree(scope, &root, &up, &left, &right);
1408
/* element size in bytes; any type not listed is treated as double */
if(type==ARMCI_INT) size = sizeof(int);
1409
else if(type==ARMCI_LONG) size = sizeof(long);
1410
else if(type==ARMCI_LONG_LONG) size = sizeof(long long);
1411
else if(type==ARMCI_FLOAT) size = sizeof(float);
1412
else size = sizeof(double);
1414
/* how many elements of this type occupy the space of one double */
ratio = sizeof(double)/size;
1416
/* ndo = elements handled in this pass (capped at BUF_SIZE*ratio); the loop ends when ndo is 0 */
while ((ndo = (n<=BUF_SIZE*ratio) ? n : BUF_SIZE*ratio)) {
1417
len = lenmes = ndo*size;
1420
/* left child: receive its chunk, then combine it into x.
 * NOTE(review): lwork is the receive buffer for every type; presumably the
 * typed work pointers alias one allocation -- confirm in _allocate_mem_for_work */
armci_msg_rcv(tag, lwork, len, &lenmes, left);
1421
if(type==ARMCI_INT) idoop(ndo, op, (int*)x, iwork);
1422
else if(type==ARMCI_LONG) ldoop(ndo, op, (long*)x, lwork);
1423
else if(type==ARMCI_LONG_LONG) lldoop(ndo, op,(long long*)x,llwork);
1424
else if(type==ARMCI_FLOAT) fdoop(ndo, op, (float*)x, fwork);
1425
else ddoop(ndo, op, (double*)x, work);
1429
/* right child: same receive-and-combine step */
armci_msg_rcv(tag, lwork, len, &lenmes, right);
1430
if(type==ARMCI_INT) idoop(ndo, op, (int*)x, iwork);
1431
else if(type==ARMCI_LONG) ldoop(ndo, op, (long*)x, lwork);
1432
else if(type==ARMCI_LONG_LONG) lldoop(ndo, op,(long long*)x,llwork);
1433
else if(type==ARMCI_FLOAT) fdoop(ndo, op, (float*)x, fwork);
1434
else ddoop(ndo, op, (double*)x, work);
1436
/* pass the partial result to the parent unless this process is the root */
if (armci_me != root && up!=-1) armci_msg_snd(tag, x, len, up);
1443
/*
 * Apply operator op to ndo elements by calling the typed helper that
 * matches `type`; x and work are cast to that element type before the call.
 * Any type other than ARMCI_INT, ARMCI_LONG, ARMCI_LONG_LONG or ARMCI_FLOAT
 * falls through to the double helper.
 */
static void gop(int type, int ndo, char* op, void *x, void *work)
1445
if(type==ARMCI_INT) idoop(ndo, op, (int*)x, (int*)work);
1446
else if(type==ARMCI_LONG) ldoop(ndo, op, (long*)x, (long*)work);
1447
else if(type==ARMCI_LONG_LONG) lldoop(ndo, op, (long long*)x, (long long*)work);
1448
else if(type==ARMCI_FLOAT) fdoop(ndo, op, (float*)x, (float*)work);
1449
else ddoop(ndo, op, (double*)x, (double*)work);
1453
/*
 * Three-buffer variant of gop().  Two bodies are visible in this extract:
 * one calls armci_copy(work2, x, ndo*size) and then the two-operand *doop
 * helpers on (x, work); the other calls the three-operand *doop2 helpers on
 * (x, work, work2).
 * NOTE(review): the lines that select between the two bodies and the
 * declaration of `size` are not visible here -- confirm against the full file.
 */
static void gop2(int type, int ndo, char* op, void *x, void *work, void *work2)
1457
/* element size in bytes; any type not listed is treated as double */
if(type==ARMCI_INT) size = sizeof(int);
1458
else if(type==ARMCI_LONG) size = sizeof(long);
1459
else if(type==ARMCI_LONG_LONG) size = sizeof(long long);
1460
else if(type==ARMCI_FLOAT) size = sizeof(float);
1461
else size = sizeof(double);
1463
/* presumably armci_copy takes (source, destination, bytes) -- confirm */
armci_copy(work2,x,ndo*size);
1465
if(type==ARMCI_INT) idoop(ndo, op, (int*)x, (int*)work);
1466
else if(type==ARMCI_LONG) ldoop(ndo, op, (long*)x, (long*)work);
1467
else if(type==ARMCI_LONG_LONG) lldoop(ndo, op, (long long*)x, (long long*)work);
1468
else if(type==ARMCI_FLOAT) fdoop(ndo, op, (float*)x, (float*)work);
1469
else ddoop(ndo, op, (double*)x, (double*)work);
1471
/* alternative body: typed three-operand helpers */
if(type==ARMCI_INT) idoop2(ndo, op, (int*)x, (int*)work, (int*)work2);
1472
else if(type==ARMCI_LONG)ldoop2(ndo,op,(long*)x,(long*)work,(long*)work2);
1473
else if(type==ARMCI_LONG_LONG) lldoop2(ndo,op,(long long*)x,(long long*)work,(long long*)work2);
1474
else if(type==ARMCI_FLOAT)fdoop2(ndo,op,(float*)x,(float*)work,(float*)work2);
1475
else ddoop2(ndo, op, (double*)x, (double*)work,(double*)work2);
1482
/*\ shared memory based reduction for a single SMP node
1484
/*
 * Reduction among the processes of the local node that exchanges data
 * through the per-process GOP_BUF() slots and their a.flag field
 * (EMPTY / FULL) instead of messages.  Returns at once when the node has
 * fewer than two processes.  The tree shape comes from
 * armci_msg_bintree(SCOPE_NODE, ...).
 * NOTE(review): this listing is a sparse extract; braces, the preprocessor
 * lines that choose between the "order oblivious" and "specific order"
 * variants, the declaration of `b` and the per-pass updates of n and x are
 * not visible here.
 */
static void armci_smp_reduce(void *x, int n, char* op, int type)
1486
int root, up, left, right, size;
1487
int ndo, len, lenmes, ratio;
1488
int nslave = armci_clus_info[armci_clus_me].nslave;
1490
if(nslave<2) return; /* nothing to do */
1492
if(!x)armci_die("armci_msg_gop: NULL pointer", n);
1494
armci_msg_bintree(SCOPE_NODE, &root, &up, &left, &right);
1496
/* element size in bytes; any type not listed is treated as double */
if(type==ARMCI_INT) size = sizeof(int);
1497
else if(type==ARMCI_LONG) size = sizeof(long);
1498
else if(type==ARMCI_LONG_LONG) size = sizeof(long long);
1499
else if(type==ARMCI_FLOAT) size = sizeof(float);
1500
else size = sizeof(double);
1501
ratio = sizeof(double)/size;
1503
/* ndo = elements handled in this pass (capped at BUF_SIZE*ratio); the loop ends when ndo is 0 */
while ((ndo = (n<=BUF_SIZE*ratio) ? n : BUF_SIZE*ratio)) {
1504
len = lenmes = ndo*size;
1506
/* wait until this process's own slot is marked EMPTY before reusing it */
armci_util_wait_int(&GOP_BUF(armci_me)->a.flag, EMPTY, 100);
1509
/* a process without children only places its data in its own slot */
if(left<0 && right<0) armci_copy(x,GOP_BUF(armci_me)->array,len);
1511
/* version oblivious to the order of data arrival */
1513
int need_left = left >-1;
1514
int need_right = right >-1;
1515
int from, first =1, maxspin=100, count=0;
1518
/* poll both children until each needed contribution has been consumed */
while(need_left || need_right){
1520
if(need_left && GOP_BUF(left)->a.flag == FULL){
1523
}else if(need_right && GOP_BUF(right)->a.flag == FULL) {
1530
if(armci_me == root) gop(type, ndo, op, x, b->array);
1533
gop2(type, ndo, op, GOP_BUF(armci_me)->array, b->array,x);
1535
gop(type, ndo, op, GOP_BUF(armci_me)->array, b->array);
1539
gop(type, ndo, op, GOP_BUF(armci_me)->array, b->array);
1541
/* hand the child's slot back by marking it empty */
SET_SHM_FLAG(&( b->a.flag),empty);
1542
/* nothing ready: spin up to maxspin polls, then yield the CPU and restart the count */
}else if((++count)<maxspin) armci_util_spin(count,_gop_buffer);
1543
else{cpu_yield();count =0; }
1548
armci_copy(x,GOP_BUF(armci_me)->array,len);
1550
/* this version requires a specific order of data arrival */
1552
while(GOP_BUF(left)->a.flag != FULL) cpu_yield();
1553
gop(type, ndo, op, GOP_BUF(armci_me)->array, GOP_BUF(left)->array);
1554
SET_SHM_FLAG(&( GOP_BUF(left)->a.flag),empty);
1557
while(GOP_BUF(right)->a.flag != FULL) cpu_yield();
1558
gop(type, ndo, op, GOP_BUF(armci_me)->array, GOP_BUF(right)->array);
1559
/* NOTE(review): the right child's flag is cleared with a plain store while
 * the left child's flag goes through SET_SHM_FLAG -- confirm that skipping
 * the macro here is intended */
GOP_BUF(right)->a.flag = EMPTY;
1563
if (armci_me != root ) {
1564
/* publish this process's slot to its parent */
SET_SHM_FLAG(&(GOP_BUF(armci_me)->a.flag),full);
1568
/* NOTE: this copy can be eliminated in a cluster configuration */
1569
armci_copy(GOP_BUF(armci_me)->array,x,len);
1577
/*
 * Inter-node reduction performed by node masters only: any process that is
 * not its node's master returns immediately.  For each pass of at most
 * BUF_SIZE*ratio elements the visible code receives chunks from the masters
 * of the nodes whose index is i XOR armci_clus_me (i = diff>>1, halved each
 * step, skipping indices outside 0..armci_nclus-1), folds them into x with
 * the typed *doop helper, and finally sends x to snd_proc with ARMCI_TAG.
 * NOTE(review): this listing is a sparse extract; the body of the
 * while(!(snd_proc & flag)) loop that advances flag/diff, the condition
 * guarding the final send and the per-pass updates of n and x are not
 * visible here.
 */
void _armci_msg_binomial_reduce(void *x, int n, char* op, int type){
1578
int root = armci_clus_info[0].master;
1579
int i,next_node,next;
1580
int size, ratio, ndo, lenmes,len;
1581
/* int my_rank,root_rank,next_rank; */
1582
if(work==NULL)_allocate_mem_for_work();
1583
if(armci_me!=armci_master)return;
1584
/* element size in bytes; any type not listed is treated as double */
if(type==ARMCI_INT) size = sizeof(int);
1585
else if(type==ARMCI_LONG) size = sizeof(long);
1586
else if(type==ARMCI_LONG_LONG) size = sizeof(long long);
1587
else if(type==ARMCI_FLOAT) size = sizeof(float);
1588
else size = sizeof(double);
1589
ratio = sizeof(double)/size;
1591
while ((ndo = (n<=BUF_SIZE*ratio) ? n : BUF_SIZE*ratio)) {
1592
/* snd_proc starts as this node's index and is tested bit by bit against flag */
int snd_proc=armci_clus_me,flag=1,diff=1;
1594
len = lenmes = ndo*size;
1596
while(!(snd_proc & flag)){
1600
/* destination: master of the node `diff` positions below this one */
snd_proc = armci_clus_info[armci_clus_me-diff].master;
1605
/*printf("\n%d: %d diff>>1 = %d\n",armci_me,Lp2,diff>>1);*/
1606
for(i=diff>>1;i>=1;i=i>>1){
1607
next=i^armci_clus_me;
1608
if(next>=0 && next<armci_nclus){
1609
next_node = armci_clus_info[next].master;
1610
armci_msg_rcv(ARMCI_TAG, lwork,len,&lenmes,next_node);
1611
if(type==ARMCI_INT) idoop(ndo, op, (int*)x, iwork);
1612
else if(type==ARMCI_LONG) ldoop(ndo, op, (long*)x, lwork);
1613
else if(type==ARMCI_LONG_LONG) lldoop(ndo, op,(long long*)x,llwork);
1614
else if(type==ARMCI_FLOAT) fdoop(ndo, op, (float*)x, fwork);
1615
else ddoop(ndo, op, (double*)x, work);
1616
/*printf("\n%d: recvd from %d \n",armci_me,next_node);*/
1619
/*if(armci_me!=root){
1620
printf("\n%d: send to %d \n",armci_me,snd_proc);
1624
armci_msg_snd(ARMCI_TAG,x,len,snd_proc);
1630
/*
 * Reduce n elements of x with operator op in two stages: first among the
 * processes of a node (armci_smp_reduce when _armci_gop_shmem is set,
 * armci_msg_reduce_scope(SCOPE_NODE, ...) otherwise), then between node
 * masters (_armci_msg_binomial_reduce or
 * armci_msg_reduce_scope(SCOPE_MASTERS, ...)).
 * NOTE(review): the else / preprocessor lines that select between the
 * alternatives of each stage are not visible in this extract.
 */
void armci_msg_reduce(void *x, int n, char* op, int type)
1632
if(DEBUG_)printf("%d reduce %d\n",armci_me, n);
1633
/* intra-node operation */
1636
if(_armci_gop_shmem)
1637
armci_smp_reduce(x, n, op, type);
1640
armci_msg_reduce_scope(SCOPE_NODE, x, n, op, type);
1642
/* inter-node operation between masters */
1646
_armci_msg_binomial_reduce(x,n,op,type);
1649
armci_msg_reduce_scope(SCOPE_MASTERS, x, n, op, type);
1654
/*
 * Global operation on n elements of x: the generic path is
 * armci_msg_reduce() followed by armci_msg_bcast() of size*n bytes from
 * `root`.  When BGML is defined, "+", "max" and "min" on some element types
 * are handed to BGTr_Allreduce instead; the remaining types still use
 * reduce + broadcast.
 * NOTE(review): this listing is a sparse extract; braces, the declarations
 * of size/root/dt/rop/origx, most dt/rop assignments and the closing of the
 * BGML conditional are not visible here.
 */
static void armci_msg_gop2(void *x, int n, char* op, int type)
1657
if(work==NULL)_allocate_mem_for_work();
1658
/* element size in bytes; any type not listed is treated as double */
if(type==ARMCI_INT) size = sizeof(int);
1659
else if(type==ARMCI_LONG) size = sizeof(long);
1660
else if(type==ARMCI_LONG_LONG) size = sizeof(long long);
1661
else if(type==ARMCI_FLOAT) size = sizeof(float);
1662
else size = sizeof(double);
1663
#ifdef BGML /*optimize what we can at the message layer */
1668
/* sum: int and double use BGTr_Allreduce, long / long long / float use reduce + bcast */
if(n>0 && (strncmp(op, "+", 1) == 0))
1671
if(type == ARMCI_INT)
1674
BGTr_Allreduce(origx, x, n, dt, rop, -1, 3);
1676
else if(type == ARMCI_LONG || type == ARMCI_LONG_LONG)
1678
armci_msg_reduce(x, n, op, type);
1679
armci_msg_bcast(x, size*n, root);
1680
/* dt=BGML_UNSIGNED_LONG; */
1681
/* BGTr_Allreduce(origx, x, n, dt, rop, -1, 3);*/
1683
else if(type == ARMCI_DOUBLE)
1686
BGTr_Allreduce(origx, x, n, dt, rop, -1, 3);
1688
else if(type == ARMCI_FLOAT)
1690
armci_msg_reduce(x, n, op, type);
1691
armci_msg_bcast(x, size*n, root);
1695
fprintf(stderr,"Unknown data type\n");
1700
/* max / min: every type except long long ends in BGTr_Allreduce */
else if(n>0 && ((strncmp(op, "max", 3) == 0) || (strncmp(op, "min", 3) ==0 )))
1702
if(strncmp(op, "max", 3) == 0)
1707
if(type == ARMCI_INT)
1709
else if(type == ARMCI_DOUBLE)
1711
else if(type == ARMCI_FLOAT)
1713
else if(type == ARMCI_LONG)
1714
dt=BGML_SIGNED_LONG;
1715
else if(type == ARMCI_LONG_LONG)
1717
armci_msg_reduce(x, n, op, type);
1718
armci_msg_bcast(x, size*n, root);
1722
fprintf(stderr,"Unknown data type\n");
1725
if(type != ARMCI_LONG_LONG)
1726
BGTr_Allreduce(origx, x, n, dt, rop, -1, 3);
1731
{ /* braces needed for the final else clause of the BGML branch */
1733
/* generic path: reduce, then broadcast the result from root */
armci_msg_reduce(x, n, op, type);
1734
armci_msg_bcast(x, size*n, root);
1739
/*
 * Selection step of the min/max "select" operation.  The first element of x
 * and of work is compared as the given type; `selected` is set when work
 * holds the better value (smaller for "min", larger otherwise), and in that
 * case n bytes are transferred with armci_copy(work, x, n).  On a tie
 * nothing is copied.
 * NOTE(review): the switch header, most case labels, the else lines and the
 * declaration of `selected` are not visible in this extract.
 */
static void armci_sel(int type, char *op, void *x, void* work, int n)
1744
if(strncmp(op,"min",3) == 0){
1745
if(*(int*)x > *(int*)work) selected=1;
1747
if(*(int*)x < *(int*)work) selected=1;
1750
if(strncmp(op,"min",3) == 0){
1751
if(*(long*)x > *(long*)work) selected=1;
1753
if(*(long*)x < *(long*)work) selected=1;
1755
case ARMCI_LONG_LONG:
1756
if(strncmp(op,"min",3) == 0){
1757
if(*(long long*)x > *(long long*)work) selected=1;
1759
if(*(long long*)x < *(long long*)work) selected=1;
1762
if(strncmp(op,"min",3) == 0){
1763
if(*(float*)x > *(float*)work) selected=1;
1765
if(*(float*)x < *(float*)work) selected=1;
1768
if(strncmp(op,"min",3) == 0){
1769
if(*(double*)x > *(double*)work) selected=1;
1771
if(*(double*)x < *(double*)work) selected=1;
1773
/* n is a byte count here: the whole record, not just the compared element, is replaced */
if(selected)armci_copy(work,x, n);
1778
/*\ global min/max selection ("op") that carries extra info along with the selected value
1780
/*
 * Min/max selection over the binary tree of the given scope.  x is an
 * n-byte record; op must start with "min" or "max", otherwise armci_die is
 * called.  A process with contribute == 0 has no value of its own: what it
 * receives from a child goes straight into x and it counts as contributing
 * from then on.  A contributing process receives into work and lets
 * armci_sel() decide which record stays in x.  Non-root processes send the
 * result (or 0 bytes if they still have nothing) to `up`; finally n bytes
 * of x are broadcast from root with armci_msg_bcast_scope().
 * NOTE(review): braces, the declarations of tag and len, and the guards
 * around the two receive steps are not visible in this extract.
 */
void armci_msg_sel_scope(int scope, void *x, int n, char* op, int type, int contribute)
1782
int root, up, left, right;
1784
int len, lenmes, min;
1786
min = (strncmp(op,"min",3) == 0);
1787
if(!min && (strncmp(op,"max",3) != 0))
1788
armci_die("armci_msg_gop_info: operation not supported ", 0);
1790
if(!x)armci_die("armci_msg_gop_info: NULL pointer", n);
1792
/* NOTE(review): INFO_BUF_SIZE is defined at the top of the file with
 * sizeof(BUF_SIZE), the size of an integer expression rather than
 * sizeof(double) -- confirm that this is the intended limit */
if(n>((int)INFO_BUF_SIZE))armci_die("armci_msg_gop_info: info too large",n);
1796
armci_msg_bintree(scope, &root, &up, &left, &right);
1800
/* receive into work if contributing otherwise into x */
1801
if(contribute)armci_msg_rcv(tag, work, len, &lenmes, left);
1802
else armci_msg_rcv(tag, x, len, &lenmes, left);
1805
if(contribute) armci_sel(type, op, x, work, n);
1806
else contribute =1; /* now we got data to pass */
1811
/* receive into work if contributing otherwise into x */
1812
if(contribute) armci_msg_rcv(tag, work, len, &lenmes, right);
1813
else armci_msg_rcv(tag, x, len, &lenmes, right);
1816
if(contribute) armci_sel(type, op, x, work, n);
1817
else contribute =1; /* now we got data to pass */
1821
if (armci_me != root){
1822
if(contribute) armci_msg_snd(tag, x, len, up);
1823
else armci_msg_snd(tag, x, 0, up); /* send 0 bytes */
1826
/* Now, root broadcasts the result down the binary tree */
1827
armci_msg_bcast_scope(scope, x, n, root);
1831
/*\ combine arrays of ints/longs/long longs/floats/doubles across all processes
1836
/* Global operation op on n ints: forwards to armci_msg_gop_scope with SCOPE_ALL and ARMCI_INT. */
void armci_msg_igop(int *x, int n, char* op)
1837
{ armci_msg_gop_scope(SCOPE_ALL,x, n, op, ARMCI_INT); }
1839
/* Global operation op on n longs: forwards to armci_msg_gop_scope with SCOPE_ALL and ARMCI_LONG. */
void armci_msg_lgop(long *x, int n, char* op)
1840
{ armci_msg_gop_scope(SCOPE_ALL,x, n, op, ARMCI_LONG); }
1842
/* Global operation op on n long longs: forwards to armci_msg_gop_scope with SCOPE_ALL and ARMCI_LONG_LONG. */
void armci_msg_llgop(long long *x, int n, char* op)
1843
{ armci_msg_gop_scope(SCOPE_ALL,x, n, op, ARMCI_LONG_LONG); }
1845
/* Global operation op on n doubles: forwards to armci_msg_gop_scope with SCOPE_ALL and ARMCI_DOUBLE. */
void armci_msg_dgop(double *x, int n, char* op)
1846
{ armci_msg_gop_scope(SCOPE_ALL,x, n, op, ARMCI_DOUBLE); }
1848
/* Global operation op on n floats: forwards to armci_msg_gop_scope with SCOPE_ALL and ARMCI_FLOAT. */
void armci_msg_fgop (float *x, int n, char* op)
1849
{ armci_msg_gop_scope(SCOPE_ALL,x, n, op, ARMCI_FLOAT);}
1852
/* Global operation op on n ints, routed through armci_msg_gop2 with ARMCI_INT. */
void armci_msg_igop(int *x, int n, char* op)
{
    armci_msg_gop2(x, n, op, ARMCI_INT);
}
1853
/* Global operation op on n longs, routed through armci_msg_gop2 with ARMCI_LONG. */
void armci_msg_lgop(long *x, int n, char* op)
{
    armci_msg_gop2(x, n, op, ARMCI_LONG);
}
1854
/* Global operation op on n long longs, routed through armci_msg_gop2 with ARMCI_LONG_LONG. */
void armci_msg_llgop(long long *x, int n, char* op)
{
    armci_msg_gop2(x, n, op, ARMCI_LONG_LONG);
}
1855
/* Global operation op on n floats, routed through armci_msg_gop2 with ARMCI_FLOAT. */
void armci_msg_fgop(float *x, int n, char* op)
{
    armci_msg_gop2(x, n, op, ARMCI_FLOAT);
}
1856
/* Global operation op on n doubles, routed through armci_msg_gop2 with ARMCI_DOUBLE. */
void armci_msg_dgop(double *x, int n, char* op)
{
    armci_msg_gop2(x, n, op, ARMCI_DOUBLE);
}
1860
/*\ combine arrays of ints/longs/long longs/floats/doubles within the same cluster node
1862
/* Operation op on n ints restricted to SCOPE_NODE: forwards to armci_msg_gop_scope with ARMCI_INT. */
void armci_msg_clus_igop(int *x, int n, char* op)
1863
{ armci_msg_gop_scope(SCOPE_NODE,x, n, op, ARMCI_INT); }
1865
/* Operation op on n longs restricted to SCOPE_NODE: forwards to armci_msg_gop_scope with ARMCI_LONG. */
void armci_msg_clus_lgop(long *x, int n, char* op)
1866
{ armci_msg_gop_scope(SCOPE_NODE,x, n, op, ARMCI_LONG); }
1868
/* Operation op on n long longs restricted to SCOPE_NODE: forwards to armci_msg_gop_scope with ARMCI_LONG_LONG. */
void armci_msg_clus_llgop(long long *x, int n, char* op)
1869
{ armci_msg_gop_scope(SCOPE_NODE,x, n, op, ARMCI_LONG_LONG); }
1871
/* Operation op on n floats restricted to SCOPE_NODE: forwards to armci_msg_gop_scope with ARMCI_FLOAT. */
void armci_msg_clus_fgop(float *x, int n, char* op)
1872
{ armci_msg_gop_scope(SCOPE_NODE,x, n, op, ARMCI_FLOAT); }
1874
/* Operation op on n doubles restricted to SCOPE_NODE: forwards to armci_msg_gop_scope with ARMCI_DOUBLE.
 * NOTE(review): the name carries a _scope suffix that the sibling armci_msg_clus_*gop wrappers do not have. */
void armci_msg_clus_dgop_scope(double *x, int n, char* op)
1875
{ armci_msg_gop_scope(SCOPE_NODE,x, n, op, ARMCI_DOUBLE); }
1879
/*
 * Combine the n-entry pointer array ptr_ar across processes with a "+"
 * global operation.  Two alternative calls are visible: a long sum over n
 * entries, and an int sum over n*ratio entries where ratio is
 * sizeof(void*)/sizeof(int).
 * NOTE(review): the lines selecting between the two calls are not visible in
 * this extract.  Summing yields the original addresses only if each slot is
 * non-zero on at most one process -- presumably callers zero every slot but
 * their own; confirm.  The long variant also assumes a pointer fits in a
 * long -- confirm for the targeted platforms.
 */
void armci_exchange_address(void *ptr_ar[], int n)
1881
int ratio = sizeof(void*)/sizeof(int);
1883
armci_msg_lgop((long*)ptr_ar, n, "+");
1885
if(DEBUG_)printf("%d: exchanging %ld ratio=%d\n",armci_me,(long)ptr_ar[armci_me],ratio);
1887
armci_msg_gop2(ptr_ar, n*ratio, "+",ARMCI_INT);
1891
* ********************* Begin ARMCI Groups Code ****************************
1892
* NOTE: This part is MPI dependent (i.e. ifdef MPI)
1895
/*
 * Return the MPI communicator associated with an ARMCI group.  Two bodies
 * are visible: one returns MPI_COMM_NULL, the other looks up the internal
 * group record and returns its icomm field cast to MPI_Comm.
 * NOTE(review): the preprocessor lines selecting between the two bodies are
 * not visible in this extract.
 */
MPI_Comm armci_group_comm(ARMCI_Group *group)
1898
return MPI_COMM_NULL;
1900
ARMCI_iGroup *igroup = armci_get_igroup_from_group(group);
1901
return (MPI_Comm)igroup->icomm;
1905
/*
 * Barrier over the processes of an ARMCI group.  Two alternatives are
 * visible: a one-element "+" integer global operation on `val` over the
 * group, and MPI_Barrier on the group's communicator (igroup->icomm).
 * NOTE(review): the declaration of `val` and the lines selecting between
 * the two alternatives are not visible in this extract.
 */
void parmci_msg_group_barrier(ARMCI_Group *group)
1907
ARMCI_iGroup *igroup = armci_get_igroup_from_group(group);
1912
armci_msg_group_igop(&val, 1, "+", group);
1915
MPI_Barrier((MPI_Comm)(igroup->icomm));
1920
extern void ARMCI_Bcast_(void *buffer, int len, int root, ARMCI_Group *group);
1922
extern void ARMCI_Bcast_(void *buffer, int len, int root, ARMCI_Comm comm);
1924
/*
 * Broadcast len bytes of buf among grp_clus_nproc processes whose ranks in
 * mastergroup are grp_master, grp_master+1, ...  A temporary sub-group is
 * built from that rank list, ARMCI_Bcast_ is called with root 0, and the
 * temporary objects are released.  Two variants are visible: one based on
 * ARMCI_Group_create_child / ARMCI_Group_free, the other on
 * MPI_Comm_group / MPI_Group_incl / MPI_Comm_create with MPI_Comm_free and
 * MPI_Group_free afterwards.
 * NOTE(review): the declarations of `group` and `comm`, the tail of the
 * MPI_Comm_create call and the lines selecting between the variants are
 * not visible in this extract.
 */
void armci_grp_clus_brdcst(void *buf, int len, int grp_master,
1925
int grp_clus_nproc, ARMCI_Group *mastergroup) {
1926
ARMCI_iGroup *igroup = armci_get_igroup_from_group(mastergroup);
1927
int i, *pid_list, root=0;
1931
MPI_Group group_world;
1936
/* create a communicator for the processes within a node */
1937
/* NOTE(review): the malloc result is used without a NULL check, and no
 * free(pid_list) is visible in this extract -- confirm it is released */
pid_list = (int *)malloc(grp_clus_nproc*sizeof(int));
1938
for(i=0; i<grp_clus_nproc; i++) pid_list[i] = grp_master+i;
1942
ARMCI_Group_create_child(grp_clus_nproc, pid_list, &group, mastergroup);
1943
ARMCI_Bcast_(buf, len, root, &group);
1944
ARMCI_Group_free(&group);
1947
MPI_Comm_group((MPI_Comm)(igroup->icomm), &group_world);
1948
MPI_Group_incl(group_world, grp_clus_nproc, pid_list, &group);
1950
MPI_Comm_create((MPI_Comm)(igroup->icomm), (MPI_Group)group,
1953
/* Broadcast within the node (for this sub group of processes) */
1954
ARMCI_Bcast_(buf, len, root, comm);
1957
MPI_Comm_free(&comm); /* free the temporary communicator */
1958
MPI_Group_free(&group);
1963
/* to avoid warning */
1964
extern int ARMCI_Absolute_id(ARMCI_Group *group,int group_rank);
1966
/*
 * Compute this process's position in a binary tree for a group and return
 * root, parent and children through Root, Up, Left and Right.  Positions are
 * first computed as group ranks (children of index i are 2i+1 and 2i+2,
 * parent is (i-1)/2, all offset by root) and then translated with
 * ARMCI_Absolute_id(); -1 is passed through unchanged and means "none".
 *   SCOPE_NODE    : tree over the processes of this process's node.
 *   SCOPE_MASTERS : tree over node masters; non-masters get -1 for
 *                   up/left/right.
 *   otherwise     : tree indexed by grp_me - root.
 * NOTE(review): the end of the parameter list, several else/brace lines and
 * the root/nproc assignments of the last branch are not visible in this
 * extract.
 */
void armci_msg_group_bintree(int scope, int* Root, int *Up, int *Left, int *Right,
1969
int root, up, left, right, index, nproc,grp_clus_me,grp_me,grp_master,grp_nproc;
1970
armci_grp_attr_t *grp_attr=ARMCI_Group_getattr(group);
1971
grp_me = grp_attr->grp_me;
1972
grp_clus_me = grp_attr->grp_clus_me;
1973
grp_master = grp_attr->grp_clus_info[grp_clus_me].master;
1974
ARMCI_Group_size(group, &grp_nproc);
1975
if(scope == SCOPE_NODE){
1976
root = grp_attr->grp_clus_info[grp_clus_me].master;
1977
nproc = grp_attr->grp_clus_info[grp_clus_me].nslave;
1978
index = grp_me - root;
1979
/* NOTE(review): for index 0, (index-1)/2 truncates to 0 in C99, so up ends
 * up equal to root rather than -1; the senders visible in this file also
 * test armci_me against root -- confirm callers rely on that */
up = (index-1)/2 + root; if( up < root) up = -1;
1980
left = 2*index + 1 + root; if(left >= root+nproc) left = -1;
1981
right = 2*index + 2 + root; if(right >= root+nproc)right = -1;
1982
}else if(scope ==SCOPE_MASTERS){
1983
root = grp_attr->grp_clus_info[0].master;
1984
nproc = grp_attr->grp_nclus;
1985
if(grp_me != grp_master){up = -1; left = -1; right = -1; }
1987
/* NOTE(review): index combines a node index (grp_clus_me) with a process
 * rank (root), and the results index grp_clus_info; this is consistent only
 * when root is 0 -- confirm */
index = grp_clus_me - root;
1988
up = (index-1)/2 + root;
1989
up = ( up < root)? -1: grp_attr->grp_clus_info[up].master;
1990
left = 2*index + 1 + root;
1991
left =( left >= root+nproc)?-1:grp_attr->grp_clus_info[left].master;
1992
right= 2*index + 2 + root;
1993
right=( right>=root+nproc)?-1:grp_attr->grp_clus_info[right].master;
1998
index = grp_me - root;
1999
up = (index-1)/2 + root; if( up < root) up = -1;
2000
left = 2*index + 1 + root; if(left >= root+nproc) left = -1;
2001
right = 2*index + 2 + root; if(right >= root+nproc)right = -1;
2004
/* translate group ranks to absolute ids, keeping -1 as the "none" marker */
*Up = (up==-1)?up:ARMCI_Absolute_id(group,up);
2005
*Left = (left==-1)?left:ARMCI_Absolute_id(group,left);
2006
*Right = (right==-1)?right:ARMCI_Absolute_id(group,right);
2007
*Root = (root==-1)?root:ARMCI_Absolute_id(group,root);
2010
/*
 * Broadcast len bytes of buf from process `root` over the binary tree that
 * armci_msg_group_bintree() builds for the group.  The tree has its own
 * root (Root): the visible code has `root` send buf to Root and Root receive
 * it, after which every process other than Root that has a parent receives
 * from `up` and then forwards to its left and right children when they exist.
 * NOTE(review): the end of the parameter list, the declaration of grp_me and
 * the condition guarding the root-to-Root transfer are not visible in this
 * extract.  The group is passed to armci_get_igroup_from_group() before the
 * !group test, and no return is visible after the armci_msg_bcast_scope()
 * fallback -- confirm against the full file.
 */
void armci_msg_group_bcast_scope(int scope, void *buf, int len, int root,
2013
int up, left, right, Root;
2015
ARMCI_iGroup *igroup = armci_get_igroup_from_group(group);
2017
if(!buf)armci_die("armci_msg_bcast: NULL pointer", len);
2019
if(!group)armci_msg_bcast_scope(scope,buf,len,root);
2020
else grp_me = igroup->grp_attr.grp_me;
2021
armci_msg_group_bintree(scope, &Root, &up, &left, &right,group);
2024
if(armci_me == root) armci_msg_snd(ARMCI_TAG, buf,len, Root);
2025
if(armci_me ==Root) armci_msg_rcv(ARMCI_TAG, buf, len, NULL, root);
2028
/* printf("%d: scope=%d left=%d right=%d up=%d\n",armci_me, scope,
2031
if(armci_me != Root && up!=-1) armci_msg_rcv(ARMCI_TAG, buf, len, NULL, up);
2032
if (left > -1) armci_msg_snd(ARMCI_TAG, buf, len, left);
2033
if (right > -1) armci_msg_snd(ARMCI_TAG, buf, len, right);
2037
/*
 * Group version of the global operation: reduce n elements of x with
 * operator op over the tree from armci_msg_group_bintree(), in passes of at
 * most BUF_SIZE*ratio elements (receive from each child into the work
 * buffer, combine into x, send to the parent unless this process is the
 * root), then broadcast the result with armci_msg_group_bcast_scope().
 * NOTE(review): the return type, the end of the parameter list, the
 * declaration of origx, the guards around the receives, the per-pass
 * updates of n and x and the value of len used for the broadcast are not
 * visible in this extract.  The group is passed to
 * armci_get_igroup_from_group() before the !group test, and no return is
 * visible after the armci_msg_gop_scope() fallback -- confirm against the
 * full file.
 */
armci_msg_group_gop_scope(int scope, void *x, int n, char* op, int type,
2040
int root, up, left, right, size;
2041
int tag=ARMCI_TAG,grp_me;
2042
int ndo, len, lenmes, orign =n, ratio;
2044
ARMCI_iGroup *igroup = armci_get_igroup_from_group(group);
2046
if(!group)armci_msg_gop_scope(scope,x,n,op,type);
2047
else grp_me = igroup->grp_attr.grp_me;
2048
if(!x)armci_die("armci_msg_gop: NULL pointer", n);
2049
if(work==NULL)_allocate_mem_for_work();
2051
armci_msg_group_bintree(scope, &root, &up, &left, &right,group);
2053
/* element size in bytes; any type not listed is treated as double */
if(type==ARMCI_INT) size = sizeof(int);
2054
else if(type==ARMCI_LONG) size = sizeof(long);
2055
else if(type==ARMCI_LONG_LONG) size = sizeof(long long);
2056
else if(type==ARMCI_FLOAT) size = sizeof(float);
2057
else size = sizeof(double);
2059
ratio = sizeof(double)/size;
2061
/* ndo = elements handled in this pass (capped at BUF_SIZE*ratio); the loop ends when ndo is 0 */
while ((ndo = (n<=BUF_SIZE*ratio) ? n : BUF_SIZE*ratio)) {
2062
len = lenmes = ndo*size;
2065
/* left child: receive its chunk, then combine it into x */
armci_msg_rcv(tag, lwork, len, &lenmes, left);
2066
if(type==ARMCI_INT) idoop(ndo, op, (int*)x, iwork);
2067
else if(type==ARMCI_LONG) ldoop(ndo, op, (long*)x, lwork);
2068
else if(type==ARMCI_LONG_LONG) lldoop(ndo, op, (long long*)x,llwork);
2069
else if(type==ARMCI_FLOAT) fdoop(ndo, op, (float*)x, fwork);
2070
else ddoop(ndo, op, (double*)x, work);
2074
/* right child: same receive-and-combine step */
armci_msg_rcv(tag, lwork, len, &lenmes, right);
2075
if(type==ARMCI_INT) idoop(ndo, op, (int*)x, iwork);
2076
else if(type==ARMCI_LONG) ldoop(ndo, op, (long*)x, lwork);
2077
else if(type==ARMCI_LONG_LONG) lldoop(ndo, op,(long long*)x, llwork);
2078
else if(type==ARMCI_FLOAT) fdoop(ndo, op, (float*)x, fwork);
2079
else ddoop(ndo, op, (double*)x, work);
2081
if (armci_me != root && up!=-1) armci_msg_snd(tag, x, len, up);
2087
/* Now, root broadcasts the result down the binary tree */
2089
armci_msg_group_bcast_scope(scope, origx, len, root,group);
2092
/*
 * Group version of the address exchange: the n-entry pointer array is
 * combined across the group with a "+" integer global operation over
 * n*ratio ints, where ratio is sizeof(void*)/sizeof(int).
 * NOTE(review): summing yields the original addresses only if each slot is
 * non-zero on at most one process -- presumably callers zero every slot but
 * their own; confirm.  The condition guarding the printf is not visible in
 * this extract.
 */
void armci_exchange_address_grp(void *ptr_arr[], int n, ARMCI_Group *group)
2094
int ratio = sizeof(void*)/sizeof(int);
2095
ARMCI_iGroup *igroup = armci_get_igroup_from_group(group);
2096
int grp_me = igroup->grp_attr.grp_me;
2098
printf("%d: exchanging %ld ratio=%d\n",armci_me,
2099
(long)ptr_arr[grp_me], ratio);
2101
armci_msg_group_gop_scope(SCOPE_ALL, ptr_arr, n*ratio,
2102
"+", ARMCI_INT, group);
2105
/*\ combine arrays of ints/longs/long longs/floats/doubles across all processes of a group
2107
/* Group global operation op on n ints: forwards to armci_msg_group_gop_scope with SCOPE_ALL and ARMCI_INT. */
void armci_msg_group_igop(int *x, int n, char* op, ARMCI_Group *group)
2108
{ armci_msg_group_gop_scope(SCOPE_ALL,x, n, op, ARMCI_INT,group); }
2110
/* Group global operation op on n longs: forwards to armci_msg_group_gop_scope with SCOPE_ALL and ARMCI_LONG. */
void armci_msg_group_lgop(long *x, int n, char* op,ARMCI_Group *group)
2111
{ armci_msg_group_gop_scope(SCOPE_ALL,x, n, op, ARMCI_LONG,group); }
2113
/* Group global operation op on n long longs: forwards to armci_msg_group_gop_scope with SCOPE_ALL and ARMCI_LONG_LONG. */
void armci_msg_group_llgop(long long *x, int n, char* op,ARMCI_Group *group)
2114
{ armci_msg_group_gop_scope(SCOPE_ALL,x, n, op, ARMCI_LONG_LONG,group); }
2116
/* Group global operation op on n floats: forwards to armci_msg_group_gop_scope with SCOPE_ALL and ARMCI_FLOAT. */
void armci_msg_group_fgop(float *x, int n, char* op,ARMCI_Group *group)
2117
{ armci_msg_group_gop_scope(SCOPE_ALL,x, n, op, ARMCI_FLOAT,group); }
2119
/* Group global operation op on n doubles: forwards to armci_msg_group_gop_scope with SCOPE_ALL and ARMCI_DOUBLE. */
void armci_msg_group_dgop(double *x, int n, char* op,ARMCI_Group *group)
2120
{ armci_msg_group_gop_scope(SCOPE_ALL,x, n, op, ARMCI_DOUBLE,group); }
2122
# endif /* ifdef MPI */
2123
/*********************** End ARMCI Groups Code ****************************/
2127
/* set the group name if using PVM */
2128
void ARMCI_PVM_Init(char *mpgroup)
2131
mp_group_name = (char *)NULL;
2133
if(mpgroup != NULL) {
2134
/* free(mp_group_name); */
2135
mp_group_name = (char *)malloc(25 * sizeof(char));
2136
strcpy(mp_group_name, mpgroup);