1
/* $Id: test.c,v 1.43.6.6 2007-08-30 22:59:27 manoj Exp $ */
8
# define sleep(x) Sleep(1000*(x))
13
/* ARMCI is impartial to message-passing libs - we handle them with MP macros */
17
# define MPGROUP (char *)NULL
18
# define MP_INIT(arc,argv)
20
# define MPGROUP "mp_working_group"
21
# define MP_INIT(arc,argv) pvm_init(arc, argv)
23
# define MP_FINALIZE() pvm_exit()
24
# define MP_BARRIER() pvm_barrier(MPGROUP,-1)
25
# define MP_MYID(pid) *(pid) = pvm_getinst(MPGROUP,pvm_mytid())
26
# define MP_PROCS(pproc) *(pproc) = (int)pvm_gsize(MPGROUP)
27
void pvm_init(int argc, char *argv[]);
31
# define MP_BARRIER() SYNCH_(&tcg_tag)
32
/* Start the message-passing layer via PBEGIN_. The first parameter used to be
 * spelled "arc" while the body referenced "argc", so the caller's argument was
 * ignored and a variable named argc had to exist at the call site. */
# define MP_INIT(argc,argv) PBEGIN_((argc),(argv))
33
# define MP_FINALIZE() PEND_()
34
# define MP_MYID(pid) *(pid) = (int)NODEID_()
35
# define MP_PROCS(pproc) *(pproc) = (int)NNODES_()
38
/* Barrier through bgml_barrier(3). No trailing semicolon in the expansion:
 * the caller supplies it, so "if (c) MP_BARRIER(); else ..." stays valid. */
# define MP_BARRIER() bgml_barrier(3)
39
# define MP_FINALIZE()
40
# define MP_INIT(arc, argv)
41
/* Store BGML_Messager_rank() through pid. The expansion is a single
 * parenthesized expression without a trailing semicolon, so it can be used
 * safely as the body of an if/else. */
# define MP_MYID(pid) (*(pid)=BGML_Messager_rank())
42
/* Store BGML_Messager_size() through pproc. The expansion is a single
 * parenthesized expression without a trailing semicolon, so it can be used
 * safely as the body of an if/else. */
# define MP_PROCS(pproc) (*(pproc)=BGML_Messager_size())
45
# define MP_BARRIER() MPI_Barrier(MPI_COMM_WORLD)
46
# define MP_FINALIZE() MPI_Finalize()
48
# define MP_INIT(arc,argv) \
49
int desired = MPI_THREAD_MULTIPLE; \
51
printf("using MPI_Init_thread\n"); \
52
MPI_Init_thread(&argc, &argv, desired, &provided); \
53
if ( provided == MPI_THREAD_MULTIPLE ) printf("provided = MPI_THREAD_MULTIPLE\n"); \
54
else if ( provided == MPI_THREAD_SERIALIZED ) printf("provided = MPI_THREAD_SERIALIZED\n"); \
55
else if ( provided == MPI_THREAD_FUNNELED ) printf("provided = MPI_THREAD_FUNNELED\n"); \
56
else if ( provided == MPI_THREAD_SINGLE ) printf("provided = MPI_THREAD_SINGLE\n");
58
/* Plain MPI_Init start-up. The first parameter used to be spelled "arc" while
 * the body referenced "argc", so the caller's argument was ignored and a
 * variable named argc had to exist at the call site. Both arguments must be
 * lvalues because their addresses are taken. */
# define MP_INIT(argc,argv) MPI_Init(&(argc),&(argv))
60
# define MP_MYID(pid) MPI_Comm_rank(MPI_COMM_WORLD, (pid))
61
/* Query the size of MPI_COMM_WORLD into *pproc. No trailing semicolon in the
 * expansion (matching MP_MYID): the caller supplies it, so the macro remains
 * usable as the body of an if/else. */
# define MP_PROCS(pproc) MPI_Comm_size(MPI_COMM_WORLD, (pproc))
69
/* Solaris has shared memory shortages in the default system configuration */
73
#elif defined(__alpha__)
87
#define EDIM1 (DIM1+OFF)
88
#define EDIM2 (DIM2+OFF)
89
#define EDIM3 (DIM3+OFF)
90
#define EDIM4 (DIM4+OFF)
91
#define EDIM5 (DIM5+OFF)
92
#define EDIM6 (DIM6+OFF)
93
#define EDIM7 (DIM7+OFF)
97
#define MAX_DIM_VAL 50
111
/***************************** macros ************************/
112
/* Copy `bytes` bytes from src to dst; note the source-first argument order,
 * the reverse of memcpy. Evaluates to memcpy's return value. */
#define COPY(src,dst,bytes) (memcpy((dst), (src), (bytes)))
113
/* Larger of a and b (a wins ties). Arguments may be evaluated twice. */
#define ARMCI_MAX(a,b) ((a) >= (b) ? (a) : (b))
114
/* Smaller of a and b (a wins ties). Arguments may be evaluated twice. */
#define ARMCI_MIN(a,b) ((a) <= (b) ? (a) : (b))
115
/* Absolute value of a. The argument may be evaluated twice. */
#define ARMCI_ABS(a) ((a) < 0 ? -(a) : (a))
117
/***************************** global data *******************/
119
void* work[MAXPROC]; /* work array for propagating addresses */
124
void pvm_init(int argc, char *argv[])
126
int mytid, mygid, ctid[MAXPROC];
130
if((argc != 2) && (argc != 1)) goto usage;
131
if(argc == 1) np = 1;
133
if((np = atoi(argv[1])) < 1) goto usage;
134
if(np > MAXPROC) goto usage;
136
mygid = pvm_joingroup(MPGROUP);
140
i = pvm_spawn(argv[0], argv+1, 0, "", np-1, ctid);
142
while(pvm_gsize(MPGROUP) < np) sleep(1);
145
pvm_barrier(MPGROUP, np);
147
printf("PVM initialization done!\n");
152
fprintf(stderr, "usage: %s <nproc>\n", argv[0]);
158
/*\ generate random range for a section of multidimensional array
160
void get_range(int ndim, int dims[], int lo[], int hi[])
163
for(dim=0; dim <ndim;dim++){
165
toss1 = rand()%dims[dim];
166
toss2 = rand()%dims[dim];
179
/*\ generates a new random range similar to the input range for an array with specified dimensions
181
void new_range(int ndim, int dims[], int lo[], int hi[],int new_lo[], int new_hi[])
184
for(dim=0; dim <ndim;dim++){
186
int diff = hi[dim] -lo[dim]+1;
187
assert(diff <= dims[dim]);
188
range = dims[dim]-diff;
189
toss = (range > 0)? rand()%range : lo[dim];
191
new_hi[dim] = toss + diff -1;
192
assert(new_hi[dim] < dims[dim]);
193
assert(diff == (new_hi[dim] -new_lo[dim]+1));
201
/*\ print range of ndim dimensional array with two strings before and after
203
void print_range(char *pre,int ndim, int lo[], int hi[], char* post)
209
printf("%d:%d",lo[i],hi[i]);
210
if(i==ndim-1)printf("] %s",post);
215
/*\ print subscript of ndim dimensional array with two strings before and after
217
void print_subscript(char *pre,int ndim, int subscript[], char* post)
223
printf("%d",subscript[i]);
224
if(i==ndim-1)printf("] %s",post);
230
/*\ print a section of a 2-D array of doubles
232
void print_2D_double(double *a, int ld, int *lo, int *hi)
235
for(i=lo[0];i<=hi[0];i++){
236
for(j=lo[1];j<=hi[1];j++) printf("%13f ",a[ld*j+i]);
242
/*\ initialize array: a[i,j,k,..]=i+100*j+10000*k+ ...
244
void init(double *a, int ndim, int elems, int dims[])
249
for(i=0; i<elems; i++){
253
for(dim = 0; dim < ndim; dim++){
254
idx[dim] = Index%dims[dim];
259
for(dim=0; dim< ndim;dim++){
260
val += field*idx[dim];
264
/* printf("(%d,%d,%d)=%6.0f",idx[0],idx[1],idx[2],val); */
269
/*\ compute Index from subscript
270
* assume that first subscript component changes first
272
int Index(int ndim, int subscript[], int dims[])
274
int idx = 0, i, factor=1;
276
idx += subscript[i]*factor;
283
void update_subscript(int ndim, int subscript[], int lo[], int hi[], int dims[])
287
if(subscript[i] < hi[i]) { subscript[i]++; return; }
288
subscript[i] = lo[i];
294
void compare_patches(double eps, int ndim, double *patch1, int lo1[], int hi1[],
295
int dims1[],double *patch2, int lo2[], int hi2[],
300
int subscr1[MAXDIMS], subscr2[MAXDIMS];
303
for(i=0;i<ndim;i++){ /* count # of elements & verify consistency of both patches */
304
int diff = hi1[i]-lo1[i];
305
assert(diff == (hi2[i]-lo2[i]));
306
assert(diff < dims1[i]);
307
assert(diff < dims2[i]);
314
/* compare element values in both patches */
315
for(j=0; j< elems; j++){
316
int idx1, idx2, offset1, offset2;
318
idx1 = Index(ndim, subscr1, dims1); /* calculate element Index from a subscript */
319
idx2 = Index(ndim, subscr2, dims2);
329
diff = patch1[idx1] - patch2[idx2];
330
max = ARMCI_MAX(ARMCI_ABS(patch1[idx1]),ARMCI_ABS(patch2[idx2]));
331
if(max == 0. || max <eps) max = 1.;
333
if(eps < ARMCI_ABS(diff)/max){
335
sprintf(msg,"(proc=%d):%f",me,patch1[idx1]);
336
print_subscript("ERROR: a",ndim,subscr1,msg);
337
sprintf(msg,"%f\n",patch2[idx2]);
338
print_subscript(" b",ndim,subscr2,msg);
341
ARMCI_Error("Bailing out",0);
344
{ /* update subscript for the patches */
345
update_subscript(ndim, subscr1, lo1,hi1, dims1);
346
update_subscript(ndim, subscr2, lo2,hi2, dims2);
352
/* make sure we reached upper limit */
353
/*for(i=0;i<ndim;i++){
354
assert(subscr1[i]==hi1[i]);
355
assert(subscr2[i]==hi2[i]);
360
void scale_patch(double alpha, int ndim, double *patch1, int lo1[], int hi1[], int dims1[])
363
int subscr1[MAXDIMS];
365
for(i=0;i<ndim;i++){ /* count # of elements in patch */
366
int diff = hi1[i]-lo1[i];
367
assert(diff < dims1[i]);
372
/* scale element values in the patch */
373
for(j=0; j< elems; j++){
376
idx1 = Index(ndim, subscr1, dims1); /* calculate element Index from a subscript */
383
patch1[idx1] *= alpha;
384
update_subscript(ndim, subscr1, lo1,hi1, dims1);
389
/* #define NEWMALLOC */
391
armci_meminfo_t meminfo[MMAX][MAXPROC];
395
void create_array(void *a[], int elem_size, int ndim, int dims[])
397
int bytes=elem_size, i, rc;
399
assert(ndim<=MAXDIMS);
400
for(i=0;i<ndim;i++)bytes*=dims[i];
403
if(g_idx>=100) ARMCI_Error("increase MMAX", g_idx);
404
ARMCI_Memget(bytes, &meminfo[g_idx][me], 0);
406
for(i=0; i<nproc; i++)
407
armci_msg_brdcst(&meminfo[g_idx][i], sizeof(armci_meminfo_t), i);
409
for(i=0; i<nproc; i++)
410
a[i] = ARMCI_Memat(&meminfo[g_idx][i], 0);
414
rc = ARMCI_Malloc(a, bytes);
421
void destroy_array(void *ptr[])
425
assert(!ARMCI_Free(ptr[me]));
430
int loA[MAXDIMS], hiA[MAXDIMS];
431
int dimsA[MAXDIMS]={DIM1,DIM2,DIM3,DIM4,DIM5,DIM6, DIM7};
432
int loB[MAXDIMS], hiB[MAXDIMS];
433
int dimsB[MAXDIMS]={EDIM1,EDIM2,EDIM3,EDIM4,EDIM5,EDIM6,EDIM7};
435
int strideA[MAXDIMS], strideB[MAXDIMS];
436
int loC[MAXDIMS], hiC[MAXDIMS];
437
int idx[MAXDIMS]={0,0,0,0,0,0,0};
440
void test_dim(int ndim)
444
/* double a[DIM4][DIM3][DIM2][DIM1], b[EDIM4][EDIM3][EDIM2][EDIM1];*/
449
strideA[0]=sizeof(double);
450
strideB[0]=sizeof(double);
452
strideA[i] *= dimsA[i];
453
strideB[i] *= dimsB[i];
455
strideA[i+1] = strideA[i];
456
strideB[i+1] = strideB[i];
461
/* create shared and local arrays */
462
create_array(b, sizeof(double),ndim,dimsB);
463
a = malloc(sizeof(double)*elems);
465
c = malloc(sizeof(double)*elems);
468
init(a, ndim, elems, dimsA);
471
printf("--------array[%d",dimsA[0]);
472
for(dim=1;dim<ndim;dim++)printf(",%d",dimsA[dim]);
473
printf("]--------\n");
480
int idx1, idx2, idx3;
481
get_range(ndim, dimsA, loA, hiA);
482
new_range(ndim, dimsB, loA, hiA, loB, hiB);
483
new_range(ndim, dimsA, loA, hiA, loC, hiC);
488
print_range("local",ndim,loA, hiA,"-> ");
489
print_range("remote",ndim,loB, hiB,"-> ");
490
print_range("local",ndim,loC, hiC,"\n");
493
idx1 = Index(ndim, loA, dimsA);
494
idx2 = Index(ndim, loB, dimsB);
495
idx3 = Index(ndim, loC, dimsA);
497
for(j=0;j<ndim;j++)count[j]=hiA[j]-loA[j]+1;
499
count[0] *= sizeof(double); /* convert range to bytes at stride level zero */
501
(void)ARMCI_PutS((double*)a + idx1, strideA, (double*)b[proc] + idx2, strideB, count, ndim-1, proc);
505
/* printf("%d: a=(%x,%f) b=(%x,%f)\n",me,idx1 + (double*)a,*(idx1 + (double*)a),idx2 + (double*)b,*(idx2 + (double*)b));*/
509
/* note that we do not need ARMCI_Fence here since
510
* consecutive operations targeting the same process are ordered */
511
(void)ARMCI_GetS((double*)b[proc] + idx2, strideB, (double*)c + idx3, strideA, count, ndim-1, proc);
513
compare_patches(0., ndim, (double*)a+idx1, loA, hiA, dimsA, (double*)c+idx3, loC, hiC, dimsA);
523
int nloA[MAXDIMS+1][MAXDIMS], nhiA[MAXDIMS+1][MAXDIMS];
524
int nloB[MAXDIMS+1][MAXDIMS], nhiB[MAXDIMS+1][MAXDIMS];
525
int nloC[MAXDIMS+1][MAXDIMS], nhiC[MAXDIMS+1][MAXDIMS];
527
int get_next_RRproc(int initialize,int ndim){
532
if((nproc%2)!=0)distance++;
533
if(nproc==1)distance=0;
536
/*send it to a different process everytime*/
537
proc=(me<=((nproc%2==0)?((nproc/2)-1):(nproc/2)))?(me+distance):(me-distance);
538
if((nproc%2)!=0 && me==(nproc/2))proc=me;
542
if((me+distance)>=nproc){
544
if((nproc%2)!=0)distance++;
550
if((me-distance)>=(nproc/2)){
552
if((nproc%2)!=0)distance++;
553
distance=distance+(me-distance);
556
if(ndim!=1 && MAXDIMS>nproc && (ndim%(nproc/2)==0)){
558
if((nproc%2)!=0)distance++;
566
int elems=1,elems1=1;
567
int i,j, proc,ndim,rc;
568
void *b[MAXDIMS+1][MAXPROC];
569
void *a[MAXDIMS+1], *c[MAXDIMS+1];
570
armci_hdl_t hdl_put[MAXDIMS+1],hdl_get[MAXDIMS+1];
571
int idx1=0, idx2=0, idx3=0;
572
/* create shared and local arrays */
573
for(ndim=1;ndim<=MAXDIMS;ndim++){
574
elems1*= dimsB[ndim-1];
575
elems *= dimsA[ndim-1];
576
rc = ARMCI_Malloc(b[ndim], sizeof(double)*elems1);
577
assert(rc==0);assert(b[ndim][me]);
578
a[ndim] = malloc(sizeof(double)*elems);
580
c[ndim] = malloc(sizeof(double)*elems);
582
init(a[ndim], ndim, elems, dimsA);
583
ARMCI_INIT_HANDLE(hdl_put+ndim);
584
ARMCI_INIT_HANDLE(hdl_get+ndim);
589
(void)get_next_RRproc(1,0);
590
for(ndim=1;ndim<=MAXDIMS;ndim++){
591
strideA[0]=sizeof(double);
592
strideB[0]=sizeof(double);
594
strideA[i] *= dimsA[i];
595
strideB[i] *= dimsB[i];
597
strideA[i+1] = strideA[i];
598
strideB[i+1] = strideB[i];
601
proc=get_next_RRproc(0,ndim);
602
get_range(ndim, dimsA, nloA[ndim], nhiA[ndim]);
603
new_range(ndim, dimsB, nloA[ndim], nhiA[ndim], nloB[ndim],
605
new_range(ndim, dimsA, nloA[ndim], nhiA[ndim], nloC[ndim],
608
print_range("local",ndim,nloA[ndim], nhiA[ndim],"-> ");
609
print_range("remote",ndim,nloB[ndim], nhiB[ndim],"-> ");
610
print_range("local",ndim,nloC[ndim], nhiC[ndim],"\n");
611
fflush(stdout); sleep(1);
614
idx1 = Index(ndim, nloA[ndim], dimsA);
615
idx2 = Index(ndim, nloB[ndim], dimsB);
616
idx3 = Index(ndim, nloC[ndim], dimsA);
617
for(j=0;j<ndim;j++)count[j]=nhiA[ndim][j]-nloA[ndim][j]+1;
618
count[0] *= sizeof(double);
621
(void)ARMCI_NbPut((double*)a[ndim]+idx1,(double*)b[ndim][proc]+idx2,
622
count[0], proc, (hdl_put+ndim));
624
(void)ARMCI_NbPutS((double*)a[ndim]+idx1,strideA,
625
(double*)b[ndim][proc]+idx2,
626
strideB, count, ndim-1, proc,(hdl_put+ndim));
631
/*before we do gets, we have to make sure puts are complete
632
on the remote processor*/
633
for(ndim=1;ndim<=MAXDIMS;ndim++)
634
ARMCI_Wait(hdl_put+ndim);
638
(void)get_next_RRproc(1,0);
640
for(ndim=1;ndim<=MAXDIMS;ndim++){
641
strideA[0]=sizeof(double);
642
strideB[0]=sizeof(double);
644
strideA[i] *= dimsA[i];
645
strideB[i] *= dimsB[i];
647
strideA[i+1] = strideA[i];
648
strideB[i+1] = strideB[i];
651
/*send it to a different process everytime*/
652
proc=get_next_RRproc(0,ndim);
654
idx1 = Index(ndim, nloA[ndim], dimsA);
655
idx2 = Index(ndim, nloB[ndim], dimsB);
656
idx3 = Index(ndim, nloC[ndim], dimsA);
657
for(j=0;j<ndim;j++)count[j]=nhiA[ndim][j]-nloA[ndim][j]+1;
658
count[0] *= sizeof(double);
660
(void)ARMCI_NbGet((double*)b[ndim][proc]+idx2,(double*)c[ndim]+idx3,
661
count[0], proc, (hdl_get+ndim));
663
(void)ARMCI_NbGetS((double*)b[ndim][proc]+idx2,strideB,
664
(double*)c[ndim]+idx3,
665
strideA, count, ndim-1, proc,(hdl_get+ndim));
671
printf("Now waiting for all non-blocking calls and verifying data...\n");
674
for(ndim=1;ndim<=MAXDIMS;ndim++){
675
ARMCI_Wait(hdl_get+ndim);
676
idx1 = Index(ndim, nloA[ndim], dimsA);
677
idx2 = Index(ndim, nloB[ndim], dimsB);
678
idx3 = Index(ndim, nloC[ndim], dimsA);
679
compare_patches(0.,ndim,(double*)a[ndim]+idx1,nloA[ndim],nhiA[ndim],
680
dimsA,(double*)c[ndim]+idx3,nloC[ndim],nhiC[ndim],dimsA);
687
for(ndim=1;ndim<=MAXDIMS;ndim++){
688
destroy_array(b[ndim]);
694
#define PTR_ARR_LEN 10
696
#define VEC_ELE_LEN 20 /*number of doubles in each dimension*/
697
#define GIOV_ARR_LEN 9
699
void verify_vector_data(double *data,int procs,int isput,int datalen)
702
int i,j=0,k=0,kc=0,dst=0;
704
for(i=0;i<datalen;i++){
706
if(ARMCI_ABS((data[i] -(me+facto+dst)*((kc+1)*(j%PTR_ARR_LEN + 1))))>0.001){
707
printf("\n%d:while verifying data of a op from proc=%d ",me,dst);
708
printf("giov index=%d ptr_arr_index=%d \n :element index=%d",kc,
710
printf(" elem was supposed to be %f but is %f",
711
(me+facto+dst)*((kc+1)*(j%PTR_ARR_LEN + 1)) ,data[i]);
714
ARMCI_Error("vector non-blocking failed",0);
719
if(j%PTR_ARR_LEN==0){
721
if((kc%GIOV_ARR_LEN)==0){kc=0;dst++;}
727
void test_vec_small()
731
armci_giov_t dsc[MAXPROC*GIOV_ARR_LEN];
732
void **psrc; /*arrays of pointers to be used by giov_t*/
734
void *getsrc[MAXPROC]; /*to allocate mem via armci_malloc*/
735
void *putdst[MAXPROC]; /*to allocate mem via armci_malloc*/
736
armci_hdl_t hdl_put[MAXPROC],hdl_get[MAXPROC];
737
int i=0,j=0,k=0,kc=0,kcold=0,rc,dstproc,dst=0;
740
lenpergiov = PTR_ARR_LEN*VEC_ELE_LEN;
741
rc = ARMCI_Malloc(getsrc,sizeof(double)*nproc*GIOV_ARR_LEN*lenpergiov);
742
assert(rc==0);assert(getsrc[me]);
743
rc = ARMCI_Malloc(putdst,sizeof(double)*nproc*GIOV_ARR_LEN*lenpergiov);
744
assert(rc==0);assert(putdst[me]);
746
/*first malloc for getdst and putsrc, both are 2d arrays*/
747
getdst = (double *)malloc(sizeof(double)*nproc*GIOV_ARR_LEN*lenpergiov);
748
putsrc = (double **)malloc(sizeof(double *)*nproc*GIOV_ARR_LEN*PTR_ARR_LEN);
749
assert(getdst);assert(putsrc);
750
for(i=0;i<nproc*GIOV_ARR_LEN*PTR_ARR_LEN;i++){
751
putsrc[i]=(double *)malloc(sizeof(double)*VEC_ELE_LEN);
754
/*allocating memory for psrc and pdst*/
755
psrc = (void **)malloc(sizeof(void *)*PTR_ARR_LEN * nproc*GIOV_ARR_LEN);
756
pdst = (void **)malloc(sizeof(void *)*PTR_ARR_LEN * nproc*GIOV_ARR_LEN);
757
assert(pdst);assert(psrc);
759
for(i=0;i<nproc*lenpergiov*GIOV_ARR_LEN;i++){
760
putsrc[j][k] =(me+1.89+dst)*((kc+1)*((j%PTR_ARR_LEN) + 1));
761
((double *)getsrc[me])[i]=(me+2.89+dst)*((kc+1)*(j%PTR_ARR_LEN + 1));
765
if((j%PTR_ARR_LEN)==0){
767
if((kc%GIOV_ARR_LEN)==0){kc=0;dst++;}
771
/*********************Testing NbPutV*********************************/
774
for(i=0;i<nproc-1;i++){
775
dstproc++;if(dstproc==nproc)dstproc=0;
776
for(j=0; j <GIOV_ARR_LEN; j++){
778
for(k=0;k<PTR_ARR_LEN;k++,kc++){
780
psrc[kc]=(void *)putsrc[PTR_ARR_LEN*(dstproc*GIOV_ARR_LEN+j)+k];
781
ptr = (double *)putdst[dstproc];
782
pdst[kc]=(void *)(ptr+lenpergiov*(GIOV_ARR_LEN*me+j)+k*VEC_ELE_LEN);
784
dsc[j].bytes = VEC_ELE_LEN*sizeof(double);
785
dsc[j].src_ptr_array = &psrc[kcold];
786
dsc[j].dst_ptr_array = &pdst[kcold];
787
dsc[j].ptr_array_len = PTR_ARR_LEN;
789
ARMCI_INIT_HANDLE(hdl_put+dstproc);
790
if((rc=ARMCI_NbPutV(dsc,GIOV_ARR_LEN,dstproc,hdl_put+dstproc)))
791
ARMCI_Error("putv failed",rc);
794
printf("\n\tNow veryfying the vector put data for correctness");
796
for(i=0;i<nproc;i++)if(i!=me)ARMCI_Wait(hdl_put+i);
799
ARMCI_AllFence();/*every one syncs after put */
800
verify_vector_data((double *)putdst[me],nproc,1,nproc*GIOV_ARR_LEN*lenpergiov);
802
printf("\n\tPuts OK\n");
804
/****************Done Testing NbPutV*********************************/
806
/*********************Testing NbGetV*********************************/
809
for(i=0;i<nproc-1;i++){
810
dstproc++;if(dstproc==nproc)dstproc=0;
811
for(j=0; j <GIOV_ARR_LEN; j++){
813
for(k=0;k<PTR_ARR_LEN;k++,kc++){
816
pdst[kc]=(void *)(ptr+lenpergiov*(dstproc*GIOV_ARR_LEN+j)+k*VEC_ELE_LEN);
817
ptr = (double *)(getsrc[dstproc]);
818
psrc[kc]=(void *)(ptr+lenpergiov*(me*GIOV_ARR_LEN+j)+k*VEC_ELE_LEN);
820
dsc[j].bytes = VEC_ELE_LEN*sizeof(double);
821
dsc[j].src_ptr_array = &psrc[kcold];
822
dsc[j].dst_ptr_array = &pdst[kcold];
823
dsc[j].ptr_array_len = PTR_ARR_LEN;
825
ARMCI_INIT_HANDLE(hdl_get+dstproc);
826
if((rc=ARMCI_NbGetV(dsc,GIOV_ARR_LEN,dstproc,hdl_get+dstproc)))
827
ARMCI_Error("putv failed",rc);
830
printf("\n\tNow veryfying the vector get data for correctness");
832
for(i=0;i<nproc;i++)if(i!=me)ARMCI_Wait(hdl_get+i);
835
verify_vector_data((double *)getdst,nproc,0,nproc*GIOV_ARR_LEN*lenpergiov);
837
printf("\n\tGets OK\n");
839
/****************Done Testing NbGetV*********************************/
843
for(i=0;i<nproc*GIOV_ARR_LEN*PTR_ARR_LEN;i++) free( putsrc[i]);
849
void GetPermutedProcList(int* ProcList)
853
if(nproc > MAXPROC) ARMCI_Error("permute_proc: nproc to big ", nproc);
855
/* initialize list */
856
for(i=0; i< nproc; i++) ProcList[i]=i;
857
if(nproc ==1) return;
859
/* every process generates different random sequence */
860
(void)srand((unsigned)me);
862
/* list permutation generated by random swapping */
863
for(i=0; i< nproc; i++){
864
iswap = (int)(rand() % nproc);
865
temp = ProcList[iswap];
866
ProcList[iswap] = ProcList[i];
873
/*\ Atomic Accumulate test: remote += alpha*local
874
* Every process has its patch of array b updated TIMES*NPROC times.
875
* The sequence of updates is random: everybody uses a randomly permuted list
876
* and accumulate is non-collective (of-course)
878
void test_acc(int ndim)
884
double alpha=0.1, scale;
886
int *proclist = (int*)work;
889
strideA[0]=sizeof(double);
890
strideB[0]=sizeof(double);
892
strideA[i] *= dimsA[i];
893
strideB[i] *= dimsB[i];
895
strideA[i+1] = strideA[i];
896
strideB[i+1] = strideB[i];
900
/* set up patch coordinates: same on every processor */
905
count[i]=hiA[i]-loA[i]+1;
908
/* create shared and local arrays */
909
create_array(b, sizeof(double),ndim,dimsB);
910
a = malloc(sizeof(double)*elems);
912
c = malloc(sizeof(double)*elems);
915
init(a, ndim, elems, dimsA);
918
printf("--------array[%d",dimsA[0]);
919
for(dim=1;dim<ndim;dim++)printf(",%d",dimsA[dim]);
920
printf("]--------\n");
923
GetPermutedProcList(proclist);
925
idx1 = Index(ndim, loA, dimsA);
926
idx2 = Index(ndim, loB, dimsB);
927
count[0] *= sizeof(double); /* convert range to bytes at stride level zero */
929
/* initialize all elements of array b to zero */
931
for(i=0;i<ndim;i++)elems *= dimsB[i];
932
for(i=0;i<elems;i++)((double*)b[me])[i]=0.;
937
print_range("patch",ndim,loA, hiA," -> ");
938
print_range("patch",ndim,loB, hiB,"\n");
944
for(i=0;i<TIMES*nproc;i++){
945
proc=proclist[i%nproc];
946
(void)ARMCI_AccS(ARMCI_ACC_DBL,&alpha,(double*)a + idx1, strideA,
947
(double*)b[proc] + idx2, strideB, count, ndim-1, proc);
954
/* copy my patch into local array c */
955
(void)ARMCI_GetS((double*)b[me] + idx2, strideB, (double*)c + idx1, strideA, count, ndim-1, me);
957
scale = alpha*TIMES*nproc;
959
scale_patch(scale, ndim, (double*)a+idx1, loA, hiA, dimsA);
961
compare_patches(.0001, ndim, (double*)a+idx1, loA, hiA, dimsA, (double*)c+idx1, loA, hiA, dimsA);
975
/*************************** vector interface *********************************\
976
* tests vector interface for transfers of triangular sections of a 2-D array *
977
******************************************************************************/
980
int dim,elems,ndim,cols,rows,mrc;
986
armci_giov_t dsc[MAX_DIM_VAL];
987
void *psrc[MAX_DIM_VAL];
988
void *pdst[MAX_DIM_VAL];
993
dimsA[i]=MAX_DIM_VAL;
994
dimsB[i]=MAX_DIM_VAL+1;
998
/* create shared and local arrays */
999
create_array(b, sizeof(double),ndim,dimsB);
1000
a = malloc(sizeof(double)*elems);
1002
c = malloc(sizeof(double)*elems);
1005
init(a, ndim, elems, dimsA);
1008
printf("--------array[%d",dimsA[0]);
1009
for(dim=1;dim<ndim;dim++)printf(",%d",dimsA[dim]);
1010
printf("]--------\n");
1014
for(loop=0;loop<LOOP;loop++){
1015
get_range(ndim, dimsA, loA, hiA);
1016
new_range(ndim, dimsB, loA, hiA, loB, hiB);
1017
new_range(ndim, dimsA, loA, hiA, loC, hiC);
1022
print_range("local",ndim,loA, hiA,"-> ");
1023
print_range("remote",ndim,loB, hiB,"-> ");
1024
print_range("local",ndim,loC, hiC,"\n");
1027
/* printf("array at source\n");*/
1028
/* print_2D_double((double *)a, dimsA[0], loA, hiA);*/
1030
cols = hiA[1]-loA[1]+1;
1031
rows = hiA[0]-loA[0]+1;
1032
mrc =ARMCI_MIN(cols,rows);
1034
/* generate a data descriptor for a lower-triangular patch */
1035
for(i=0; i < mrc; i++){
1041
idx = Index(ndim, ij, dimsA);
1042
psrc[i]= (double*)a + idx;
1046
idx = Index(ndim, ij, dimsB);
1047
pdst[i]= (double*)b[proc] + idx;
1049
dsc[i].bytes = (rows-i)*sizeof(double);
1050
dsc[i].src_ptr_array = &psrc[i];
1051
dsc[i].dst_ptr_array = &pdst[i];
1053
/* assume each element different in size (not true in rectangular patches) */
1054
dsc[i].ptr_array_len = 1;
1057
if((rc=ARMCI_PutV(dsc, mrc, proc)))ARMCI_Error("putv failed ",rc);
1059
/* printf("array at destination\n");*/
1060
/* print_2D_double((double *)b[proc], dimsB[0], loB, hiB);*/
1062
/* generate a data descriptor for the upper-triangular patch */
1063
/* there is one less element since diagonal is excluded */
1064
for(i=1; i < cols; i++){
1069
psrc[i-1]= (double*)a + Index(ndim, ij, dimsA);
1073
pdst[i-1]= (double*)b[proc] + Index(ndim, ij, dimsB);
1075
mrc = ARMCI_MIN(i,rows);
1076
dsc[i-1].bytes = mrc*sizeof(double);
1077
dsc[i-1].src_ptr_array = &psrc[i-1];
1078
dsc[i-1].dst_ptr_array = &pdst[i-1];
1080
/* assume each element different in size (not true in rectangular patches) */
1081
dsc[i-1].ptr_array_len = 1;
1084
if((cols-1))if((rc=ARMCI_PutV(dsc, cols-1, proc)))
1085
ARMCI_Error("putv(2) failed ",rc);
1087
/* we get back entire rectangular patch */
1088
for(i=0; i < cols; i++){
1092
psrc[i]= (double*)b[proc] + Index(ndim, ij, dimsB);
1096
pdst[i]= (double*)c + Index(ndim, ij, dimsA);
1099
dsc[0].bytes = rows*sizeof(double);
1100
dsc[0].src_ptr_array = psrc;
1101
dsc[0].dst_ptr_array = pdst;
1102
dsc[0].ptr_array_len = cols;
1104
/* note that we do not need ARMCI_Fence here since
1105
* consecutive operations targeting the same process are ordered */
1106
if((rc=ARMCI_GetV(dsc, 1, proc)))ARMCI_Error("getv failed ",rc);
1108
idx1 = Index(ndim, loA, dimsA);
1109
idx3 = Index(ndim, loC, dimsA);
1110
compare_patches(0., ndim, (double*)a+idx1, loA, hiA, dimsA, (double*)c+idx3, loC, hiC, dimsA);
1120
/*\ Atomic Accumulate test for vector API: remote += alpha*local
1121
* Every process has its patch of array b updated TIMES*NPROC times.
1122
* The sequence of updates is random: everybody uses a randomly permuted list
1123
* and accumulate is non-collective (of-course)
1125
void test_vector_acc()
1127
int dim,elems,bytes;
1128
int i, j, proc, rc, one=1;
1130
void *psrc[ELEMS/2], *pdst[ELEMS/2];
1132
double alpha=0.1, scale;
1133
int *proclist = (int*)work;
1138
bytes = sizeof(double)*elems;
1140
/* create shared and local arrays */
1141
create_array(b, sizeof(double),dim,&elems);
1147
init(a, dim, elems, &elems);
1150
printf("--------array[%d",elems);
1151
printf("]--------\n");
1155
GetPermutedProcList(proclist);
1157
/* initialize all elements of array b to zero */
1158
for(i=0;i<elems;i++)((double*)b[me])[i]=0.;
1162
dsc.bytes = sizeof(double);
1163
dsc.src_ptr_array = psrc;
1164
dsc.dst_ptr_array = pdst;
1165
dsc.ptr_array_len = elems/2;
1169
for(i=0;i<TIMES*nproc;i++){
1171
/* proc=proclist[i%nproc];*/
1174
/* accumulate even numbered elements */
1175
for(j=0; j<elems/2; j++){
1176
psrc[j]= 2*j + (double*)a;
1177
pdst[j]= 2*j + (double*)b[proc];
1179
if((rc = ARMCI_AccV(ARMCI_ACC_DBL, &alpha, &dsc, 1, proc)))
1180
ARMCI_Error("accumlate failed",rc);
1181
/* for(j=0; j<elems; j++)
1182
printf("%d %lf %lf\n",j, *(j+ (double*)b[proc]), *(j+ (double*)a));
1184
/* accumulate odd numbered elements */
1185
for(j=0; j< elems/2; j++){
1186
psrc[j]= 2*j+1 + (double*)a;
1187
pdst[j]= 2*j+1 + (double*)b[proc];
1189
(void)ARMCI_AccV(ARMCI_ACC_DBL, &alpha, &dsc, 1, proc);
1191
/* for(j=0; j<elems; j++)
1192
printf("%d %lf %lf\n",j, *(j+ (double*)a), *(j+ (double*)b[proc]));
1199
/* copy my patch into local array c */
1200
assert(!ARMCI_Get((double*)b[proc], c, bytes, proc));
1202
/* scale = alpha*TIMES*nproc; */
1203
scale = alpha*TIMES*nproc*nproc;
1204
scale_patch(scale, dim, a, &one, &elems, &elems);
1206
compare_patches(.0001, dim, a, &one, &elems, &elems, c, &one, &elems, &elems);
1215
destroy_array((void**)b);
1221
void test_fetch_add()
1223
int rc, bytes, i, val, times =0;
1226
/* shared variable is located on processor 0 */
1227
bytes = me == 0 ? sizeof(int) : 0;
1229
rc = ARMCI_Malloc((void**)arr,bytes);
1233
if(me == 0) *arr[0] = 0; /* initialization */
1237
/* show what everybody gets */
1238
rc = ARMCI_Rmw(ARMCI_FETCH_AND_ADD, &val, arr[0], 1, 0);
1241
for(i = 0; i< nproc; i++){
1243
printf("process %d got value of %d\n",i,val);
1250
printf("\nIncrement the shared counter until reaches %d\n",LOOP);
1256
/* now increment the counter value until reaches LOOP */
1258
rc = ARMCI_Rmw(ARMCI_FETCH_AND_ADD, &val, arr[0], 1, 0);
1263
for(i = 0; i< nproc; i++){
1265
printf("process %d incremented the counter %d times value=%d\n",i,times,val);
1272
if(me == 0) *arr[0] = 0; /* set it back to 0 */
1274
printf("\nNow everybody increments the counter %d times\n",LOOP);
1281
for(i = 0; i< LOOP; i++){
1282
rc = ARMCI_Rmw(ARMCI_FETCH_AND_ADD, &val, arr[0], 1, 0);
1290
printf("The final value is %d, should be %d.\n\n",*arr[0],LOOP*nproc);
1292
if( *arr[0] != LOOP*nproc) ARMCI_Error("failed ...",*arr[0]);
1295
ARMCI_Free(arr[me]);
1302
int rc, bytes, i, val, whatever=-8999;
1305
/* shared variable is located on processor 0 */
1306
bytes = me == 0 ? sizeof(int) : 0;
1308
rc = ARMCI_Malloc((void**)arr,bytes);
1312
if(me == 0) *arr[0] = 0; /* initialization */
1316
for(i = 0; i< LOOP; i++){
1319
rc = ARMCI_Rmw(ARMCI_SWAP, &val, arr[0], whatever, 0);
1321
}while (val == LOCKED);
1323
rc = ARMCI_Rmw(ARMCI_SWAP, &val, arr[0], whatever, 0);
1332
printf("The final value is %d, should be %d.\n\n",*arr[0],LOOP*nproc);
1334
if( *arr[0] != LOOP*nproc) ARMCI_Error("failed ...",*arr[0]);
1337
ARMCI_Free(arr[me]);
1345
int dim,elems,bytes;
1350
int *proclist = (int*)work;
1352
void *pstart, *pend;
1354
void armci_lockmem(void*, void*, int);
1355
void armci_unlockmem(void);
1360
bytes = elems*sizeof(double);
1362
/* create shared and local arrays */
1363
create_array((void**)b, sizeof(double),dim,&elems);
1364
a = (double*)malloc(bytes);
1366
c = (double*)malloc(bytes);
1369
/* initialize all elements of array b to -1 */
1370
for(i=0;i<elems;i++)b[me][i]=-1.;
1375
for(i=0;i<ELEMS/5;i++)a[i]=me;
1378
for(j=0;j<10*TIMES;j++){
1379
for(i=0;i<TIMES*nproc;i++){
1380
first = rand()%(ELEMS/2);
1381
last = first+ELEMS/5 -1;
1382
pstart = b[proc]+first;
1383
pend = b[proc]+last+1;
1384
elems = last -first +1;
1385
bytes = sizeof(double)*elems;
1387
armci_lockmem(pstart,pend,proc);
1388
assert(!ARMCI_Put(a, pstart, bytes, proc));
1389
assert(!ARMCI_Get(pstart, c, bytes, proc));
1390
assert(!ARMCI_Get(pstart, c, bytes, proc));
1392
for(k=0;k<elems;k++)if(a[k]!=c[k]){
1393
printf("%d: error patch (%d:%d) elem=%d val=%f\n",me,first,last,k,c[k]);
1395
ARMCI_Error("failed is ",(int)c[k]);
1399
if(0==me)fprintf(stderr,"done %d\n",j);
1411
destroy_array((void**)b);
1417
int i, elems = nproc, one=1;
1418
int *idst[MAXPROC], idst_get[MAXPROC], *isrc_get[MAXPROC];
1419
long *ldst[MAXPROC], ldst_get[MAXPROC], *lsrc_get[MAXPROC];
1420
float *fdst[MAXPROC], fdst_get[MAXPROC], *fsrc_get[MAXPROC];
1421
double *ddst[MAXPROC], ddst_get[MAXPROC], *dsrc_get[MAXPROC];
1423
create_array((void**)idst, sizeof(int),1, &elems);
1424
create_array((void**)ldst, sizeof(long),1, &elems);
1425
create_array((void**)fdst, sizeof(float),1, &elems);
1426
create_array((void**)ddst, sizeof(double),1, &elems);
1427
create_array((void**)isrc_get, sizeof(int),1, &one);
1428
create_array((void**)lsrc_get, sizeof(long),1, &one);
1429
create_array((void**)fsrc_get, sizeof(float),1, &one);
1430
create_array((void**)dsrc_get, sizeof(double),1, &one);
1432
for(i=0; i<elems; i++) {
1433
idst[me][i]=0; ldst[me][i]=0; fdst[me][i]=0.0; ddst[me][i]=0.0;
1434
idst_get[i]=0; ldst_get[i]=0; fdst_get[i]=0.0; ddst_get[i]=0.0;
1436
isrc_get[me][0]=100*(me+1); lsrc_get[me][0]=100*(me+1);
1437
fsrc_get[me][0]=100.01*(me+1); dsrc_get[me][0]=100.001*(me+1);
1442
for(i=0; i<nproc; i++) {
1443
ARMCI_PutValueInt(10*(me+1), (void *)&idst[i][me], i);
1444
ARMCI_PutValueLong((long)10*(me+1), (void *)&ldst[i][me], i);
1445
ARMCI_PutValueFloat(10.01*(me+1), (void *)&fdst[i][me], i);
1446
ARMCI_PutValueDouble(10.001*(me+1), (void *)&ddst[i][me], i);
1449
for(i=0; i<nproc; i++) {
1450
idst_get[i] = ARMCI_GetValueInt(isrc_get[i], i);
1451
ldst_get[i] = ARMCI_GetValueLong(lsrc_get[i], i);
1452
fdst_get[i] = ARMCI_GetValueFloat(fsrc_get[i], i);
1453
ddst_get[i] = ARMCI_GetValueDouble(dsrc_get[i], i);
1459
if(me==0)printf("int data type: ");
1460
for(i=0; i<elems; i++) {
1461
if(idst[me][i]!=10*(i+1))
1462
ARMCI_Error("Integer register-originated put failed", 0);
1463
if(idst_get[i]!=100*(i+1))
1464
ARMCI_Error("Integer register-originated get failed", 0);
1467
if(me==0)printf("OK\nlong data type: ");
1468
for(i=0; i<elems; i++) {
1469
if(ldst[me][i]!=10*(i+1))
1470
ARMCI_Error("Long register-originated put failed", 0);
1471
if(ldst_get[i]!=100*(i+1))
1472
ARMCI_Error("Long register-originated get failed", 0);
1474
if(me==0)printf("OK\nfloat data type: ");
1475
for(i=0; i<elems; i++) {
1476
if( ARMCI_ABS(fdst[me][i]-10.01*(i+1)) > 0.1)
1477
ARMCI_Error("Float register-originated put failed", 0);
1478
if( ARMCI_ABS(fdst_get[i]-100.01*(i+1)) > 0.1)
1479
ARMCI_Error("Float register-originated get failed", 0);
1481
if(me==0)printf("OK\ndouble data type: ");
1482
for(i=0; i<elems; i++) {
1483
if(ARMCI_ABS(ddst[me][i]-10.001*(i+1)) > 0.1)
1484
ARMCI_Error("Double register-originated put failed",0);
1485
if(ARMCI_ABS(ddst_get[i]-100.001*(i+1)) > 0.1)
1486
ARMCI_Error("Double register-originated get failed",0);
1488
if(me==0){printf("OK\n"); fflush(stdout);}
1494
destroy_array((void **)idst);
1495
destroy_array((void **)ldst);
1496
destroy_array((void **)fdst);
1497
destroy_array((void **)ddst);
1499
destroy_array((void **)isrc_get);
1500
destroy_array((void **)lsrc_get);
1501
destroy_array((void **)fsrc_get);
1502
destroy_array((void **)dsrc_get);
1506
#define MAXELEMS 6400
1507
#define NUMAGG 20 /* NUMAGG < MAXELEMS/10 */
1508
#define MAX_REQUESTS 325 /* MAXELEMS/NUMAGG is 320; NOTE(review): 325 looks like deliberate slack - confirm */
1511
/*
 * Exercises non-blocking put and get calls issued through per-target
 * handles that have been marked with ARMCI_SET_AGGREGATE_HANDLE.
 *
 * Every process fills its own source buffer dsrc[me][j] = j*1.001*(me+1),
 * writes it into slot "me" of ddst_put on every process, reads the source
 * buffer of every process into its own ddst_get, and finally checks both
 * destination buffers against the expected values (tolerance 0.1).
 * A mismatch is reported through ARMCI_Error.
 */
void test_aggregate() {
1513
int i, j, k, rc, bytes, elems[2] = {MAXPROC, MAXELEMS};
1514
/* Per-process pointer tables, indexed by process rank. */
double *ddst_put[MAXPROC];
1515
double *ddst_get[MAXPROC];
1516
double *dsrc[MAXPROC];
1517
/* One handle per target process for the put phase and one for the get phase. */
armci_hdl_t usr_hdl_put[MAXPROC];
1518
armci_hdl_t usr_hdl_get[MAXPROC];
1520
/* Pointer lists handed to the vector (NbPutV/NbGetV) calls. */
void *src_ptr[MAX_REQUESTS], *dst_ptr[MAX_REQUESTS];
1521
int start = 0, end = 0;
1523
/* Destination buffers are requested as MAXPROC x MAXELEMS doubles, the
   source buffer as MAXELEMS doubles (create_array is defined elsewhere;
   presumably it allocates the memory on every process - confirm). */
create_array((void**)ddst_put, sizeof(double),2, elems);
1524
create_array((void**)ddst_get, sizeof(double),2, elems);
1525
create_array((void**)dsrc, sizeof(double),1, &elems[1]);
1527
/* Source pattern depends on both the element index and the owner's rank,
   so the receiver can tell which process each value came from. */
for(i=0; i<elems[1]; i++) dsrc[me][i]=i*1.001*(me+1);
1528
/* Clear the local destination buffers before any transfer. */
for(i=0; i<elems[0]*elems[1]; i++) {
1529
ddst_put[me][i]=0.0;
1530
ddst_get[me][i]=0.0;
1534
/* Initialise the handles, then mark each one as an aggregate handle. */
for(i=0; i<nproc; i++) ARMCI_INIT_HANDLE(&usr_hdl_put[i]);
1535
for(i=0; i<nproc; i++) ARMCI_INIT_HANDLE(&usr_hdl_get[i]);
1536
for(i=0; i<nproc; i++) ARMCI_SET_AGGREGATE_HANDLE(&usr_hdl_put[i]);
1537
for(i=0; i<nproc; i++) ARMCI_SET_AGGREGATE_HANDLE(&usr_hdl_get[i]);
1539
/* Testing aggregate put */
1540
/* For each target process i the index range [0, elems[1]) is covered in
   four consecutive segments, each using a different put call. The data
   lands at offset me*elems[1] in the target's ddst_put buffer. */
for(i=0; i<nproc; i++) {
1542
/* Segment 1: the first COUNT*NUMAGG elements, one double per call. */
start = 0; end = COUNT*NUMAGG;
1543
for(j=start; j<end; j++) {
1544
bytes = sizeof(double);
1545
ARMCI_NbPutValueDouble(dsrc[me][j], &ddst_put[i][me*elems[1]+j], i,
1549
/* Segment 2: the next COUNT*NUMAGG elements as one vector put; every
   pointer pair describes a run of NUMAGG doubles. */
start = end; end = start + COUNT*NUMAGG;
1550
for(j=start, k=0; j<end; j+=NUMAGG, k++) {
1551
src_ptr[k] = (void *)&dsrc[me][j];
1552
dst_ptr[k] = (void *)&ddst_put[i][me*elems[1]+j];
1554
darr.src_ptr_array = src_ptr;
1555
darr.dst_ptr_array = dst_ptr;
1556
darr.bytes = NUMAGG*sizeof(double);
1557
/* k now holds the number of pointer pairs filled in above. */
darr.ptr_array_len = k;
1558
if((rc=ARMCI_NbPutV(&darr, 1, i, &usr_hdl_put[i])))
1559
ARMCI_Error("armci_nbputv failed\n",rc);
1561
/* Segment 3: the next COUNT*NUMAGG elements through the strided call,
   NUMAGG doubles per request with stride level 0 and no stride arrays. */
start = end; end = start + COUNT*NUMAGG;
1562
for(j=start; j<end; j+=NUMAGG) {
1563
bytes = sizeof(double)*NUMAGG;
1564
if((rc=ARMCI_NbPutS(&dsrc[me][j], NULL, &ddst_put[i][me*elems[1]+j], NULL,
1565
&bytes, 0, i, &usr_hdl_put[i])))
1566
ARMCI_Error("armci_nbputs failed\n",rc);
1569
/* Segment 4: everything that is left, as contiguous puts of NUMAGG doubles. */
start = end; end = elems[1];
1570
for(j=start; j<end; j+=NUMAGG) {
1571
bytes = sizeof(double)*NUMAGG;
1572
if((rc=ARMCI_NbPut(&dsrc[me][j], &ddst_put[i][me*elems[1]+j], bytes,
1573
i, &usr_hdl_put[i])))
1574
ARMCI_Error("armci_nbput failed\n",rc);
1579
/* Wait on every put handle before moving on. */
for(i=0; i<nproc; i++) ARMCI_Wait(&usr_hdl_put[i]);
1581
/* Testing aggregate get */
1582
/* For each source process i the range [0, elems[1]) is covered in three
   segments (vector, strided, contiguous). Data is read from dsrc on
   process i into offset i*elems[1] of the local ddst_get buffer. */
for(i=0; i<nproc; i++) {
1584
/* Segment 1: the first COUNT*NUMAGG elements as one vector get. */
start = 0; end = COUNT*NUMAGG;
1585
for(j=start, k=0; j<end; j+=NUMAGG, k++) {
1586
src_ptr[k] = (void *)&dsrc[i][j];
1587
dst_ptr[k] = (void *)&ddst_get[me][i*elems[1]+j];
1589
darr.src_ptr_array = src_ptr;
1590
darr.dst_ptr_array = dst_ptr;
1591
darr.bytes = NUMAGG*sizeof(double);
1592
darr.ptr_array_len = k;
1593
if((rc=ARMCI_NbGetV(&darr, 1, i, &usr_hdl_get[i])))
1594
ARMCI_Error("armci_nbgetv failed\n", rc);
1596
/* Segment 2: the next COUNT*NUMAGG elements through the strided get. */
start = end; end = start + COUNT*NUMAGG;
1597
for(j=start; j<end; j+=NUMAGG) {
1598
bytes = sizeof(double)*NUMAGG;
1599
if((rc=ARMCI_NbGetS(&dsrc[i][j], NULL, &ddst_get[me][i*elems[1]+j], NULL,
1600
&bytes, 0, i, &usr_hdl_get[i])))
1601
/* NOTE(review): the message says "nbputs" although the failing call is
   ARMCI_NbGetS - looks like a copy/paste leftover; confirm and fix. */
ARMCI_Error("armci_nbputs failed\n",rc);
1604
/* Segment 3: the remaining elements as contiguous gets of NUMAGG doubles. */
start = end; end = elems[1];
1605
for(j=start; j<end; j+=NUMAGG) {
1606
bytes = sizeof(double)*NUMAGG;
1607
if((rc=ARMCI_NbGet(&dsrc[i][j], &ddst_get[me][i*elems[1]+j], bytes,
1608
i, &usr_hdl_get[i])))
1609
ARMCI_Error("armci_nbget failed\n",rc);
1613
/* Wait on every get handle before the results are inspected. */
for(i=0; i<nproc; i++) ARMCI_Wait(&usr_hdl_get[i]);
1620
/* Verify the put results: slot i of the local ddst_put must hold the
   pattern written by process i, i.e. j*1.001*(i+1). */
for(i=0; i<nproc; i++) {
1621
for(j=0; j<elems[1]; j++) {
1622
if( ARMCI_ABS(ddst_put[me][i*elems[1]+j]-j*1.001*(i+1)) > 0.1) {
1623
ARMCI_Error("aggregate put failed...1", 0);
1628
/* NOTE(review): only the printf is guarded by me==0; the fflush that
   follows on the same line runs on every process. */
if(me==0) printf(" aggregate put ..O.K.\n"); fflush(stdout);
1630
/* Verify the get results against the same expected pattern. */
for(i=0; i<nproc; i++) {
1631
for(j=0; j<elems[1]; j++) {
1632
if( ARMCI_ABS(ddst_get[me][i*elems[1]+j]-j*1.001*(i+1)) > 0.1) {
1633
ARMCI_Error("aggregate get failed...1", 0);
1638
if(me==0) printf(" aggregate get ..O.K.\n"); fflush(stdout);
1643
if(me==0){printf("O.K.\n"); fflush(stdout);}
1644
/* Hand the three buffers back through destroy_array. */
destroy_array((void **)ddst_put);
1645
destroy_array((void **)ddst_get);
1646
destroy_array((void **)dsrc);
1649
/*
 * Exercises the same put/get traffic pattern as the aggregate test, but the
 * vector and strided non-blocking calls visible here pass NULL instead of a
 * handle pointer.
 *
 * Every process fills dsrc[me][j] = j*1.001*(me+1), writes it into slot "me"
 * of ddst_put on every process, reads the source buffer of every process
 * into its own ddst_get, and then checks both destination buffers against
 * the expected values (tolerance 0.1). A mismatch is reported through
 * ARMCI_Error.
 */
void test_implicit() {
1651
int i, j, k, rc, bytes, elems[2] = {MAXPROC, MAXELEMS};
1652
/* Per-process pointer tables, indexed by process rank. */
double *ddst_put[MAXPROC];
1653
double *ddst_get[MAXPROC];
1654
double *dsrc[MAXPROC];
1656
/* Pointer lists handed to the vector (NbPutV/NbGetV) calls. */
void *src_ptr[MAX_REQUESTS], *dst_ptr[MAX_REQUESTS];
1657
int start = 0, end = 0;
1658
/* NOTE(review): in the lines visible here usr_hdl is only initialised,
   never passed to a transfer call - confirm whether it is still needed. */
armci_hdl_t usr_hdl[MAXPROC];
1660
/* Destination buffers are requested as MAXPROC x MAXELEMS doubles, the
   source buffer as MAXELEMS doubles (create_array is defined elsewhere). */
create_array((void**)ddst_put, sizeof(double),2, elems);
1661
create_array((void**)ddst_get, sizeof(double),2, elems);
1662
create_array((void**)dsrc, sizeof(double),1, &elems[1]);
1664
/* Source pattern depends on both the element index and the owner's rank. */
for(i=0; i<elems[1]; i++)
1665
dsrc[me][i]=i*1.001*(me+1);
1666
/* Clear the local destination buffers before any transfer. */
for(i=0; i<elems[0]*elems[1]; i++)
1668
ddst_put[me][i]=0.0;
1669
ddst_get[me][i]=0.0;
1673
for(i=0; i<nproc; i++) ARMCI_INIT_HANDLE(&usr_hdl[i]);
1675
/* Put phase: for each target process i the range [0, elems[1]) is covered
   in four consecutive segments, each using a different put call. The data
   lands at offset me*elems[1] in the target's ddst_put buffer. */
for(i=0; i<nproc; i++)
1678
/* Segment 1: the first COUNT*NUMAGG elements, one double per call. */
start = 0; end = COUNT*NUMAGG;
1679
for(j=start; j<end; j++)
1681
bytes = sizeof(double);
1682
ARMCI_NbPutValueDouble(dsrc[me][j], &ddst_put[i][me*elems[1]+j], i,
1686
/* Segment 2: the next COUNT*NUMAGG elements as one vector put; every
   pointer pair describes a run of NUMAGG doubles. */
start = end; end = start + COUNT*NUMAGG;
1687
for(j=start, k=0; j<end; j+=NUMAGG, k++)
1689
src_ptr[k] = (void *)&dsrc[me][j];
1690
dst_ptr[k] = (void *)&ddst_put[i][me*elems[1]+j];
1692
darr.src_ptr_array = src_ptr;
1693
darr.dst_ptr_array = dst_ptr;
1694
darr.bytes = NUMAGG*sizeof(double);
1695
/* k now holds the number of pointer pairs filled in above. */
darr.ptr_array_len = k;
1696
/* NULL in the handle position: no explicit handle is supplied. */
if((rc=ARMCI_NbPutV(&darr, 1, i, NULL)))
1697
ARMCI_Error("armci_nbputv failed\n",rc);
1699
/* Segment 3: the next COUNT*NUMAGG elements through the strided call,
   NUMAGG doubles per request with stride level 0 and no stride arrays. */
start = end; end = start + COUNT*NUMAGG;
1700
for(j=start; j<end; j+=NUMAGG)
1702
bytes = sizeof(double)*NUMAGG;
1703
if((rc=ARMCI_NbPutS(&dsrc[me][j], NULL, &ddst_put[i][me*elems[1]+j], NULL,
1704
&bytes, 0, i, NULL)))
1705
ARMCI_Error("armci_nbputs failed\n",rc);
1708
/* Segment 4: everything that is left, as contiguous puts of NUMAGG doubles. */
start = end; end = elems[1];
1709
for(j=start; j<end; j+=NUMAGG)
1711
bytes = sizeof(double)*NUMAGG;
1712
if((rc=ARMCI_NbPut(&dsrc[me][j], &ddst_put[i][me*elems[1]+j], bytes,
1714
ARMCI_Error("armci_nbput failed\n",rc);
1719
/* Get phase: for each source process i the range [0, elems[1]) is covered
   in three segments (vector, strided, contiguous). Data is read from dsrc
   on process i into offset i*elems[1] of the local ddst_get buffer. */
for(i=0; i<nproc; i++)
1722
/* Segment 1: the first COUNT*NUMAGG elements as one vector get. */
start = 0; end = COUNT*NUMAGG;
1723
for(j=start, k=0; j<end; j+=NUMAGG, k++)
1725
src_ptr[k] = (void *)&dsrc[i][j];
1726
dst_ptr[k] = (void *)&ddst_get[me][i*elems[1]+j];
1728
darr.src_ptr_array = src_ptr;
1729
darr.dst_ptr_array = dst_ptr;
1730
darr.bytes = NUMAGG*sizeof(double);
1731
darr.ptr_array_len = k;
1732
if((rc=ARMCI_NbGetV(&darr, 1, i, NULL)))
1733
ARMCI_Error("armci_nbgetv failed\n", rc);
1735
/* Segment 2: the next COUNT*NUMAGG elements through the strided get. */
start = end; end = start + COUNT*NUMAGG;
1736
for(j=start; j<end; j+=NUMAGG)
1738
bytes = sizeof(double)*NUMAGG;
1739
if((rc=ARMCI_NbGetS(&dsrc[i][j], NULL, &ddst_get[me][i*elems[1]+j], NULL,
1740
&bytes, 0, i, NULL)))
1741
/* NOTE(review): the message says "nbputs" although the failing call is
   ARMCI_NbGetS - looks like a copy/paste leftover; confirm and fix. */
ARMCI_Error("armci_nbputs failed\n",rc);
1744
/* Segment 3: the remaining elements as contiguous gets of NUMAGG doubles. */
start = end; end = elems[1];
1745
for(j=start; j<end; j+=NUMAGG)
1747
bytes = sizeof(double)*NUMAGG;
1748
if((rc=ARMCI_NbGet(&dsrc[i][j], &ddst_get[me][i*elems[1]+j], bytes,
1750
ARMCI_Error("armci_nbget failed\n",rc);
1761
/* Diagnostic output of the loop bounds used by the check below. */
fprintf(stderr,"nproc: %d, elems[1]: %d\n", nproc, elems[1]);
1762
/* Verify the put results: slot i of the local ddst_put must hold the
   pattern written by process i, i.e. j*1.001*(i+1). */
for(i=0; i<nproc; i++)
1764
for(j=0; j<elems[1]; j++)
1766
if( ARMCI_ABS(ddst_put[me][i*elems[1]+j]-j*1.001*(i+1)) > 0.1)
1768
/* NOTE(review): the difference printed below is a double expression
   (ddst_put holds doubles) but the format uses %ld; a floating-point
   conversion such as %f looks intended - confirm and fix. */
fprintf(stderr,"ddst_put[%d][%d*elems[1]+%d]-%d*1.001*(%d+1): %ld (elems[1]: %d)\n",
1769
me, i, j, j, i, ddst_put[me][i*elems[1]+j]-j*1.001*(i+1), elems[1]);
1770
ARMCI_Error("implicit handle(s) failed...(a)", 0);
1776
/* Verify the get results against the same expected pattern. */
for(i=0; i<nproc; i++) {
1777
for(j=0; j<elems[1]; j++) {
1778
if( ARMCI_ABS(ddst_get[me][i*elems[1]+j]-j*1.001*(i+1)) > 0.1) {
1779
ARMCI_Error("implicit handles(s) failed...(b)", 0);
1788
if(me==0){printf("O.K.\n\n"); fflush(stdout);}
1789
/* Hand the three buffers back through destroy_array. */
destroy_array((void **)ddst_put);
1790
destroy_array((void **)ddst_get);
1791
destroy_array((void **)dsrc);
1795
/* we need to rename main if linking with frt compiler */
1800
int main(int argc, char* argv[])
1804
MP_INIT(argc, argv);
1808
/* printf("nproc = %d, me = %d\n", nproc, me);*/
1810
if(nproc>MAXPROC && me==0)
1811
ARMCI_Error("Test works for up to %d processors\n",MAXPROC);
1814
printf("ARMCI test program (%d processes)\n",nproc);
1819
ARMCI_Init_args(&argc, &argv);
1822
if(me==1)armci_die("process 1 committing suicide",1);
1825
printf("\nTesting strided gets and puts\n");
1826
printf("(Only std output for process 0 is printed)\n\n");
1830
for(ndim=1; ndim<= MAXDIMS; ndim++) test_dim(ndim);
1835
printf("\nTesting non-blocking gets and puts\n");
1844
printf("\nTesting non-blocking vector gets and puts\n");
1853
printf("\nTesting atomic accumulate\n");
1857
for(ndim=1; ndim<= MAXDIMS; ndim++) test_acc(ndim);
1862
printf("\nTesting Vector Interface using triangular patches of a 2-D array\n\n");
1872
printf("\nTesting Accumulate with Vector Interface\n\n");
1882
printf("\nTesting atomic fetch&add\n");
1883
printf("(Std Output for all processes is printed)\n\n");
1895
printf("\nTesting atomic swap\n");
1903
printf("\nTesting register-originated put and get\n");
1912
printf("\nTesting aggregate put/get requests\n");
1917
* Aggregate put/get requests cannot be tested for number of procs
1918
* greater than 32. (Current implementation of aggregate put/get
1919
* can use a maximum of 32 handles (defined by macro
1920
* _MAX_AGG_BUFFERS in aggregate.c). This test case is written in
1921
* such a way that each process puts/gets data to all the other
1922
* processes, thus the number of aggregate handle used is equal to
1923
* the number of processes created.
1927
printf("\n WARNING: Aggregate put/get requests cannot be tested for number of procs greater than 32.\n\n");
1938
printf("\nTesting implicit handles\n");
1951
if(me==0){printf("All tests passed\n"); fflush(stdout);}
1957
for(i=0; i<g_idx; i++)
1958
for(j=0; j<nproc; j++)
1959
ARMCI_Memdt(&meminfo[i][j], 0);
1960
for(i=0; i<g_idx; i++)
1961
ARMCI_Memctl(&meminfo[i][me]);