5
/* $Id: test.c,v 1.43.6.6 2007-08-30 22:59:27 manoj Exp $ */
19
# define sleep(x) Sleep(1000*(x)) /* scales x by 1000 before calling Sleep -- presumably seconds to milliseconds on Win32; TODO confirm */
25
extern void armci_lockmem(void *, void *, int);
26
extern void armci_unlockmem(void);
31
/* Solaris has shared memory shortages in the default system configuration */
35
#elif defined(__alpha__)
49
#define EDIM1 (DIM1+OFF)
50
#define EDIM2 (DIM2+OFF)
51
#define EDIM3 (DIM3+OFF)
52
#define EDIM4 (DIM4+OFF)
53
#define EDIM5 (DIM5+OFF)
54
#define EDIM6 (DIM6+OFF)
55
#define EDIM7 (DIM7+OFF)
59
#define MAX_DIM_VAL 50
73
/***************************** macros ************************/
74
#define COPY(src, dst, bytes) memcpy((dst),(src),(bytes)) /* memcpy wrapper taking the source first, destination second */
75
#define ARMCI_MAX(a,b) (((a) >= (b)) ? (a) : (b)) /* larger of a and b; each argument may be evaluated twice */
76
#define ARMCI_MIN(a,b) (((a) <= (b)) ? (a) : (b)) /* smaller of a and b; each argument may be evaluated twice */
77
#define ARMCI_ABS(a) (((a) <0) ? -(a) : (a)) /* absolute value of a; the argument is evaluated twice */
79
/***************************** global data *******************/
81
int work[MAXPROC]; /* work array for propagating addresses */
86
void pvm_init(int argc, char *argv[])
88
int mytid, mygid, ctid[MAXPROC];
92
if ((argc != 2) && (argc != 1)) {
99
if ((np = atoi(argv[1])) < 1) {
107
mygid = pvm_joingroup(MPGROUP);
111
i = pvm_spawn(argv[0], argv + 1, 0, "", np - 1, ctid);
115
while (pvm_gsize(MPGROUP) < np) {
120
pvm_barrier(MPGROUP, np);
122
printf("PVM initialization done!\n");
127
fprintf(stderr, "usage: %s <nproc>\n", argv[0]);
133
/*\ generate random range for a section of multidimensional array
135
void get_range(int ndim, int dims[], int lo[], int hi[])
138
for (dim = 0; dim < ndim; dim++) {
140
toss1 = rand() % dims[dim];
141
toss2 = rand() % dims[dim];
155
/*\ generates a new random range similar to the input range for an array with specified dimensions
157
void new_range(int ndim, int dims[], int lo[], int hi[], int new_lo[], int new_hi[])
160
for (dim = 0; dim < ndim; dim++) {
162
int diff = hi[dim] - lo[dim] + 1;
163
assert(diff <= dims[dim]);
164
range = dims[dim] - diff;
165
toss = (range > 0) ? rand() % range : lo[dim];
167
new_hi[dim] = toss + diff - 1;
168
assert(new_hi[dim] < dims[dim]);
169
assert(diff == (new_hi[dim] - new_lo[dim] + 1));
177
/*\ print range of ndim dimensional array with two strings before and after
179
void print_range(char *pre, int ndim, int lo[], int hi[], char *post)
184
for (i = 0; i < ndim; i++) {
185
printf("%d:%d", lo[i], hi[i]);
187
printf("] %s", post);
195
/*\ print subscript of ndim dimensional array with two strings before and after
197
void print_subscript(char *pre, int ndim, int subscript[], char *post)
202
for (i = 0; i < ndim; i++) {
203
printf("%d", subscript[i]);
205
printf("] %s", post);
214
/*\ print a section of a 2-D array of doubles
216
void print_2D_double(double *a, int ld, int *lo, int *hi)
219
for (i = lo[0]; i <= hi[0]; i++) {
220
for (j = lo[1]; j <= hi[1]; j++) {
221
printf("%13f ", a[ld*j+i]);
228
/*\ initialize array: a[i,j,k,..]=i+100*j+10000*k+ ...
230
void init(double *a, int ndim, int elems, int dims[])
235
for (i = 0; i < elems; i++) {
239
for (dim = 0; dim < ndim; dim++) {
240
idx[dim] = Index % dims[dim];
246
for (dim = 0; dim < ndim; dim++) {
247
val += field * idx[dim];
251
/* printf("(%d,%d,%d)=%6.0f",idx[0],idx[1],idx[2],val); */
256
/*\ compute Index from subscript
257
* assume that first subscript component changes first
259
int Index(int ndim, int subscript[], int dims[])
261
int idx = 0, i, factor = 1;
262
for (i = 0; i < ndim; i++) {
263
idx += subscript[i] * factor;
270
void update_subscript(int ndim, int subscript[], int lo[], int hi[], int dims[])
273
for (i = 0; i < ndim; i++) {
274
if (subscript[i] < hi[i]) {
278
subscript[i] = lo[i];
284
void compare_patches(double eps, int ndim, double *patch1, int lo1[], int hi1[],
285
int dims1[], double *patch2, int lo2[], int hi2[],
290
int subscr1[MAXDIMS], subscr2[MAXDIMS];
292
int offset1, offset2;
294
for (i = 0; i < ndim; i++) { /* count # of elements & verify consistency of both patches */
295
int diff = hi1[i] - lo1[i];
296
assert(diff == (hi2[i] - lo2[i]));
297
assert(diff < dims1[i]);
298
assert(diff < dims2[i]);
305
/* compare element values in both patches */
306
offset1 = Index(ndim, subscr1, dims1);
307
offset2 = Index(ndim, subscr2, dims2);
308
for (j = 0; j < elems; j++) {
311
idx1 = Index(ndim, subscr1, dims1); /* calculate element Index from a subscript */
312
idx2 = Index(ndim, subscr2, dims2);
318
diff = patch1[idx1] - patch2[idx2];
319
max = ARMCI_MAX(ARMCI_ABS(patch1[idx1]), ARMCI_ABS(patch2[idx2]));
320
if (max == 0. || max < eps) {
324
if (eps < ARMCI_ABS(diff) / max) {
326
sprintf(msg, "(proc=%d):%f", me, patch1[idx1]);
327
print_subscript("ERROR: a", ndim, subscr1, msg);
328
sprintf(msg, "%f\n", patch2[idx2]);
329
print_subscript(" b", ndim, subscr2, msg);
332
ARMCI_Error("Bailing out", 0);
335
{ /* update subscript for the patches */
336
update_subscript(ndim, subscr1, lo1, hi1, dims1);
337
update_subscript(ndim, subscr2, lo2, hi2, dims2);
343
/* make sure we reached upper limit */
344
/*for(i=0;i<ndim;i++){
345
assert(subscr1[i]==hi1[i]);
346
assert(subscr2[i]==hi2[i]);
351
void scale_patch(double alpha, int ndim, double *patch1, int lo1[], int hi1[], int dims1[])
354
int subscr1[MAXDIMS];
357
for (i = 0; i < ndim; i++) { /* count # of elements in patch */
358
int diff = hi1[i] - lo1[i];
359
assert(diff < dims1[i]);
364
/* scale element values in the patch */
365
offset1 = Index(ndim, subscr1, dims1);
366
for (j = 0; j < elems; j++) {
368
idx1 = Index(ndim, subscr1, dims1); /* calculate element Index from a subscript */
370
patch1[idx1] *= alpha;
371
update_subscript(ndim, subscr1, lo1, hi1, dims1);
376
/* #define NEWMALLOC */
378
armci_meminfo_t meminfo[MMAX][MAXPROC];
382
void create_array(void *a[], int elem_size, int ndim, int dims[])
384
int bytes = elem_size, i, rc;
386
assert(ndim <= MAXDIMS);
387
for (i = 0; i < ndim; i++) {
393
ARMCI_Error("increase MMAX", g_idx);
395
ARMCI_Memget(bytes, &meminfo[g_idx][me], 0);
397
for (i = 0; i < nproc; i++) {
398
armci_msg_brdcst(&meminfo[g_idx][i], sizeof(armci_meminfo_t), i);
401
for (i = 0; i < nproc; i++) {
402
a[i] = ARMCI_Memat(&meminfo[g_idx][i], 0);
407
rc = ARMCI_Malloc(a, bytes);
414
void destroy_array(void *ptr[])
418
assert(!ARMCI_Free(ptr[me]));
423
int loA[MAXDIMS], hiA[MAXDIMS];
424
int dimsA[MAXDIMS] = {DIM1, DIM2, DIM3, DIM4, DIM5, DIM6, DIM7};
425
int loB[MAXDIMS], hiB[MAXDIMS];
426
int dimsB[MAXDIMS] = {EDIM1, EDIM2, EDIM3, EDIM4, EDIM5, EDIM6, EDIM7};
428
int strideA[MAXDIMS], strideB[MAXDIMS];
429
int loC[MAXDIMS], hiC[MAXDIMS];
430
int idx[MAXDIMS] = {0, 0, 0, 0, 0, 0, 0};
433
void test_dim(int ndim)
437
/* double a[DIM4][DIM3][DIM2][DIM1], b[EDIM4][EDIM3][EDIM2][EDIM1];*/
442
strideA[0] = sizeof(double);
443
strideB[0] = sizeof(double);
444
for (i = 0; i < ndim; i++) {
445
strideA[i] *= dimsA[i];
446
strideB[i] *= dimsB[i];
448
strideA[i+1] = strideA[i];
449
strideB[i+1] = strideB[i];
454
/* create shared and local arrays */
455
create_array(b, sizeof(double), ndim, dimsB);
456
a = malloc(sizeof(double) * elems);
458
c = malloc(sizeof(double) * elems);
461
init(a, ndim, elems, dimsA);
464
printf("--------array[%d", dimsA[0]);
465
for (dim = 1; dim < ndim; dim++) {
466
printf(",%d", dimsA[dim]);
468
printf("]--------\n");
474
for (i = 0; i < LOOP; i++) {
475
int idx1, idx2, idx3;
476
get_range(ndim, dimsA, loA, hiA);
477
new_range(ndim, dimsB, loA, hiA, loB, hiB);
478
new_range(ndim, dimsA, loA, hiA, loC, hiC);
480
proc = nproc - 1 - me;
483
print_range("local", ndim, loA, hiA, "-> ");
484
print_range("remote", ndim, loB, hiB, "-> ");
485
print_range("local", ndim, loC, hiC, "\n");
488
idx1 = Index(ndim, loA, dimsA);
489
idx2 = Index(ndim, loB, dimsB);
490
idx3 = Index(ndim, loC, dimsA);
492
for (j = 0; j < ndim; j++) {
493
count[j] = hiA[j] - loA[j] + 1;
496
count[0] *= sizeof(double); /* convert range to bytes at stride level zero */
498
(void)ARMCI_PutS((double *)a + idx1, strideA, (double *)b[proc] + idx2, strideB, count, ndim - 1, proc);
502
/* printf("%d: a=(%x,%f) b=(%x,%f)\n",me,idx1 + (double*)a,*(idx1 + (double*)a),idx2 + (double*)b,*(idx2 + (double*)b));*/
506
/* note that we do not need ARMCI_Fence here since
507
* consecutive operations targeting the same process are ordered */
508
(void)ARMCI_GetS((double *)b[proc] + idx2, strideB, (double *)c + idx3, strideA, count, ndim - 1, proc);
510
compare_patches(0., ndim, (double *)a + idx1, loA, hiA, dimsA, (double *)c + idx3, loC, hiC, dimsA);
520
int nloA[MAXDIMS+1][MAXDIMS], nhiA[MAXDIMS+1][MAXDIMS];
521
int nloB[MAXDIMS+1][MAXDIMS], nhiB[MAXDIMS+1][MAXDIMS];
522
int nloC[MAXDIMS+1][MAXDIMS], nhiC[MAXDIMS+1][MAXDIMS];
524
int get_next_RRproc(int initialize, int ndim)
529
distance = nproc / 2;
530
if ((nproc % 2) != 0) {
538
/*send it to a different process every time*/
539
proc = (me <= ((nproc % 2 == 0) ? ((nproc / 2) - 1) : (nproc / 2))) ? (me + distance) : (me - distance);
540
if ((nproc % 2) != 0 && me == (nproc / 2)) {
544
if (me < (nproc / 2)) {
546
if ((me + distance) >= nproc) {
547
distance = nproc / 2;
548
if ((nproc % 2) != 0) {
556
if ((me - distance) >= (nproc / 2)) {
557
distance = nproc / 2;
558
if ((nproc % 2) != 0) {
561
distance = distance + (me - distance);
564
if (ndim != 1 && MAXDIMS > nproc && (ndim % (nproc / 2) == 0)) {
565
distance = nproc / 2;
566
if ((nproc % 2) != 0) {
576
int elems = 1, elems1 = 1;
577
int i, j, proc, ndim, rc;
578
void *b[MAXDIMS+1][MAXPROC];
579
void *a[MAXDIMS+1], *c[MAXDIMS+1];
580
armci_hdl_t hdl_put[MAXDIMS+1], hdl_get[MAXDIMS+1];
581
int idx1 = 0, idx2 = 0, idx3 = 0;
582
/* create shared and local arrays */
583
for (ndim = 1; ndim <= MAXDIMS; ndim++) {
584
elems1 *= dimsB[ndim-1];
585
elems *= dimsA[ndim-1];
586
rc = ARMCI_Malloc(b[ndim], sizeof(double) * elems1);
589
a[ndim] = malloc(sizeof(double) * elems);
591
c[ndim] = malloc(sizeof(double) * elems);
593
init(a[ndim], ndim, elems, dimsA);
594
ARMCI_INIT_HANDLE(hdl_put + ndim);
595
ARMCI_INIT_HANDLE(hdl_get + ndim);
600
(void)get_next_RRproc(1, 0);
601
for (ndim = 1; ndim <= MAXDIMS; ndim++) {
602
strideA[0] = sizeof(double);
603
strideB[0] = sizeof(double);
604
for (i = 0; i < ndim; i++) {
605
strideA[i] *= dimsA[i];
606
strideB[i] *= dimsB[i];
608
strideA[i+1] = strideA[i];
609
strideB[i+1] = strideB[i];
612
proc = get_next_RRproc(0, ndim);
613
get_range(ndim, dimsA, nloA[ndim], nhiA[ndim]);
614
new_range(ndim, dimsB, nloA[ndim], nhiA[ndim], nloB[ndim],
616
new_range(ndim, dimsA, nloA[ndim], nhiA[ndim], nloC[ndim],
619
print_range("local", ndim, nloA[ndim], nhiA[ndim], "-> ");
620
print_range("remote", ndim, nloB[ndim], nhiB[ndim], "-> ");
621
print_range("local", ndim, nloC[ndim], nhiC[ndim], "\n");
626
idx1 = Index(ndim, nloA[ndim], dimsA);
627
idx2 = Index(ndim, nloB[ndim], dimsB);
628
idx3 = Index(ndim, nloC[ndim], dimsA);
629
for (j = 0; j < ndim; j++) {
630
count[j] = nhiA[ndim][j] - nloA[ndim][j] + 1;
632
count[0] *= sizeof(double);
635
(void)ARMCI_NbPut((double *)a[ndim] + idx1, (double *)b[ndim][proc] + idx2,
636
count[0], proc, (hdl_put + ndim));
639
(void)ARMCI_NbPutS((double *)a[ndim] + idx1, strideA,
640
(double *)b[ndim][proc] + idx2,
641
strideB, count, ndim - 1, proc, (hdl_put + ndim));
646
/*before we do gets, we have to make sure puts are complete
647
on the remote processor*/
648
for (ndim = 1; ndim <= MAXDIMS; ndim++) {
649
ARMCI_Wait(hdl_put + ndim);
654
(void)get_next_RRproc(1, 0);
656
for (ndim = 1; ndim <= MAXDIMS; ndim++) {
657
strideA[0] = sizeof(double);
658
strideB[0] = sizeof(double);
659
for (i = 0; i < ndim; i++) {
660
strideA[i] *= dimsA[i];
661
strideB[i] *= dimsB[i];
663
strideA[i+1] = strideA[i];
664
strideB[i+1] = strideB[i];
667
/*send it to a different process every time*/
668
proc = get_next_RRproc(0, ndim);
670
idx1 = Index(ndim, nloA[ndim], dimsA);
671
idx2 = Index(ndim, nloB[ndim], dimsB);
672
idx3 = Index(ndim, nloC[ndim], dimsA);
673
for (j = 0; j < ndim; j++) {
674
count[j] = nhiA[ndim][j] - nloA[ndim][j] + 1;
676
count[0] *= sizeof(double);
678
(void)ARMCI_NbGet((double *)b[ndim][proc] + idx2, (double *)c[ndim] + idx3,
679
count[0], proc, (hdl_get + ndim));
682
(void)ARMCI_NbGetS((double *)b[ndim][proc] + idx2, strideB,
683
(double *)c[ndim] + idx3,
684
strideA, count, ndim - 1, proc, (hdl_get + ndim));
690
printf("Now waiting for all non-blocking calls and verifying data...\n");
693
for (ndim = 1; ndim <= MAXDIMS; ndim++) {
694
ARMCI_Wait(hdl_get + ndim);
695
idx1 = Index(ndim, nloA[ndim], dimsA);
696
idx2 = Index(ndim, nloB[ndim], dimsB);
697
idx3 = Index(ndim, nloC[ndim], dimsA);
698
compare_patches(0., ndim, (double *)a[ndim] + idx1, nloA[ndim], nhiA[ndim],
699
dimsA, (double *)c[ndim] + idx3, nloC[ndim], nhiC[ndim], dimsA);
706
for (ndim = 1; ndim <= MAXDIMS; ndim++) {
707
destroy_array(b[ndim]);
713
#define PTR_ARR_LEN 10
715
#define VEC_ELE_LEN 20 /*number of doubles in each dimension*/
716
#define GIOV_ARR_LEN 9
718
void verify_vector_data(double *data, int procs, int isput, int datalen)
721
int i, j = 0, k = 0, kc = 0, dst = 0;
725
for (i = 0; i < datalen; i++) {
727
if (ARMCI_ABS((data[i] - (me + facto + dst)*((kc + 1)*(j % PTR_ARR_LEN + 1)))) > 0.001) {
728
printf("\n%d:while verifying data of a op from proc=%d ", me, dst);
729
printf("giov index=%d ptr_arr_index=%d \n :element index=%d", kc,
730
(j % PTR_ARR_LEN), k);
731
printf(" elem was supposed to be %f but is %f",
732
(me + facto + dst)*((kc + 1)*(j % PTR_ARR_LEN + 1)) , data[i]);
735
ARMCI_Error("vector non-blocking failed", 0);
738
if (k == VEC_ELE_LEN) {
741
if (j % PTR_ARR_LEN == 0) {
743
if ((kc % GIOV_ARR_LEN) == 0) {
752
void test_vec_small()
756
armci_giov_t dsc[MAXPROC*GIOV_ARR_LEN];
757
void **psrc; /*arrays of pointers to be used by giov_t*/
759
void *getsrc[MAXPROC]; /*to allocate mem via armci_malloc*/
760
void *putdst[MAXPROC]; /*to allocate mem via armci_malloc*/
761
armci_hdl_t hdl_put[MAXPROC], hdl_get[MAXPROC];
762
int i = 0, j = 0, k = 0, kc = 0, kcold = 0, rc, dstproc, dst = 0;
765
lenpergiov = PTR_ARR_LEN * VEC_ELE_LEN;
766
rc = ARMCI_Malloc(getsrc, sizeof(double) * nproc * GIOV_ARR_LEN * lenpergiov);
769
rc = ARMCI_Malloc(putdst, sizeof(double) * nproc * GIOV_ARR_LEN * lenpergiov);
773
/*first malloc for getdst and putsrc, both are 2d arrays*/
774
getdst = (double *)malloc(sizeof(double) * nproc * GIOV_ARR_LEN * lenpergiov);
775
putsrc = (double **)malloc(sizeof(double *) * nproc * GIOV_ARR_LEN * PTR_ARR_LEN);
778
for (i = 0; i < nproc * GIOV_ARR_LEN * PTR_ARR_LEN; i++) {
779
putsrc[i] = (double *)malloc(sizeof(double) * VEC_ELE_LEN);
782
/*allocating memory for psrc and pdst*/
783
psrc = (void **)malloc(sizeof(void *) * PTR_ARR_LEN * nproc * GIOV_ARR_LEN);
784
pdst = (void **)malloc(sizeof(void *) * PTR_ARR_LEN * nproc * GIOV_ARR_LEN);
788
for (i = 0; i < nproc * lenpergiov * GIOV_ARR_LEN; i++) {
789
putsrc[j][k] = (me + 1.89 + dst) * ((kc + 1) * ((j % PTR_ARR_LEN) + 1));
790
((double *)getsrc[me])[i] = (me + 2.89 + dst) * ((kc + 1) * (j % PTR_ARR_LEN + 1));
792
if (k == VEC_ELE_LEN) {
795
if ((j % PTR_ARR_LEN) == 0) {
797
if ((kc % GIOV_ARR_LEN) == 0) {
804
/*********************Testing NbPutV*********************************/
810
for (i = 0; i < nproc - 1; i++) {
812
if (dstproc == nproc) {
815
for (j = 0; j < GIOV_ARR_LEN; j++) {
817
for (k = 0; k < PTR_ARR_LEN; k++, kc++) {
819
psrc[kc] = (void *)putsrc[PTR_ARR_LEN*(dstproc*GIOV_ARR_LEN+j)+k];
820
ptr = (double *)putdst[dstproc];
821
pdst[kc] = (void *)(ptr + lenpergiov * (GIOV_ARR_LEN * me + j) + k * VEC_ELE_LEN);
823
dsc[j].bytes = VEC_ELE_LEN * sizeof(double);
824
dsc[j].src_ptr_array = &psrc[kcold];
825
dsc[j].dst_ptr_array = &pdst[kcold];
826
dsc[j].ptr_array_len = PTR_ARR_LEN;
828
ARMCI_INIT_HANDLE(hdl_put + dstproc);
829
if ((rc = ARMCI_NbPutV(dsc, GIOV_ARR_LEN, dstproc, hdl_put + dstproc))) {
830
ARMCI_Error("putv failed", rc);
834
printf("\n\tNow veryfying the vector put data for correctness");
836
for (i = 0; i < nproc; i++)if (i != me) {
837
ARMCI_Wait(hdl_put + i);
841
ARMCI_AllFence();/*every one syncs after put */
842
verify_vector_data((double *)putdst[me], nproc, 1, nproc * GIOV_ARR_LEN * lenpergiov);
844
printf("\n\tPuts OK\n");
846
/****************Done Testing NbPutV*********************************/
848
/*********************Testing NbGetV*********************************/
854
for (i = 0; i < nproc - 1; i++) {
856
if (dstproc == nproc) {
859
for (j = 0; j < GIOV_ARR_LEN; j++) {
861
for (k = 0; k < PTR_ARR_LEN; k++, kc++) {
864
pdst[kc] = (void *)(ptr + lenpergiov * (dstproc * GIOV_ARR_LEN + j) + k * VEC_ELE_LEN);
865
ptr = (double *)(getsrc[dstproc]);
866
psrc[kc] = (void *)(ptr + lenpergiov * (me * GIOV_ARR_LEN + j) + k * VEC_ELE_LEN);
868
dsc[j].bytes = VEC_ELE_LEN * sizeof(double);
869
dsc[j].src_ptr_array = &psrc[kcold];
870
dsc[j].dst_ptr_array = &pdst[kcold];
871
dsc[j].ptr_array_len = PTR_ARR_LEN;
873
ARMCI_INIT_HANDLE(hdl_get + dstproc);
874
if ((rc = ARMCI_NbGetV(dsc, GIOV_ARR_LEN, dstproc, hdl_get + dstproc))) {
875
ARMCI_Error("putv failed", rc);
879
printf("\n\tNow veryfying the vector get data for correctness");
881
for (i = 0; i < nproc; i++)if (i != me) {
882
ARMCI_Wait(hdl_get + i);
886
verify_vector_data((double *)getdst, nproc, 0, nproc * GIOV_ARR_LEN * lenpergiov);
888
printf("\n\tGets OK\n");
890
/****************Done Testing NbGetV*********************************/
894
for (i = 0; i < nproc * GIOV_ARR_LEN * PTR_ARR_LEN; i++) {
902
void GetPermutedProcList(int *ProcList)
906
if (nproc > MAXPROC) {
907
ARMCI_Error("permute_proc: nproc to big ", nproc);
910
/* initialize list */
911
for (i = 0; i < nproc; i++) {
918
/* every process generates different random sequence */
919
(void)srand((unsigned)me);
921
/* list permutation generated by random swapping */
922
for (i = 0; i < nproc; i++) {
923
iswap = (int)(rand() % nproc);
924
temp = ProcList[iswap];
925
ProcList[iswap] = ProcList[i];
932
/*\ Atomic Accumulate test: remote += alpha*local
933
* Every process/or has its patch of array b updated TIMES*NPROC times.
934
* The sequence of updates is random: everybody uses a randomly permuted list
935
* and accumulate is non-collective (of-course)
937
void test_acc(int ndim)
943
double alpha = 0.1, scale;
945
int *proclist = work;
948
strideA[0] = sizeof(double);
949
strideB[0] = sizeof(double);
950
for (i = 0; i < ndim; i++) {
951
strideA[i] *= dimsA[i];
952
strideB[i] *= dimsB[i];
954
strideA[i+1] = strideA[i];
955
strideB[i+1] = strideB[i];
959
/* set up patch coordinates: same on every processor */
962
loB[i] = dimsB[i] - 2;
964
count[i] = hiA[i] - loA[i] + 1;
967
/* create shared and local arrays */
968
create_array(b, sizeof(double), ndim, dimsB);
969
a = malloc(sizeof(double) * elems);
971
c = malloc(sizeof(double) * elems);
974
init(a, ndim, elems, dimsA);
977
printf("--------array[%d", dimsA[0]);
978
for (dim = 1; dim < ndim; dim++) {
979
printf(",%d", dimsA[dim]);
981
printf("]--------\n");
984
GetPermutedProcList(proclist);
986
idx1 = Index(ndim, loA, dimsA);
987
idx2 = Index(ndim, loB, dimsB);
988
count[0] *= sizeof(double); /* convert range to bytes at stride level zero */
990
/* initialize all elements of array b to zero */
992
for (i = 0; i < ndim; i++) {
995
for (i = 0; i < elems; i++) {
996
((double *)b[me])[i] = 0.;
1002
print_range("patch", ndim, loA, hiA, " -> ");
1003
print_range("patch", ndim, loB, hiB, "\n");
1009
for (i = 0; i < TIMES * nproc; i++) {
1010
proc = proclist[i%nproc];
1011
(void)ARMCI_AccS(ARMCI_ACC_DBL, &alpha, (double *)a + idx1, strideA,
1012
(double *)b[proc] + idx2, strideB, count, ndim - 1, proc);
1019
/* copy my patch into local array c */
1020
(void)ARMCI_GetS((double *)b[me] + idx2, strideB, (double *)c + idx1, strideA, count, ndim - 1, me);
1022
scale = alpha * TIMES * nproc;
1024
scale_patch(scale, ndim, (double *)a + idx1, loA, hiA, dimsA);
1026
compare_patches(.0001, ndim, (double *)a + idx1, loA, hiA, dimsA, (double *)c + idx1, loA, hiA, dimsA);
1040
/*************************** vector interface *********************************\
1041
* tests vector interface for transfers of triangular sections of a 2-D array *
1042
******************************************************************************/
1045
int dim, elems, ndim, cols, rows, mrc;
1051
armci_giov_t dsc[MAX_DIM_VAL];
1052
void *psrc[MAX_DIM_VAL];
1053
void *pdst[MAX_DIM_VAL];
1057
for (i = 0; i < ndim; i++) {
1058
dimsA[i] = MAX_DIM_VAL;
1059
dimsB[i] = MAX_DIM_VAL + 1;
1063
/* create shared and local arrays */
1064
create_array(b, sizeof(double), ndim, dimsB);
1065
a = malloc(sizeof(double) * elems);
1067
c = malloc(sizeof(double) * elems);
1070
init(a, ndim, elems, dimsA);
1073
printf("--------array[%d", dimsA[0]);
1074
for (dim = 1; dim < ndim; dim++) {
1075
printf(",%d", dimsA[dim]);
1077
printf("]--------\n");
1081
for (loop = 0; loop < LOOP; loop++) {
1082
get_range(ndim, dimsA, loA, hiA);
1083
new_range(ndim, dimsB, loA, hiA, loB, hiB);
1084
new_range(ndim, dimsA, loA, hiA, loC, hiC);
1086
proc = nproc - 1 - me;
1089
print_range("local", ndim, loA, hiA, "-> ");
1090
print_range("remote", ndim, loB, hiB, "-> ");
1091
print_range("local", ndim, loC, hiC, "\n");
1094
/* printf("array at source\n");*/
1095
/* print_2D_double((double *)a, dimsA[0], loA, hiA);*/
1097
cols = hiA[1] - loA[1] + 1;
1098
rows = hiA[0] - loA[0] + 1;
1099
mrc = ARMCI_MIN(cols, rows);
1101
/* generate a data descriptor for a lower-triangular patch */
1102
for (i = 0; i < mrc; i++) {
1108
idx = Index(ndim, ij, dimsA);
1109
psrc[i] = (double *)a + idx;
1113
idx = Index(ndim, ij, dimsB);
1114
pdst[i] = (double *)b[proc] + idx;
1116
dsc[i].bytes = (rows - i) * sizeof(double);
1117
dsc[i].src_ptr_array = &psrc[i];
1118
dsc[i].dst_ptr_array = &pdst[i];
1120
/* assume each element different in size (not true in rectangular patches) */
1121
dsc[i].ptr_array_len = 1;
1124
if ((rc = ARMCI_PutV(dsc, mrc, proc))) {
1125
ARMCI_Error("putv failed ", rc);
1128
/* printf("array at destination\n");*/
1129
/* print_2D_double((double *)b[proc], dimsB[0], loB, hiB);*/
1131
/* generate a data descriptor for the upper-triangular patch */
1132
/* there is one less element since diagonal is excluded */
1133
for (i = 1; i < cols; i++) {
1138
psrc[i-1] = (double *)a + Index(ndim, ij, dimsA);
1142
pdst[i-1] = (double *)b[proc] + Index(ndim, ij, dimsB);
1144
mrc = ARMCI_MIN(i, rows);
1145
dsc[i-1].bytes = mrc * sizeof(double);
1146
dsc[i-1].src_ptr_array = &psrc[i-1];
1147
dsc[i-1].dst_ptr_array = &pdst[i-1];
1149
/* assume each element different in size (not true in rectangular patches) */
1150
dsc[i-1].ptr_array_len = 1;
1153
if ((cols - 1))if ((rc = ARMCI_PutV(dsc, cols - 1, proc))) {
1154
ARMCI_Error("putv(2) failed ", rc);
1157
/* we get back entire rectangular patch */
1158
for (i = 0; i < cols; i++) {
1162
psrc[i] = (double *)b[proc] + Index(ndim, ij, dimsB);
1166
pdst[i] = (double *)c + Index(ndim, ij, dimsA);
1169
dsc[0].bytes = rows * sizeof(double);
1170
dsc[0].src_ptr_array = psrc;
1171
dsc[0].dst_ptr_array = pdst;
1172
dsc[0].ptr_array_len = cols;
1174
/* note that we do not need ARMCI_Fence here since
1175
* consecutive operations targeting the same process are ordered */
1176
if ((rc = ARMCI_GetV(dsc, 1, proc))) {
1177
ARMCI_Error("getv failed ", rc);
1180
idx1 = Index(ndim, loA, dimsA);
1181
idx3 = Index(ndim, loC, dimsA);
1182
compare_patches(0., ndim, (double *)a + idx1, loA, hiA, dimsA, (double *)c + idx3, loC, hiC, dimsA);
1192
/*\ Atomic Accumulate test for vector API: remote += alpha*local
1193
* Every process/or has its patch of array b updated TIMES*NPROC times.
1194
* The sequence of updates is random: everybody uses a randomly permuted list
1195
* and accumulate is non-collective (of-course)
1197
void test_vector_acc()
1199
int dim, elems, bytes;
1200
int i, j, proc, rc, one = 1;
1202
void *psrc[ELEMS/2], *pdst[ELEMS/2];
1204
double alpha = 0.1, scale;
1205
int *proclist = work;
1210
bytes = sizeof(double) * elems;
1212
/* create shared and local arrays */
1213
create_array(b, sizeof(double), dim, &elems);
1219
init(a, dim, elems, &elems);
1222
printf("--------array[%d", elems);
1223
printf("]--------\n");
1227
GetPermutedProcList(proclist);
1229
/* initialize all elements of array b to zero */
1230
for (i = 0; i < elems; i++) {
1231
((double *)b[me])[i] = 0.;
1236
dsc.bytes = sizeof(double);
1237
dsc.src_ptr_array = psrc;
1238
dsc.dst_ptr_array = pdst;
1239
dsc.ptr_array_len = elems / 2;
1243
for (i = 0; i < TIMES * nproc; i++) {
1245
/* proc=proclist[i%nproc];*/
1248
/* accumulate even numbered elements */
1249
for (j = 0; j < elems / 2; j++) {
1250
psrc[j] = 2 * j + (double *)a;
1251
pdst[j] = 2 * j + (double *)b[proc];
1253
if ((rc = ARMCI_AccV(ARMCI_ACC_DBL, &alpha, &dsc, 1, proc))) {
1254
ARMCI_Error("accumlate failed", rc);
1256
/* for(j=0; j<elems; j++)
1257
printf("%d %lf %lf\n",j, *(j+ (double*)b[proc]), *(j+ (double*)a));
1259
/* accumulate odd numbered elements */
1260
for (j = 0; j < elems / 2; j++) {
1261
psrc[j] = 2 * j + 1 + (double *)a;
1262
pdst[j] = 2 * j + 1 + (double *)b[proc];
1264
(void)ARMCI_AccV(ARMCI_ACC_DBL, &alpha, &dsc, 1, proc);
1266
/* for(j=0; j<elems; j++)
1267
printf("%d %lf %lf\n",j, *(j+ (double*)a), *(j+ (double*)b[proc]));
1274
/* copy my patch into local array c */
1275
assert(!ARMCI_Get((double *)b[proc], c, bytes, proc));
1277
/* scale = alpha*TIMES*nproc; */
1278
scale = alpha * TIMES * nproc * nproc;
1279
scale_patch(scale, dim, a, &one, &elems, &elems);
1281
compare_patches(.0001, dim, a, &one, &elems, &elems, c, &one, &elems, &elems);
1290
destroy_array((void **)b);
1296
void test_fetch_add()
1298
int rc, bytes, i, val, times = 0;
1301
/* shared variable is located on processor 0 */
1302
bytes = me == 0 ? sizeof(int) : 0;
1304
rc = ARMCI_Malloc((void **)arr, bytes);
1309
*arr[0] = 0; /* initialization */
1314
/* show what everybody gets */
1315
rc = ARMCI_Rmw(ARMCI_FETCH_AND_ADD, &val, arr[0], 1, 0);
1318
for (i = 0; i < nproc; i++) {
1320
printf("process %d got value of %d\n", i, val);
1327
printf("\nIncrement the shared counter until reaches %d\n", LOOP);
1333
/* now increment the counter value until reaches LOOP */
1334
while (val < LOOP) {
1335
rc = ARMCI_Rmw(ARMCI_FETCH_AND_ADD, &val, arr[0], 1, 0);
1340
for (i = 0; i < nproc; i++) {
1342
printf("process %d incremented the counter %d times value=%d\n", i, times, val);
1350
*arr[0] = 0; /* set it back to 0 */
1353
printf("\nNow everybody increments the counter %d times\n", LOOP);
1360
for (i = 0; i < LOOP; i++) {
1361
rc = ARMCI_Rmw(ARMCI_FETCH_AND_ADD, &val, arr[0], 1, 0);
1369
printf("The final value is %d, should be %d.\n\n", *arr[0], LOOP * nproc);
1371
if (*arr[0] != LOOP * nproc) {
1372
ARMCI_Error("failed ...", *arr[0]);
1376
ARMCI_Free(arr[me]);
1383
int rc, bytes, i, val, whatever = -8999;
1386
/* shared variable is located on processor 0 */
1387
bytes = me == 0 ? sizeof(int) : 0;
1389
rc = ARMCI_Malloc((void **)arr, bytes);
1394
*arr[0] = 0; /* initialization */
1399
for (i = 0; i < LOOP; i++) {
1402
rc = ARMCI_Rmw(ARMCI_SWAP, &val, arr[0], whatever, 0);
1405
while (val == LOCKED);
1407
rc = ARMCI_Rmw(ARMCI_SWAP, &val, arr[0], whatever, 0);
1416
printf("The final value is %d, should be %d.\n\n", *arr[0], LOOP * nproc);
1418
if (*arr[0] != LOOP * nproc) {
1419
ARMCI_Error("failed ...", *arr[0]);
1423
ARMCI_Free(arr[me]);
1429
int dim, elems, bytes;
1434
int *proclist = work;
1436
void *pstart, *pend;
1442
bytes = elems * sizeof(double);
1444
/* create shared and local arrays */
1445
create_array((void **)b, sizeof(double), dim, &elems);
1446
a = (double *)malloc(bytes);
1448
c = (double *)malloc(bytes);
1451
/* initialize all elements of array b to zero */
1452
for (i = 0; i < elems; i++) {
1459
for (i = 0; i < ELEMS / 5; i++) {
1464
for (j = 0; j < 10 * TIMES; j++) {
1465
for (i = 0; i < TIMES * nproc; i++) {
1466
first = rand() % (ELEMS / 2);
1467
last = first + ELEMS / 5 - 1;
1468
pstart = b[proc] + first;
1469
pend = b[proc] + last + 1;
1470
elems = last - first + 1;
1471
bytes = sizeof(double) * elems;
1473
armci_lockmem(pstart, pend, proc);
1474
assert(!ARMCI_Put(a, pstart, bytes, proc));
1475
assert(!ARMCI_Get(pstart, c, bytes, proc));
1476
assert(!ARMCI_Get(pstart, c, bytes, proc));
1478
for (k = 0; k < elems; k++)if (a[k] != c[k]) {
1479
printf("%d: error patch (%d:%d) elem=%d val=%f\n", me, first, last, k, c[k]);
1481
ARMCI_Error("failed is ", (int)c[k]);
1486
fprintf(stderr, "done %d\n", j);
1499
destroy_array((void **)b);
1505
int i, elems = nproc, one = 1;
1506
int *idst[MAXPROC], idst_get[MAXPROC], *isrc_get[MAXPROC];
1507
long *ldst[MAXPROC], ldst_get[MAXPROC], *lsrc_get[MAXPROC];
1508
float *fdst[MAXPROC], fdst_get[MAXPROC], *fsrc_get[MAXPROC];
1509
double *ddst[MAXPROC], ddst_get[MAXPROC], *dsrc_get[MAXPROC];
1511
create_array((void **)idst, sizeof(int), 1, &elems);
1512
create_array((void **)ldst, sizeof(long), 1, &elems);
1513
create_array((void **)fdst, sizeof(float), 1, &elems);
1514
create_array((void **)ddst, sizeof(double), 1, &elems);
1515
create_array((void **)isrc_get, sizeof(int), 1, &one);
1516
create_array((void **)lsrc_get, sizeof(long), 1, &one);
1517
create_array((void **)fsrc_get, sizeof(float), 1, &one);
1518
create_array((void **)dsrc_get, sizeof(double), 1, &one);
1520
for (i = 0; i < elems; i++) {
1530
isrc_get[me][0] = 100 * (me + 1);
1531
lsrc_get[me][0] = 100 * (me + 1);
1532
fsrc_get[me][0] = 100.01 * (me + 1);
1533
dsrc_get[me][0] = 100.001 * (me + 1);
1538
for (i = 0; i < nproc; i++) {
1539
ARMCI_PutValueInt(10 *(me + 1), (void *)&idst[i][me], i);
1540
ARMCI_PutValueLong((long)10 *(me + 1), (void *)&ldst[i][me], i);
1541
ARMCI_PutValueFloat(10.01 *(me + 1), (void *)&fdst[i][me], i);
1542
ARMCI_PutValueDouble(10.001 *(me + 1), (void *)&ddst[i][me], i);
1545
for (i = 0; i < nproc; i++) {
1546
idst_get[i] = ARMCI_GetValueInt(isrc_get[i], i);
1547
ldst_get[i] = ARMCI_GetValueLong(lsrc_get[i], i);
1548
fdst_get[i] = ARMCI_GetValueFloat(fsrc_get[i], i);
1549
ddst_get[i] = ARMCI_GetValueDouble(dsrc_get[i], i);
1556
printf("int data type: ");
1558
for (i = 0; i < elems; i++) {
1559
if (idst[me][i] != 10 *(i + 1)) {
1560
ARMCI_Error("Integer register-originated put failed", 0);
1562
if (idst_get[i] != 100 *(i + 1)) {
1563
ARMCI_Error("Integer register-originated get failed", 0);
1568
printf("OK\nlong data type: ");
1570
for (i = 0; i < elems; i++) {
1571
if (ldst[me][i] != 10 *(i + 1)) {
1572
ARMCI_Error("Long register-originated put failed", 0);
1574
if (ldst_get[i] != 100 *(i + 1)) {
1575
ARMCI_Error("Long register-originated get failed", 0);
1579
printf("OK\nfloat data type: ");
1581
for (i = 0; i < elems; i++) {
1582
if (ARMCI_ABS(fdst[me][i] - 10.01 *(i + 1)) > 0.1) {
1583
ARMCI_Error("Float register-originated put failed", 0);
1585
if (ARMCI_ABS(fdst_get[i] - 100.01 *(i + 1)) > 0.1) {
1586
ARMCI_Error("Float register-originated get failed", 0);
1590
printf("OK\ndouble data type: ");
1592
for (i = 0; i < elems; i++) {
1593
if (ARMCI_ABS(ddst[me][i] - 10.001 *(i + 1)) > 0.1) {
1594
ARMCI_Error("Double register-originated put failed", 0);
1596
if (ARMCI_ABS(ddst_get[i] - 100.001 *(i + 1)) > 0.1) {
1597
ARMCI_Error("Double register-originated get failed", 0);
1609
destroy_array((void **)idst);
1610
destroy_array((void **)ldst);
1611
destroy_array((void **)fdst);
1612
destroy_array((void **)ddst);
1614
destroy_array((void **)isrc_get);
1615
destroy_array((void **)lsrc_get);
1616
destroy_array((void **)fsrc_get);
1617
destroy_array((void **)dsrc_get);
1621
#define MAXELEMS 6400
1622
#define NUMAGG 20 /* NUMAGG < MAXELEMS/10 */
1623
#define MAX_REQUESTS 325 /* NOTE(review): MAXELEMS/NUMAGG is 6400/20 = 320, not 325 -- confirm whether the extra slack is intended */
1626
/*
 * Test non-blocking put/get requests issued through aggregate handles.
 *
 * One put handle and one get handle per process are initialised with
 * ARMCI_INIT_HANDLE and marked with ARMCI_SET_AGGREGATE_HANDLE.  Puts are
 * issued as value, vector, strided and contiguous requests; gets as vector,
 * strided and contiguous requests.  Every handle is completed with
 * ARMCI_Wait, and the local destination buffers are compared, within a
 * tolerance of 0.1, against the pattern j * 1.001 * (i + 1) that each
 * process stores in its dsrc buffer.  Non-zero return codes and data
 * mismatches are reported through ARMCI_Error.
 */
void test_aggregate()
1629
int i, j, k, rc, bytes, elems[2] = {MAXPROC, MAXELEMS};
1630
double *ddst_put[MAXPROC];
1631
double *ddst_get[MAXPROC];
1632
double *dsrc[MAXPROC];
1633
armci_hdl_t usr_hdl_put[MAXPROC];
1634
armci_hdl_t usr_hdl_get[MAXPROC];
1636
void *src_ptr[MAX_REQUESTS], *dst_ptr[MAX_REQUESTS];
1637
int start = 0, end = 0;
1639
/* destination buffers use both extents in elems, the source only elems[1] */
create_array((void **)ddst_put, sizeof(double), 2, elems);
1640
create_array((void **)ddst_get, sizeof(double), 2, elems);
1641
create_array((void **)dsrc, sizeof(double), 1, &elems[1]);
1643
/* fill the local source buffer with a rank-dependent pattern */
for (i = 0; i < elems[1]; i++) {
1644
dsrc[me][i] = i * 1.001 * (me + 1);
1646
/* clear the local destination buffers */
for (i = 0; i < elems[0]*elems[1]; i++) {
1647
ddst_put[me][i] = 0.0;
1648
ddst_get[me][i] = 0.0;
1652
/* one put handle and one get handle per process, all set to aggregate mode */
for (i = 0; i < nproc; i++) {
1653
ARMCI_INIT_HANDLE(&usr_hdl_put[i]);
1655
for (i = 0; i < nproc; i++) {
1656
ARMCI_INIT_HANDLE(&usr_hdl_get[i]);
1658
for (i = 0; i < nproc; i++) {
1659
ARMCI_SET_AGGREGATE_HANDLE(&usr_hdl_put[i]);
1661
for (i = 0; i < nproc; i++) {
1662
ARMCI_SET_AGGREGATE_HANDLE(&usr_hdl_get[i]);
1665
/* Testing aggregate put */
1666
for (i = 0; i < nproc; i++) {
1669
end = COUNT * NUMAGG;
1670
for (j = start; j < end; j++) {
1671
bytes = sizeof(double);
1672
ARMCI_NbPutValueDouble(dsrc[me][j], &ddst_put[i][me*elems[1] + j], i,
1677
end = start + COUNT * NUMAGG;
1678
for (j = start, k = 0; j < end; j += NUMAGG, k++) {
1679
src_ptr[k] = (void *)&dsrc[me][j];
1680
dst_ptr[k] = (void *)&ddst_put[i][me*elems[1] + j];
1682
darr.src_ptr_array = src_ptr;
1683
darr.dst_ptr_array = dst_ptr;
1684
darr.bytes = NUMAGG * sizeof(double);
1685
darr.ptr_array_len = k;
1686
if ((rc = ARMCI_NbPutV(&darr, 1, i, &usr_hdl_put[i]))) {
1687
ARMCI_Error("armci_nbputv failed\n", rc);
1691
end = start + COUNT * NUMAGG;
1692
for (j = start; j < end; j += NUMAGG) {
1693
bytes = sizeof(double) * NUMAGG;
1694
if ((rc = ARMCI_NbPutS(&dsrc[me][j], NULL, &ddst_put[i][me*elems[1] + j], NULL,
1695
&bytes, 0, i, &usr_hdl_put[i]))) {
1696
ARMCI_Error("armci_nbputs failed\n", rc);
1702
for (j = start; j < end; j += NUMAGG) {
1703
bytes = sizeof(double) * NUMAGG;
1704
if ((rc = ARMCI_NbPut(&dsrc[me][j], &ddst_put[i][me*elems[1] + j], bytes,
1705
i, &usr_hdl_put[i]))) {
1706
ARMCI_Error("armci_nbput failed\n", rc);
1712
/* complete every aggregate put handle */
for (i = 0; i < nproc; i++) {
1713
ARMCI_Wait(&usr_hdl_put[i]);
1716
/* Testing aggregate get */
1717
for (i = 0; i < nproc; i++) {
1720
end = COUNT * NUMAGG;
1721
for (j = start, k = 0; j < end; j += NUMAGG, k++) {
1722
src_ptr[k] = (void *)&dsrc[i][j];
1723
dst_ptr[k] = (void *)&ddst_get[me][i*elems[1] + j];
1725
darr.src_ptr_array = src_ptr;
1726
darr.dst_ptr_array = dst_ptr;
1727
darr.bytes = NUMAGG * sizeof(double);
1728
darr.ptr_array_len = k;
1729
if ((rc = ARMCI_NbGetV(&darr, 1, i, &usr_hdl_get[i]))) {
1730
ARMCI_Error("armci_nbgetv failed\n", rc);
1734
end = start + COUNT * NUMAGG;
1735
for (j = start; j < end; j += NUMAGG) {
1736
bytes = sizeof(double) * NUMAGG;
1737
if ((rc = ARMCI_NbGetS(&dsrc[i][j], NULL, &ddst_get[me][i*elems[1] + j], NULL,
1738
&bytes, 0, i, &usr_hdl_get[i]))) {
1739
ARMCI_Error("armci_nbgets failed\n", rc);
1745
for (j = start; j < end; j += NUMAGG) {
1746
bytes = sizeof(double) * NUMAGG;
1747
if ((rc = ARMCI_NbGet(&dsrc[i][j], &ddst_get[me][i*elems[1] + j], bytes,
1748
i, &usr_hdl_get[i]))) {
1749
ARMCI_Error("armci_nbget failed\n", rc);
1754
/* complete every aggregate get handle */
for (i = 0; i < nproc; i++) {
1755
ARMCI_Wait(&usr_hdl_get[i]);
1763
/* slot i of the local put buffer must hold process i's source pattern */
for (i = 0; i < nproc; i++) {
1764
for (j = 0; j < elems[1]; j++) {
1765
if (ARMCI_ABS(ddst_put[me][i*elems[1] + j] - j * 1.001 *(i + 1)) > 0.1) {
1766
ARMCI_Error("aggregate put failed...1", 0);
1772
printf(" aggregate put ..O.K.\n");
1776
/* slot i of the local get buffer must hold the data fetched from process i */
for (i = 0; i < nproc; i++) {
1777
for (j = 0; j < elems[1]; j++) {
1778
if (ARMCI_ABS(ddst_get[me][i*elems[1] + j] - j * 1.001 *(i + 1)) > 0.1) {
1779
ARMCI_Error("aggregate get failed...1", 0);
1785
printf(" aggregate get ..O.K.\n");
1796
destroy_array((void **)ddst_put);
1797
destroy_array((void **)ddst_get);
1798
destroy_array((void **)dsrc);
1801
/*
 * Test non-blocking put/get requests that use implicit handles.
 *
 * The vector and strided put/get requests pass NULL in place of a request
 * handle.  The local destination buffers are then compared, within a
 * tolerance of 0.1, against the pattern j * 1.001 * (i + 1) that each
 * process stores in its dsrc buffer.  Non-zero return codes and data
 * mismatches are reported through ARMCI_Error.
 */
void test_implicit()
1804
int i, j, k, rc, bytes, elems[2] = {MAXPROC, MAXELEMS};
1805
double *ddst_put[MAXPROC];
1806
double *ddst_get[MAXPROC];
1807
double *dsrc[MAXPROC];
1809
void *src_ptr[MAX_REQUESTS], *dst_ptr[MAX_REQUESTS];
1810
int start = 0, end = 0;
1811
armci_hdl_t usr_hdl[MAXPROC];
1813
/* destination buffers use both extents in elems, the source only elems[1] */
create_array((void **)ddst_put, sizeof(double), 2, elems);
1814
create_array((void **)ddst_get, sizeof(double), 2, elems);
1815
create_array((void **)dsrc, sizeof(double), 1, &elems[1]);
1817
/* fill the local source buffer with a rank-dependent pattern */
for (i = 0; i < elems[1]; i++) {
1818
dsrc[me][i] = i * 1.001 * (me + 1);
1820
/* clear the local destination buffers */
for (i = 0; i < elems[0]*elems[1]; i++) {
1821
ddst_put[me][i] = 0.0;
1822
ddst_get[me][i] = 0.0;
1826
for (i = 0; i < nproc; i++) {
1827
ARMCI_INIT_HANDLE(&usr_hdl[i]);
1830
/* put phase: send the local source data to every process */
for (i = 0; i < nproc; i++) {
1833
end = COUNT * NUMAGG;
1834
for (j = start; j < end; j++) {
1835
bytes = sizeof(double);
1836
ARMCI_NbPutValueDouble(dsrc[me][j], &ddst_put[i][me*elems[1] + j], i,
1841
end = start + COUNT * NUMAGG;
1842
for (j = start, k = 0; j < end; j += NUMAGG, k++) {
1843
src_ptr[k] = (void *)&dsrc[me][j];
1844
dst_ptr[k] = (void *)&ddst_put[i][me*elems[1] + j];
1846
darr.src_ptr_array = src_ptr;
1847
darr.dst_ptr_array = dst_ptr;
1848
darr.bytes = NUMAGG * sizeof(double);
1849
darr.ptr_array_len = k;
1850
if ((rc = ARMCI_NbPutV(&darr, 1, i, NULL))) {
1851
ARMCI_Error("armci_nbputv failed\n", rc);
1855
end = start + COUNT * NUMAGG;
1856
for (j = start; j < end; j += NUMAGG) {
1857
bytes = sizeof(double) * NUMAGG;
1858
if ((rc = ARMCI_NbPutS(&dsrc[me][j], NULL, &ddst_put[i][me*elems[1] + j], NULL,
1859
&bytes, 0, i, NULL))) {
1860
ARMCI_Error("armci_nbputs failed\n", rc);
1866
for (j = start; j < end; j += NUMAGG) {
1867
bytes = sizeof(double) * NUMAGG;
1868
if ((rc = ARMCI_NbPut(&dsrc[me][j], &ddst_put[i][me*elems[1] + j], bytes,
1870
ARMCI_Error("armci_nbput failed\n", rc);
1876
/* get phase: fetch the source data of every process */
for (i = 0; i < nproc; i++) {
1879
end = COUNT * NUMAGG;
1880
for (j = start, k = 0; j < end; j += NUMAGG, k++) {
1881
src_ptr[k] = (void *)&dsrc[i][j];
1882
dst_ptr[k] = (void *)&ddst_get[me][i*elems[1] + j];
1884
darr.src_ptr_array = src_ptr;
1885
darr.dst_ptr_array = dst_ptr;
1886
darr.bytes = NUMAGG * sizeof(double);
1887
darr.ptr_array_len = k;
1888
if ((rc = ARMCI_NbGetV(&darr, 1, i, NULL))) {
1889
ARMCI_Error("armci_nbgetv failed\n", rc);
1893
end = start + COUNT * NUMAGG;
1894
for (j = start; j < end; j += NUMAGG) {
1895
bytes = sizeof(double) * NUMAGG;
1896
if ((rc = ARMCI_NbGetS(&dsrc[i][j], NULL, &ddst_get[me][i*elems[1] + j], NULL,
1897
&bytes, 0, i, NULL))) {
1898
ARMCI_Error("armci_nbgets failed\n", rc);
1904
for (j = start; j < end; j += NUMAGG) {
1905
bytes = sizeof(double) * NUMAGG;
1906
if ((rc = ARMCI_NbGet(&dsrc[i][j], &ddst_get[me][i*elems[1] + j], bytes,
1908
ARMCI_Error("armci_nbget failed\n", rc);
1920
/* slot i of the local put buffer must hold process i's source pattern */
for (i = 0; i < nproc; i++) {
1921
for (j = 0; j < elems[1]; j++) {
1922
if (ARMCI_ABS(ddst_put[me][i*elems[1] + j] - j * 1.001 *(i + 1)) > 0.1) {
1923
ARMCI_Error("implicit handle(s) failed...(a)", 0);
1929
/* slot i of the local get buffer must hold the data fetched from process i */
for (i = 0; i < nproc; i++) {
1930
for (j = 0; j < elems[1]; j++) {
1931
if (ARMCI_ABS(ddst_get[me][i*elems[1] + j] - j * 1.001 *(i + 1)) > 0.1) {
1932
ARMCI_Error("implicit handles(s) failed...(b)", 0);
1945
destroy_array((void **)ddst_put);
1946
destroy_array((void **)ddst_get);
1947
destroy_array((void **)dsrc);
1951
int main(int argc, char *argv[])
1955
ARMCI_Init_args(&argc, &argv);
1956
nproc = armci_msg_nproc();
1957
me = armci_msg_me();
1959
/* printf("nproc = %d, me = %d\n", nproc, me);*/
1961
if (nproc > MAXPROC && me == 0) {
1962
ARMCI_Error("Test works for up to %d processors\n", MAXPROC);
1966
printf("ARMCI test program (%d processes)\n", nproc);
1972
if(me==1)armci_die("process 1 committing suicide",1);
1975
printf("\nTesting strided gets and puts\n");
1976
printf("(Only std output for process 0 is printed)\n\n");
1980
for (ndim = 1; ndim <= MAXDIMS; ndim++) {
1987
printf("\nTesting non-blocking gets and puts\n");
1996
printf("\nTesting non-blocking vector gets and puts\n");
2005
printf("\nTesting atomic accumulate\n");
2009
for (ndim = 1; ndim <= MAXDIMS; ndim++) {
2016
printf("\nTesting Vector Interface using triangular patches of a 2-D array\n\n");
2026
printf("\nTesting Accumulate with Vector Interface\n\n");
2036
printf("\nTesting atomic fetch&add\n");
2037
printf("(Std Output for all processes is printed)\n\n");
2049
printf("\nTesting atomic swap\n");
2057
printf("\nTesting register-originated put and get\n");
2066
printf("\nTesting aggregate put/get requests\n");
2071
* Aggregate put/get requests cannot be tested for number of procs
2072
* greater than 32. (Current implementation of aggregate put/get
2073
* can use at the maximum of 32 handles (defined by macro
2074
* _MAX_AGG_BUFFERS in aggregate.c). This test case is written in
2075
* such a way that each process puts/gets data to all the other
2076
* processes, thus the number of aggregate handle used is equal to
2077
* the number of processes created.
2081
printf("\n WARNING: Aggregate put/get requests cannot be tested for number of procs greater than 32.\n\n");
2093
printf("\nTesting implicit handles\n");
2107
printf("All tests passed\n");
2115
for (i = 0; i < g_idx; i++)
2116
for (j = 0; j < nproc; j++) {
2117
ARMCI_Memdt(&meminfo[i][j], 0);
2119
for (i = 0; i < g_idx; i++) {
2120
ARMCI_Memctl(&meminfo[i][me]);
2127
armci_msg_finalize();