5
/* $Id: test.c,v 1.43.6.6 2007-08-30 22:59:27 manoj Exp $ */
21
/* Solaris has shared memory shortages in the default system configuration */
25
#elif defined(__alpha__)
39
/* Per-dimension extents derived from DIM1..DIM7 by adding OFF; these are the
 * values used to initialize dimsB, while dimsA uses the plain DIMn values.
 * NOTE(review): presumably OFF > 0 so the second array is padded relative to
 * the first -- confirm against the definition of OFF. */
#define EDIM1 (DIM1+OFF)
40
#define EDIM2 (DIM2+OFF)
41
#define EDIM3 (DIM3+OFF)
42
#define EDIM4 (DIM4+OFF)
43
#define EDIM5 (DIM5+OFF)
44
#define EDIM6 (DIM6+OFF)
45
#define EDIM7 (DIM7+OFF)
49
/* Sizes the dsc/psrc/pdst arrays of the triangular-patch vector test and is
 * the extent that test assigns to every dimsA entry (dimsB gets one more). */
#define MAX_DIM_VAL 50
63
/***************************** macros ************************/
64
/* Copy `bytes` bytes from `src` to `dst`. Note the source-first argument
 * order, the reverse of memcpy(); the regions must not overlap. */
#define COPY(src, dst, bytes) memcpy((dst), (src), (bytes))
65
/* Larger of two values (the first one on ties). Each argument may be
 * evaluated twice, so do not pass expressions with side effects. */
#define COMEX_MAX(x, y) ((x) >= (y) ? (x) : (y))
66
/* Smaller of two values (the first one on ties). Each argument may be
 * evaluated twice, so do not pass expressions with side effects. */
#define COMEX_MIN(x, y) ((x) <= (y) ? (x) : (y))
67
/* Absolute value for any arithmetic type. The argument is evaluated twice,
 * so do not pass expressions with side effects. */
#define COMEX_ABS(x) ((x) < 0 ? -(x) : (x))
69
/***************************** global data *******************/
71
/* Shared scratch array with one slot per possible process; test_acc and
 * test_vector_acc alias it as their process list. */
int work[MAXPROC]; /* work array for propagating addresses */
73
static void all_sum_int(int *x, int n)
75
MPI_Comm comm = MPI_COMM_NULL;
76
MPI_Datatype mpi_type = MPI_INT;
79
MPI_Op mpi_op = MPI_SUM;
81
comex_group_comm(COMEX_GROUP_WORLD, &comm);
83
result = malloc(n*sizeof(int));
86
comex_barrier(COMEX_GROUP_WORLD);
87
rc = MPI_Allreduce(x, result, n, mpi_type, mpi_op, comm);
88
assert(rc == MPI_SUCCESS);
90
memcpy(x, result, sizeof(int) * n);
95
static void all_sum_long(long *x, int n)
97
MPI_Comm comm = MPI_COMM_NULL;
98
MPI_Datatype mpi_type = MPI_LONG;
101
MPI_Op mpi_op = MPI_SUM;
103
comex_group_comm(COMEX_GROUP_WORLD, &comm);
105
result = malloc(n*sizeof(long));
108
comex_barrier(COMEX_GROUP_WORLD);
109
rc = MPI_Allreduce(x, result, n, mpi_type, mpi_op, comm);
110
assert(rc == MPI_SUCCESS);
112
memcpy(x, result, sizeof(long) * n);
117
static double timer()
120
gettimeofday(&tv, NULL);
121
return tv.tv_sec * 1000000.0 + tv.tv_usec;
126
void pvm_init(int argc, char *argv[])
128
int mytid, mygid, ctid[MAXPROC];
132
if ((argc != 2) && (argc != 1)) {
139
if ((np = atoi(argv[1])) < 1) {
147
mygid = pvm_joingroup(MPGROUP);
151
i = pvm_spawn(argv[0], argv + 1, 0, "", np - 1, ctid);
155
while (pvm_gsize(MPGROUP) < np) {
160
pvm_barrier(MPGROUP, np);
162
printf("PVM initialization done!\n");
167
fprintf(stderr, "usage: %s <nproc>\n", argv[0]);
173
/*\ generate random range for a section of multidimensional array
175
void get_range(int ndim, int dims[], int lo[], int hi[])
178
for (dim = 0; dim < ndim; dim++) {
180
toss1 = rand() % dims[dim];
181
toss2 = rand() % dims[dim];
195
/*\ generates a new random range similar to the input range for an array with specified dimensions
197
void new_range(int ndim, int dims[], int lo[], int hi[], int new_lo[], int new_hi[])
200
for (dim = 0; dim < ndim; dim++) {
202
int diff = hi[dim] - lo[dim] + 1;
203
assert(diff <= dims[dim]);
204
range = dims[dim] - diff;
205
toss = (range > 0) ? rand() % range : lo[dim];
207
new_hi[dim] = toss + diff - 1;
208
assert(new_hi[dim] < dims[dim]);
209
assert(diff == (new_hi[dim] - new_lo[dim] + 1));
217
/*\ print range of ndim dimensional array with two strings before and after
219
void print_range(char *pre, int ndim, int lo[], int hi[], char *post)
224
for (i = 0; i < ndim; i++) {
225
printf("%d:%d", lo[i], hi[i]);
227
printf("] %s", post);
235
/*\ print subscript of ndim dimensional array with two strings before and after
237
void print_subscript(char *pre, int ndim, int subscript[], char *post)
242
for (i = 0; i < ndim; i++) {
243
printf("%d", subscript[i]);
245
printf("] %s", post);
254
/*\ print a section of a 2-D array of doubles
256
void print_2D_double(double *a, int ld, int *lo, int *hi)
259
for (i = lo[0]; i <= hi[0]; i++) {
260
for (j = lo[1]; j <= hi[1]; j++) {
261
printf("%13f ", a[ld*j+i]);
268
/*\ initialize array: a[i,j,k,..]=i+100*j+10000*k+ ...
270
void init(double *a, int ndim, int elems, int dims[])
275
for (i = 0; i < elems; i++) {
279
for (dim = 0; dim < ndim; dim++) {
280
idx[dim] = Index % dims[dim];
286
for (dim = 0; dim < ndim; dim++) {
287
val += field * idx[dim];
291
/* printf("(%d,%d,%d)=%6.0f",idx[0],idx[1],idx[2],val); */
296
/*\ compute Index from subscript
297
* assume that first subscript component changes first
299
int Index(int ndim, int subscript[], int dims[])
301
int idx = 0, i, factor = 1;
302
for (i = 0; i < ndim; i++) {
303
idx += subscript[i] * factor;
310
void update_subscript(int ndim, int subscript[], int lo[], int hi[])
313
for (i = 0; i < ndim; i++) {
314
if (subscript[i] < hi[i]) {
318
subscript[i] = lo[i];
324
void compare_patches(double eps, int ndim, double *patch1, int lo1[], int hi1[],
325
int dims1[], double *patch2, int lo2[], int hi2[],
330
int subscr1[MAXDIMS], subscr2[MAXDIMS];
332
int offset1, offset2;
334
for (i = 0; i < ndim; i++) { /* count # of elements & verify consistency of both patches */
335
int diff = hi1[i] - lo1[i];
336
assert(diff == (hi2[i] - lo2[i]));
337
assert(diff < dims1[i]);
338
assert(diff < dims2[i]);
345
/* compare element values in both patches */
346
offset1 = Index(ndim, subscr1, dims1);
347
offset2 = Index(ndim, subscr2, dims2);
348
for (j = 0; j < elems; j++) {
351
idx1 = Index(ndim, subscr1, dims1); /* calculate element Index from a subscript */
352
idx2 = Index(ndim, subscr2, dims2);
358
diff = patch1[idx1] - patch2[idx2];
359
max = COMEX_MAX(COMEX_ABS(patch1[idx1]), COMEX_ABS(patch2[idx2]));
360
if (max == 0. || max < eps) {
364
if (eps < COMEX_ABS(diff) / max) {
366
sprintf(msg, "(proc=%d):%f", me, patch1[idx1]);
367
print_subscript("ERROR: a", ndim, subscr1, msg);
368
sprintf(msg, "%f\n", patch2[idx2]);
369
print_subscript(" b", ndim, subscr2, msg);
372
comex_error("Bailing out", 0);
375
{ /* update subscript for the patches */
376
update_subscript(ndim, subscr1, lo1, hi1);
377
update_subscript(ndim, subscr2, lo2, hi2);
383
/* make sure we reached upper limit */
384
/*for(i=0;i<ndim;i++){
385
assert(subscr1[i]==hi1[i]);
386
assert(subscr2[i]==hi2[i]);
391
void scale_patch(double alpha, int ndim, double *patch1, int lo1[], int hi1[], int dims1[])
394
int subscr1[MAXDIMS];
397
for (i = 0; i < ndim; i++) { /* count # of elements in patch */
398
int diff = hi1[i] - lo1[i];
399
assert(diff < dims1[i]);
404
/* scale element values in both patches */
405
offset1 = Index(ndim, subscr1, dims1);
406
for (j = 0; j < elems; j++) {
408
idx1 = Index(ndim, subscr1, dims1); /* calculate element Index from a subscript */
410
patch1[idx1] *= alpha;
411
update_subscript(ndim, subscr1, lo1, hi1);
417
void create_array(void *a[], int elem_size, int ndim, int dims[])
419
int bytes = elem_size, i, rc;
421
assert(ndim <= MAXDIMS);
422
for (i = 0; i < ndim; i++) {
425
rc = comex_malloc(a, bytes, COMEX_GROUP_WORLD);
431
void destroy_array(void *ptr[])
433
comex_barrier(COMEX_GROUP_WORLD);
435
assert(!comex_free(ptr[me], COMEX_GROUP_WORLD));
440
int loA[MAXDIMS], hiA[MAXDIMS];
441
int dimsA[MAXDIMS] = {DIM1, DIM2, DIM3, DIM4, DIM5, DIM6, DIM7};
442
int loB[MAXDIMS], hiB[MAXDIMS];
443
int dimsB[MAXDIMS] = {EDIM1, EDIM2, EDIM3, EDIM4, EDIM5, EDIM6, EDIM7};
445
int strideA[MAXDIMS], strideB[MAXDIMS];
446
int loC[MAXDIMS], hiC[MAXDIMS];
447
int idx[MAXDIMS] = {0, 0, 0, 0, 0, 0, 0};
450
void test_dim(int ndim)
454
/* double a[DIM4][DIM3][DIM2][DIM1], b[EDIM4][EDIM3][EDIM2][EDIM1];*/
459
strideA[0] = sizeof(double);
460
strideB[0] = sizeof(double);
461
for (i = 0; i < ndim; i++) {
462
strideA[i] *= dimsA[i];
463
strideB[i] *= dimsB[i];
465
strideA[i+1] = strideA[i];
466
strideB[i+1] = strideB[i];
471
/* create shared and local arrays */
472
create_array(b, sizeof(double), ndim, dimsB);
473
a = malloc(sizeof(double) * elems);
475
c = malloc(sizeof(double) * elems);
478
init(a, ndim, elems, dimsA);
481
printf("--------array[%d", dimsA[0]);
482
for (dim = 1; dim < ndim; dim++) {
483
printf(",%d", dimsA[dim]);
485
printf("]--------\n");
489
comex_fence_all(COMEX_GROUP_WORLD);
490
comex_barrier(COMEX_GROUP_WORLD);
491
for (i = 0; i < LOOP; i++) {
492
int idx1, idx2, idx3;
493
get_range(ndim, dimsA, loA, hiA);
494
new_range(ndim, dimsB, loA, hiA, loB, hiB);
495
new_range(ndim, dimsA, loA, hiA, loC, hiC);
497
proc = nproc - 1 - me;
500
print_range("local", ndim, loA, hiA, "-> ");
501
print_range("remote", ndim, loB, hiB, "-> ");
502
print_range("local", ndim, loC, hiC, "\n");
505
idx1 = Index(ndim, loA, dimsA);
506
idx2 = Index(ndim, loB, dimsB);
507
idx3 = Index(ndim, loC, dimsA);
509
for (j = 0; j < ndim; j++) {
510
count[j] = hiA[j] - loA[j] + 1;
513
count[0] *= sizeof(double); /* convert range to bytes at stride level zero */
515
(void)comex_puts((double *)a + idx1, strideA, (double *)b[proc] + idx2, strideB, count, ndim - 1, proc, COMEX_GROUP_WORLD);
519
/* printf("%d: a=(%x,%f) b=(%x,%f)\n",me,idx1 + (double*)a,*(idx1 + (double*)a),idx2 + (double*)b,*(idx2 + (double*)b));*/
523
/* note that we do not need comex_fence here since
524
* consecutive operations targeting the same process are ordered */
525
(void)comex_gets((double *)b[proc] + idx2, strideB, (double *)c + idx3, strideA, count, ndim - 1, proc, COMEX_GROUP_WORLD);
527
compare_patches(0., ndim, (double *)a + idx1, loA, hiA, dimsA, (double *)c + idx3, loC, hiC, dimsA);
537
int nloA[MAXDIMS+1][MAXDIMS], nhiA[MAXDIMS+1][MAXDIMS];
538
int nloB[MAXDIMS+1][MAXDIMS], nhiB[MAXDIMS+1][MAXDIMS];
539
int nloC[MAXDIMS+1][MAXDIMS], nhiC[MAXDIMS+1][MAXDIMS];
541
int get_next_RRproc(int initialize, int ndim)
546
distance = nproc / 2;
547
if ((nproc % 2) != 0) {
555
/* send it to a different process every time */
556
proc = (me <= ((nproc % 2 == 0) ? ((nproc / 2) - 1) : (nproc / 2))) ? (me + distance) : (me - distance);
557
if ((nproc % 2) != 0 && me == (nproc / 2)) {
561
if (me < (nproc / 2)) {
563
if ((me + distance) >= nproc) {
564
distance = nproc / 2;
565
if ((nproc % 2) != 0) {
573
if ((me - distance) >= (nproc / 2)) {
574
distance = nproc / 2;
575
if ((nproc % 2) != 0) {
578
distance = distance + (me - distance);
581
if (ndim != 1 && MAXDIMS > nproc && (ndim % (nproc / 2) == 0)) {
582
distance = nproc / 2;
583
if ((nproc % 2) != 0) {
593
int elems = 1, elems1 = 1;
594
int i, j, proc, ndim, rc;
595
void *b[MAXDIMS+1][MAXPROC];
596
void *a[MAXDIMS+1], *c[MAXDIMS+1];
597
comex_request_t hdl_put[MAXDIMS+1], hdl_get[MAXDIMS+1];
598
int idx1 = 0, idx2 = 0, idx3 = 0;
599
/* create shared and local arrays */
600
for (ndim = 1; ndim <= MAXDIMS; ndim++) {
601
elems1 *= dimsB[ndim-1];
602
elems *= dimsA[ndim-1];
603
rc = comex_malloc(b[ndim], sizeof(double) * elems1, COMEX_GROUP_WORLD);
606
a[ndim] = malloc(sizeof(double) * elems);
608
c[ndim] = malloc(sizeof(double) * elems);
610
init(a[ndim], ndim, elems, dimsA);
612
comex_fence_all(COMEX_GROUP_WORLD);
613
comex_barrier(COMEX_GROUP_WORLD);
615
(void)get_next_RRproc(1, 0);
616
for (ndim = 1; ndim <= MAXDIMS; ndim++) {
617
strideA[0] = sizeof(double);
618
strideB[0] = sizeof(double);
619
for (i = 0; i < ndim; i++) {
620
strideA[i] *= dimsA[i];
621
strideB[i] *= dimsB[i];
623
strideA[i+1] = strideA[i];
624
strideB[i+1] = strideB[i];
627
proc = get_next_RRproc(0, ndim);
628
get_range(ndim, dimsA, nloA[ndim], nhiA[ndim]);
629
new_range(ndim, dimsB, nloA[ndim], nhiA[ndim], nloB[ndim],
631
new_range(ndim, dimsA, nloA[ndim], nhiA[ndim], nloC[ndim],
634
print_range("local", ndim, nloA[ndim], nhiA[ndim], "-> ");
635
print_range("remote", ndim, nloB[ndim], nhiB[ndim], "-> ");
636
print_range("local", ndim, nloC[ndim], nhiC[ndim], "\n");
641
idx1 = Index(ndim, nloA[ndim], dimsA);
642
idx2 = Index(ndim, nloB[ndim], dimsB);
643
idx3 = Index(ndim, nloC[ndim], dimsA);
644
for (j = 0; j < ndim; j++) {
645
count[j] = nhiA[ndim][j] - nloA[ndim][j] + 1;
647
count[0] *= sizeof(double);
650
(void)comex_nbput((double *)a[ndim] + idx1, (double *)b[ndim][proc] + idx2,
651
count[0], proc, COMEX_GROUP_WORLD, (hdl_put + ndim));
654
(void)comex_nbputs((double *)a[ndim] + idx1, strideA,
655
(double *)b[ndim][proc] + idx2,
656
strideB, count, ndim - 1, proc, COMEX_GROUP_WORLD, (hdl_put + ndim));
660
comex_barrier(COMEX_GROUP_WORLD);
661
/*before we do gets, we have to make sure puts are complete
662
on the remote processor*/
663
for (ndim = 1; ndim <= MAXDIMS; ndim++) {
664
comex_wait(hdl_put + ndim);
666
comex_barrier(COMEX_GROUP_WORLD);
667
comex_fence_all(COMEX_GROUP_WORLD);
669
(void)get_next_RRproc(1, 0);
671
for (ndim = 1; ndim <= MAXDIMS; ndim++) {
672
strideA[0] = sizeof(double);
673
strideB[0] = sizeof(double);
674
for (i = 0; i < ndim; i++) {
675
strideA[i] *= dimsA[i];
676
strideB[i] *= dimsB[i];
678
strideA[i+1] = strideA[i];
679
strideB[i+1] = strideB[i];
682
/* send it to a different process every time */
683
proc = get_next_RRproc(0, ndim);
685
idx1 = Index(ndim, nloA[ndim], dimsA);
686
idx2 = Index(ndim, nloB[ndim], dimsB);
687
idx3 = Index(ndim, nloC[ndim], dimsA);
688
for (j = 0; j < ndim; j++) {
689
count[j] = nhiA[ndim][j] - nloA[ndim][j] + 1;
691
count[0] *= sizeof(double);
693
(void)comex_nbget((double *)b[ndim][proc] + idx2, (double *)c[ndim] + idx3,
694
count[0], proc, COMEX_GROUP_WORLD, (hdl_get + ndim));
697
(void)comex_nbgets((double *)b[ndim][proc] + idx2, strideB,
698
(double *)c[ndim] + idx3,
699
strideA, count, ndim - 1, proc, COMEX_GROUP_WORLD, (hdl_get + ndim));
703
comex_barrier(COMEX_GROUP_WORLD);
705
printf("Now waiting for all non-blocking calls and verifying data...\n");
708
for (ndim = 1; ndim <= MAXDIMS; ndim++) {
709
comex_wait(hdl_get + ndim);
710
idx1 = Index(ndim, nloA[ndim], dimsA);
711
idx2 = Index(ndim, nloB[ndim], dimsB);
712
idx3 = Index(ndim, nloC[ndim], dimsA);
713
compare_patches(0., ndim, (double *)a[ndim] + idx1, nloA[ndim], nhiA[ndim],
714
dimsA, (double *)c[ndim] + idx3, nloC[ndim], nhiC[ndim], dimsA);
721
for (ndim = 1; ndim <= MAXDIMS; ndim++) {
722
destroy_array(b[ndim]);
728
/* Sizing of the data used by the non-blocking vector (giov) put/get test. */
#define PTR_ARR_LEN 10 /* pointers per giov descriptor (stored in dsc[j].count) */
730
#define VEC_ELE_LEN 20 /* number of doubles in the segment each pointer refers to */
731
#define GIOV_ARR_LEN 9 /* giov descriptors passed to each vector put/get call */
733
void verify_vector_data(double *data, int procs, int isput, int datalen)
736
int i, j = 0, k = 0, kc = 0, dst = 0;
740
for (i = 0; i < datalen; i++) {
742
if (COMEX_ABS((data[i] - (me + facto + dst)*((kc + 1)*(j % PTR_ARR_LEN + 1)))) > 0.001) {
743
printf("\n%d:while verifying data of a op from proc=%d ", me, dst);
744
printf("giov index=%d ptr_arr_index=%d \n :element index=%d", kc,
745
(j % PTR_ARR_LEN), k);
746
printf(" elem was supposed to be %f but is %f",
747
(me + facto + dst)*((kc + 1)*(j % PTR_ARR_LEN + 1)) , data[i]);
750
comex_error("vector non-blocking failed", 0);
753
if (k == VEC_ELE_LEN) {
756
if (j % PTR_ARR_LEN == 0) {
758
if ((kc % GIOV_ARR_LEN) == 0) {
767
void test_vec_small()
771
comex_giov_t dsc[MAXPROC*GIOV_ARR_LEN];
772
void **psrc; /*arrays of pointers to be used by giov_t*/
774
void *getsrc[MAXPROC]; /*to allocate mem via comex_malloc*/
775
void *putdst[MAXPROC]; /*to allocate mem via comex_malloc*/
776
comex_request_t hdl_put[MAXPROC], hdl_get[MAXPROC];
777
int i = 0, j = 0, k = 0, kc = 0, kcold = 0, rc, dstproc, dst = 0;
780
lenpergiov = PTR_ARR_LEN * VEC_ELE_LEN;
781
rc = comex_malloc(getsrc, sizeof(double) * nproc * GIOV_ARR_LEN * lenpergiov, COMEX_GROUP_WORLD);
784
rc = comex_malloc(putdst, sizeof(double) * nproc * GIOV_ARR_LEN * lenpergiov, COMEX_GROUP_WORLD);
788
/*first malloc for getdst and putsrc, both are 2d arrays*/
789
getdst = (double *)malloc(sizeof(double) * nproc * GIOV_ARR_LEN * lenpergiov);
790
putsrc = (double **)malloc(sizeof(double *) * nproc * GIOV_ARR_LEN * PTR_ARR_LEN);
793
for (i = 0; i < nproc * GIOV_ARR_LEN * PTR_ARR_LEN; i++) {
794
putsrc[i] = (double *)malloc(sizeof(double) * VEC_ELE_LEN);
797
/*allocating memory for psrc and pdst*/
798
psrc = (void **)malloc(sizeof(void *) * PTR_ARR_LEN * nproc * GIOV_ARR_LEN);
799
pdst = (void **)malloc(sizeof(void *) * PTR_ARR_LEN * nproc * GIOV_ARR_LEN);
803
for (i = 0; i < nproc * lenpergiov * GIOV_ARR_LEN; i++) {
804
putsrc[j][k] = (me + 1.89 + dst) * ((kc + 1) * ((j % PTR_ARR_LEN) + 1));
805
((double *)getsrc[me])[i] = (me + 2.89 + dst) * ((kc + 1) * (j % PTR_ARR_LEN + 1));
807
if (k == VEC_ELE_LEN) {
810
if ((j % PTR_ARR_LEN) == 0) {
812
if ((kc % GIOV_ARR_LEN) == 0) {
819
/*********************Testing NbPutV*********************************/
825
for (i = 0; i < nproc - 1; i++) {
827
if (dstproc == nproc) {
830
for (j = 0; j < GIOV_ARR_LEN; j++) {
832
for (k = 0; k < PTR_ARR_LEN; k++, kc++) {
834
psrc[kc] = (void *)putsrc[PTR_ARR_LEN*(dstproc*GIOV_ARR_LEN+j)+k];
835
ptr = (double *)putdst[dstproc];
836
pdst[kc] = (void *)(ptr + lenpergiov * (GIOV_ARR_LEN * me + j) + k * VEC_ELE_LEN);
838
dsc[j].bytes = VEC_ELE_LEN * sizeof(double);
839
dsc[j].src = &psrc[kcold];
840
dsc[j].dst = &pdst[kcold];
841
dsc[j].count = PTR_ARR_LEN;
843
if ((rc = comex_nbputv(dsc, GIOV_ARR_LEN, dstproc, COMEX_GROUP_WORLD, hdl_put + dstproc))) {
844
comex_error("putv failed", rc);
848
printf("\n\tNow veryfying the vector put data for correctness");
850
for (i = 0; i < nproc; i++)if (i != me) {
851
comex_wait(hdl_put + i);
854
comex_barrier(COMEX_GROUP_WORLD);
855
comex_fence_all(COMEX_GROUP_WORLD);
856
verify_vector_data((double *)putdst[me], nproc, 1, nproc * GIOV_ARR_LEN * lenpergiov);
858
printf("\n\tPuts OK\n");
860
/****************Done Testing NbPutV*********************************/
862
/*********************Testing NbGetV*********************************/
868
for (i = 0; i < nproc - 1; i++) {
870
if (dstproc == nproc) {
873
for (j = 0; j < GIOV_ARR_LEN; j++) {
875
for (k = 0; k < PTR_ARR_LEN; k++, kc++) {
878
pdst[kc] = (void *)(ptr + lenpergiov * (dstproc * GIOV_ARR_LEN + j) + k * VEC_ELE_LEN);
879
ptr = (double *)(getsrc[dstproc]);
880
psrc[kc] = (void *)(ptr + lenpergiov * (me * GIOV_ARR_LEN + j) + k * VEC_ELE_LEN);
882
dsc[j].bytes = VEC_ELE_LEN * sizeof(double);
883
dsc[j].src = &psrc[kcold];
884
dsc[j].dst = &pdst[kcold];
885
dsc[j].count = PTR_ARR_LEN;
887
if ((rc = comex_nbgetv(dsc, GIOV_ARR_LEN, dstproc, COMEX_GROUP_WORLD, hdl_get + dstproc))) {
888
comex_error("putv failed", rc);
892
printf("\n\tNow veryfying the vector get data for correctness");
894
for (i = 0; i < nproc; i++)if (i != me) {
895
comex_wait(hdl_get + i);
898
comex_barrier(COMEX_GROUP_WORLD);
899
verify_vector_data((double *)getdst, nproc, 0, nproc * GIOV_ARR_LEN * lenpergiov);
901
printf("\n\tGets OK\n");
903
/****************Done Testing NbGetV*********************************/
907
for (i = 0; i < nproc * GIOV_ARR_LEN * PTR_ARR_LEN; i++) {
911
comex_free(getsrc[me], COMEX_GROUP_WORLD);
912
comex_free(putdst[me], COMEX_GROUP_WORLD);
917
void GetPermutedProcList(int *ProcList)
921
if (nproc > MAXPROC) {
922
comex_error("permute_proc: nproc to big ", nproc);
925
/* initialize list */
926
for (i = 0; i < nproc; i++) {
933
/* every process generates different random sequence */
934
(void)srand((unsigned)me);
936
/* list permutation generated by random swapping */
937
for (i = 0; i < nproc; i++) {
938
iswap = (int)(rand() % nproc);
939
temp = ProcList[iswap];
940
ProcList[iswap] = ProcList[i];
947
/*\ Atomic Accumulate test: remote += alpha*local
948
* Every process/or has its patch of array b updated TIMES*NPROC times.
949
* The sequence of updates is random: everybody uses a randomly permuted list
950
* and accumulate is non-collective (of-course)
952
void test_acc(int ndim)
958
double alpha = 0.1, scale;
960
int *proclist = work;
963
strideA[0] = sizeof(double);
964
strideB[0] = sizeof(double);
965
for (i = 0; i < ndim; i++) {
966
strideA[i] *= dimsA[i];
967
strideB[i] *= dimsB[i];
969
strideA[i+1] = strideA[i];
970
strideB[i+1] = strideB[i];
974
/* set up patch coordinates: same on every processor */
977
loB[i] = dimsB[i] - 2;
979
count[i] = hiA[i] - loA[i] + 1;
982
/* create shared and local arrays */
983
create_array(b, sizeof(double), ndim, dimsB);
984
a = malloc(sizeof(double) * elems);
986
c = malloc(sizeof(double) * elems);
989
init(a, ndim, elems, dimsA);
992
printf("--------array[%d", dimsA[0]);
993
for (dim = 1; dim < ndim; dim++) {
994
printf(",%d", dimsA[dim]);
996
printf("]--------\n");
999
GetPermutedProcList(proclist);
1001
idx1 = Index(ndim, loA, dimsA);
1002
idx2 = Index(ndim, loB, dimsB);
1003
count[0] *= sizeof(double); /* convert range to bytes at stride level zero */
1005
/* initialize all elements of array b to zero */
1007
for (i = 0; i < ndim; i++) {
1010
for (i = 0; i < elems; i++) {
1011
((double *)b[me])[i] = 0.;
1017
print_range("patch", ndim, loA, hiA, " -> ");
1018
print_range("patch", ndim, loB, hiB, "\n");
1022
comex_fence_all(COMEX_GROUP_WORLD);
1023
comex_barrier(COMEX_GROUP_WORLD);
1024
for (i = 0; i < TIMES * nproc; i++) {
1025
proc = proclist[i%nproc];
1026
(void)comex_accs(COMEX_ACC_DBL, &alpha, (double *)a + idx1, strideA,
1027
(double *)b[proc] + idx2, strideB, count, ndim - 1, proc, COMEX_GROUP_WORLD);
1031
comex_fence_all(COMEX_GROUP_WORLD);
1032
comex_barrier(COMEX_GROUP_WORLD);
1034
/* copy my patch into local array c */
1035
(void)comex_gets((double *)b[me] + idx2, strideB, (double *)c + idx1, strideA, count, ndim - 1, me, COMEX_GROUP_WORLD);
1037
scale = alpha * TIMES * nproc;
1039
scale_patch(scale, ndim, (double *)a + idx1, loA, hiA, dimsA);
1041
compare_patches(.0001, ndim, (double *)a + idx1, loA, hiA, dimsA, (double *)c + idx1, loA, hiA, dimsA);
1042
comex_barrier(COMEX_GROUP_WORLD);
1055
/*************************** vector interface *********************************\
1056
* tests vector interface for transfers of triangular sections of a 2-D array *
1057
******************************************************************************/
1060
int dim, elems, ndim, cols, rows, mrc;
1066
comex_giov_t dsc[MAX_DIM_VAL];
1067
void *psrc[MAX_DIM_VAL];
1068
void *pdst[MAX_DIM_VAL];
1072
for (i = 0; i < ndim; i++) {
1073
dimsA[i] = MAX_DIM_VAL;
1074
dimsB[i] = MAX_DIM_VAL + 1;
1078
/* create shared and local arrays */
1079
create_array(b, sizeof(double), ndim, dimsB);
1080
a = malloc(sizeof(double) * elems);
1082
c = malloc(sizeof(double) * elems);
1085
init(a, ndim, elems, dimsA);
1088
printf("--------array[%d", dimsA[0]);
1089
for (dim = 1; dim < ndim; dim++) {
1090
printf(",%d", dimsA[dim]);
1092
printf("]--------\n");
1096
for (loop = 0; loop < LOOP; loop++) {
1097
get_range(ndim, dimsA, loA, hiA);
1098
new_range(ndim, dimsB, loA, hiA, loB, hiB);
1099
new_range(ndim, dimsA, loA, hiA, loC, hiC);
1101
proc = nproc - 1 - me;
1104
print_range("local", ndim, loA, hiA, "-> ");
1105
print_range("remote", ndim, loB, hiB, "-> ");
1106
print_range("local", ndim, loC, hiC, "\n");
1109
/* printf("array at source\n");*/
1110
/* print_2D_double((double *)a, dimsA[0], loA, hiA);*/
1112
cols = hiA[1] - loA[1] + 1;
1113
rows = hiA[0] - loA[0] + 1;
1114
mrc = COMEX_MIN(cols, rows);
1116
/* generate a data descriptor for a lower-triangular patch */
1117
for (i = 0; i < mrc; i++) {
1123
idx = Index(ndim, ij, dimsA);
1124
psrc[i] = (double *)a + idx;
1128
idx = Index(ndim, ij, dimsB);
1129
pdst[i] = (double *)b[proc] + idx;
1131
dsc[i].bytes = (rows - i) * sizeof(double);
1132
dsc[i].src = &psrc[i];
1133
dsc[i].dst = &pdst[i];
1135
/* assume each element different in size (not true in rectangular patches) */
1139
if ((rc = comex_putv(dsc, mrc, proc, COMEX_GROUP_WORLD))) {
1140
comex_error("putv failed ", rc);
1143
/* printf("array at destination\n");*/
1144
/* print_2D_double((double *)b[proc], dimsB[0], loB, hiB);*/
1146
/* generate a data descriptor for the upper-triangular patch */
1147
/* there is one less element since diagonal is excluded */
1148
for (i = 1; i < cols; i++) {
1153
psrc[i-1] = (double *)a + Index(ndim, ij, dimsA);
1157
pdst[i-1] = (double *)b[proc] + Index(ndim, ij, dimsB);
1159
mrc = COMEX_MIN(i, rows);
1160
dsc[i-1].bytes = mrc * sizeof(double);
1161
dsc[i-1].src = &psrc[i-1];
1162
dsc[i-1].dst = &pdst[i-1];
1164
/* assume each element different in size (not true in rectangular patches) */
1168
if ((cols - 1))if ((rc = comex_putv(dsc, cols - 1, proc, COMEX_GROUP_WORLD))) {
1169
comex_error("putv(2) failed ", rc);
1172
/* we get back entire rectangular patch */
1173
for (i = 0; i < cols; i++) {
1177
psrc[i] = (double *)b[proc] + Index(ndim, ij, dimsB);
1181
pdst[i] = (double *)c + Index(ndim, ij, dimsA);
1184
dsc[0].bytes = rows * sizeof(double);
1187
dsc[0].count = cols;
1189
/* note that we do not need comex_fence here since
1190
* consecutive operations targeting the same process are ordered */
1191
if ((rc = comex_getv(dsc, 1, proc, COMEX_GROUP_WORLD))) {
1192
comex_error("getv failed ", rc);
1195
idx1 = Index(ndim, loA, dimsA);
1196
idx3 = Index(ndim, loC, dimsA);
1197
compare_patches(0., ndim, (double *)a + idx1, loA, hiA, dimsA, (double *)c + idx3, loC, hiC, dimsA);
1207
/*\ Atomic Accumulate test for vector API: remote += alpha*local
1208
* Every process/or has its patch of array b updated TIMES*NPROC times.
1209
* The sequence of updates is random: everybody uses a randomly permuted list
1210
* and accumulate is non-collective (of-course)
1212
void test_vector_acc()
1214
int dim, elems, bytes;
1215
int i, j, proc, rc, one = 1;
1217
void *psrc[ELEMS/2], *pdst[ELEMS/2];
1219
double alpha = 0.1, scale;
1220
int *proclist = work;
1225
bytes = sizeof(double) * elems;
1227
/* create shared and local arrays */
1228
create_array(b, sizeof(double), dim, &elems);
1234
init(a, dim, elems, &elems);
1237
printf("--------array[%d", elems);
1238
printf("]--------\n");
1242
GetPermutedProcList(proclist);
1244
/* initialize all elements of array b to zero */
1245
for (i = 0; i < elems; i++) {
1246
((double *)b[me])[i] = 0.;
1251
dsc.bytes = sizeof(double);
1254
dsc.count = elems / 2;
1257
comex_barrier(COMEX_GROUP_WORLD);
1258
for (i = 0; i < TIMES * nproc; i++) {
1260
/* proc=proclist[i%nproc];*/
1263
/* accumulate even numbered elements */
1264
for (j = 0; j < elems / 2; j++) {
1265
psrc[j] = 2 * j + (double *)a;
1266
pdst[j] = 2 * j + (double *)b[proc];
1268
if ((rc = comex_accv(COMEX_ACC_DBL, &alpha, &dsc, 1, proc, COMEX_GROUP_WORLD))) {
1269
comex_error("accumlate failed", rc);
1271
/* for(j=0; j<elems; j++)
1272
printf("%d %lf %lf\n",j, *(j+ (double*)b[proc]), *(j+ (double*)a));
1274
/* accumulate odd numbered elements */
1275
for (j = 0; j < elems / 2; j++) {
1276
psrc[j] = 2 * j + 1 + (double *)a;
1277
pdst[j] = 2 * j + 1 + (double *)b[proc];
1279
(void)comex_accv(COMEX_ACC_DBL, &alpha, &dsc, 1, proc, COMEX_GROUP_WORLD);
1281
/* for(j=0; j<elems; j++)
1282
printf("%d %lf %lf\n",j, *(j+ (double*)a), *(j+ (double*)b[proc]));
1286
comex_fence_all(COMEX_GROUP_WORLD);
1287
comex_barrier(COMEX_GROUP_WORLD);
1289
/* copy my patch into local array c */
1290
assert(!comex_get((double *)b[proc], c, bytes, proc, COMEX_GROUP_WORLD));
1292
/* scale = alpha*TIMES*nproc; */
1293
scale = alpha * TIMES * nproc * nproc;
1294
scale_patch(scale, dim, a, &one, &elems, &elems);
1296
compare_patches(.0001, dim, a, &one, &elems, &elems, c, &one, &elems, &elems);
1297
comex_barrier(COMEX_GROUP_WORLD);
1305
destroy_array((void **)b);
1311
void test_fetch_add()
1313
int rc, bytes, i, val, times = 0;
1315
int gop_val[MAXPROC];
1316
int gop_times[MAXPROC];
1318
/* shared variable is located on processor 0 */
1319
bytes = me == 0 ? sizeof(int) : 0;
1321
rc = comex_malloc((void **)arr, bytes, COMEX_GROUP_WORLD);
1323
comex_barrier(COMEX_GROUP_WORLD);
1326
*arr[0] = 0; /* initialization */
1329
comex_barrier(COMEX_GROUP_WORLD);
1331
rc = comex_rmw(COMEX_FETCH_AND_ADD, &val, arr[0], 1, 0, COMEX_GROUP_WORLD);
1334
/* show what everybody gets */
1335
(void)memset(gop_val, 0, MAXPROC*sizeof(int));
1337
all_sum_int(gop_val, nproc);
1339
for (i = 0; i < nproc; i++) {
1340
printf("process %d got value of %d\n", i, gop_val[i]);
1345
printf("\nIncrement the shared counter until reaches %d\n", LOOP);
1349
comex_barrier(COMEX_GROUP_WORLD);
1351
/* now increment the counter value until reaches LOOP */
1352
while (val < LOOP) {
1353
rc = comex_rmw(COMEX_FETCH_AND_ADD, &val, arr[0], 1, 0, COMEX_GROUP_WORLD);
1358
/* show what everybody gets */
1359
(void)memset(gop_val, 0, MAXPROC*sizeof(int));
1360
(void)memset(gop_times, 0, MAXPROC*sizeof(int));
1362
gop_times[me] = times;
1363
all_sum_int(gop_val, nproc);
1364
all_sum_int(gop_times, nproc);
1366
for (i = 0; i < nproc; i++) {
1367
printf("process %d incremented the counter %d times value=%d\n",
1368
i, gop_times[i], gop_val[i]);
1373
*arr[0] = 0; /* set it back to 0 */
1376
printf("\nNow everybody increments the counter %d times\n", LOOP);
1380
comex_fence_all(COMEX_GROUP_WORLD);
1381
comex_barrier(COMEX_GROUP_WORLD);
1383
for (i = 0; i < LOOP; i++) {
1384
rc = comex_rmw(COMEX_FETCH_AND_ADD, &val, arr[0], 1, 0, COMEX_GROUP_WORLD);
1388
comex_fence_all(COMEX_GROUP_WORLD);
1389
comex_barrier(COMEX_GROUP_WORLD);
1392
printf("The final value is %d, should be %d.\n\n", *arr[0], LOOP * nproc);
1394
if (*arr[0] != LOOP * nproc) {
1395
comex_error("failed ...", *arr[0]);
1399
comex_free(arr[me], COMEX_GROUP_WORLD);
1403
void test_fetch_add_long()
1405
long rc, bytes, i, val, times = 0;
1407
long gop_val[MAXPROC];
1408
long gop_times[MAXPROC];
1410
/* shared variable is located on processor 0 */
1411
bytes = me == 0 ? sizeof(long) : 0;
1413
rc = comex_malloc((void **)arr, bytes, COMEX_GROUP_WORLD);
1415
comex_barrier(COMEX_GROUP_WORLD);
1418
*arr[0] = 0; /* initialization */
1421
comex_barrier(COMEX_GROUP_WORLD);
1423
rc = comex_rmw(COMEX_FETCH_AND_ADD_LONG, &val, arr[0], 1, 0, COMEX_GROUP_WORLD);
1426
/* show what everybody gets */
1427
(void)memset(gop_val, 0, MAXPROC*sizeof(long));
1429
all_sum_long(gop_val, nproc);
1431
for (i = 0; i < nproc; i++) {
1432
printf("process %ld got value of %ld\n", i, gop_val[i]);
1437
printf("\nIncrement the shared counter until reaches %d\n", LOOP);
1441
comex_barrier(COMEX_GROUP_WORLD);
1443
/* now increment the counter value until reaches LOOP */
1444
while (val < LOOP) {
1445
rc = comex_rmw(COMEX_FETCH_AND_ADD_LONG, &val, arr[0], 1, 0, COMEX_GROUP_WORLD);
1450
/* show what everybody gets */
1451
(void)memset(gop_val, 0, MAXPROC*sizeof(long));
1452
(void)memset(gop_times, 0, MAXPROC*sizeof(long));
1454
gop_times[me] = times;
1455
all_sum_long(gop_val, nproc);
1456
all_sum_long(gop_times, nproc);
1458
for (i = 0; i < nproc; i++) {
1459
printf("process %ld incremented the counter %ld times value=%ld\n",
1460
i, gop_times[i], gop_val[i]);
1465
*arr[0] = 0; /* set it back to 0 */
1468
printf("\nNow everybody increments the counter %d times\n", LOOP);
1472
comex_fence_all(COMEX_GROUP_WORLD);
1473
comex_barrier(COMEX_GROUP_WORLD);
1475
for (i = 0; i < LOOP; i++) {
1476
rc = comex_rmw(COMEX_FETCH_AND_ADD_LONG, &val, arr[0], 1, 0, COMEX_GROUP_WORLD);
1480
comex_fence_all(COMEX_GROUP_WORLD);
1481
comex_barrier(COMEX_GROUP_WORLD);
1484
printf("The final value is %ld, should be %d.\n\n", *arr[0], LOOP * nproc);
1486
if (*arr[0] != LOOP * nproc) {
1487
comex_error("failed ...", *arr[0]);
1491
comex_free(arr[me], COMEX_GROUP_WORLD);
1498
int rc, bytes, i, val, whatever = -8999;
1501
/* shared variable is located on processor 0 */
1502
bytes = me == 0 ? sizeof(int) : 0;
1504
rc = comex_malloc((void **)arr, bytes, COMEX_GROUP_WORLD);
1506
comex_barrier(COMEX_GROUP_WORLD);
1509
*arr[0] = 0; /* initialization */
1512
comex_fence_all(COMEX_GROUP_WORLD);
1513
comex_barrier(COMEX_GROUP_WORLD);
1514
for (i = 0; i < LOOP; i++) {
1517
rc = comex_rmw(COMEX_SWAP, &val, arr[0], whatever, 0, COMEX_GROUP_WORLD);
1520
while (val == LOCKED);
1522
rc = comex_rmw(COMEX_SWAP, &val, arr[0], whatever, 0, COMEX_GROUP_WORLD);
1527
comex_fence_all(COMEX_GROUP_WORLD);
1528
comex_barrier(COMEX_GROUP_WORLD);
1531
printf("The final value is %d, should be %d.\n\n", *arr[0], LOOP * nproc);
1533
if (*arr[0] != LOOP * nproc) {
1534
comex_error("failed ...", *arr[0]);
1538
comex_free(arr[me], COMEX_GROUP_WORLD);
1543
void test_swap_long()
1545
long rc, bytes, i, val, whatever = -8999;
1548
/* shared variable is located on processor 0 */
1549
bytes = me == 0 ? sizeof(long) : 0;
1551
rc = comex_malloc((void **)arr, bytes, COMEX_GROUP_WORLD);
1553
comex_barrier(COMEX_GROUP_WORLD);
1556
*arr[0] = 0; /* initialization */
1559
comex_fence_all(COMEX_GROUP_WORLD);
1560
comex_barrier(COMEX_GROUP_WORLD);
1561
for (i = 0; i < LOOP; i++) {
1564
rc = comex_rmw(COMEX_SWAP_LONG, &val, arr[0], whatever, 0, COMEX_GROUP_WORLD);
1567
while (val == LOCKED);
1569
rc = comex_rmw(COMEX_SWAP_LONG, &val, arr[0], whatever, 0, COMEX_GROUP_WORLD);
1574
comex_fence_all(COMEX_GROUP_WORLD);
1575
comex_barrier(COMEX_GROUP_WORLD);
1578
printf("The final value is %ld, should be %d.\n\n", *arr[0], LOOP * nproc);
1580
if (*arr[0] != LOOP * nproc) {
1581
comex_error("failed ...", *arr[0]);
1585
comex_free(arr[me], COMEX_GROUP_WORLD);
1589
int main(int argc, char *argv[])
1593
comex_init_args(&argc, &argv);
1594
comex_group_rank(COMEX_GROUP_WORLD, &me);
1595
comex_group_size(COMEX_GROUP_WORLD, &nproc);
1597
/* printf("nproc = %d, me = %d\n", nproc, me);*/
1599
if (nproc > MAXPROC && me == 0) {
1600
comex_error("Test works for up to %d processors\n", MAXPROC);
1604
printf("COMEX test program (%d processes)\n", nproc);
1610
if(me==1)comex_die("process 1 committing suicide",1);
1613
printf("\nTesting strided gets and puts\n");
1614
printf("(Only std output for process 0 is printed)\n\n");
1618
for (ndim = 1; ndim <= MAXDIMS; ndim++) {
1621
comex_fence_all(COMEX_GROUP_WORLD);
1622
comex_barrier(COMEX_GROUP_WORLD);
1625
printf("\nTesting non-blocking gets and puts\n");
1629
double timer_test_nbdim = timer();
1631
comex_fence_all(COMEX_GROUP_WORLD);
1632
comex_barrier(COMEX_GROUP_WORLD);
1633
timer_test_nbdim = timer() - timer_test_nbdim;
1635
printf("timer_test_nbdim=%f\n", timer_test_nbdim);
1639
printf("\nTesting non-blocking vector gets and puts\n");
1643
double timer_test_vec_small = timer();
1645
comex_fence_all(COMEX_GROUP_WORLD);
1646
comex_barrier(COMEX_GROUP_WORLD);
1647
timer_test_vec_small = timer() - timer_test_vec_small;
1649
printf("timer_test_vec_small=%f\n", timer_test_vec_small);
1653
printf("\nTesting atomic accumulate\n");
1657
for (ndim = 1; ndim <= MAXDIMS; ndim++) {
1660
comex_fence_all(COMEX_GROUP_WORLD);
1661
comex_barrier(COMEX_GROUP_WORLD);
1664
printf("\nTesting Vector Interface using triangular patches of a 2-D array\n\n");
1670
comex_fence_all(COMEX_GROUP_WORLD);
1671
comex_barrier(COMEX_GROUP_WORLD);
1674
printf("\nTesting Accumulate with Vector Interface\n\n");
1678
double test_vector_acc_timer = timer();
1680
test_vector_acc_timer = timer() - test_vector_acc_timer;
1682
printf("test_vector_acc_timer=%f\n", test_vector_acc_timer);
1685
comex_fence_all(COMEX_GROUP_WORLD);
1686
comex_barrier(COMEX_GROUP_WORLD);
1689
printf("\nTesting atomic fetch&add\n");
1690
printf("(Std Output for all processes is printed)\n\n");
1694
comex_barrier(COMEX_GROUP_WORLD);
1699
printf("\nTesting atomic fetch&add long\n");
1700
printf("(Std Output for all processes is printed)\n\n");
1704
comex_barrier(COMEX_GROUP_WORLD);
1706
test_fetch_add_long();
1708
comex_fence_all(COMEX_GROUP_WORLD);
1709
comex_barrier(COMEX_GROUP_WORLD);
1712
printf("\nTesting atomic swap\n");
1716
comex_fence_all(COMEX_GROUP_WORLD);
1717
comex_barrier(COMEX_GROUP_WORLD);
1720
printf("\nTesting atomic swap long\n");
1724
comex_fence_all(COMEX_GROUP_WORLD);
1725
comex_barrier(COMEX_GROUP_WORLD);
1728
printf("\nTesting aggregate put/get requests\n");
1732
comex_barrier(COMEX_GROUP_WORLD);
1734
comex_barrier(COMEX_GROUP_WORLD);
1736
printf("All tests passed\n");
1741
comex_barrier(COMEX_GROUP_WORLD);