1
/* $Id: perf_aggr.c,v 1.7 2004-07-21 00:23:06 manoj Exp $ */
8
/* Provide sleep(x) in terms of Sleep(), scaling the argument by 1000
 * (presumably seconds -> milliseconds; the platform guard that selects this
 * definition is not visible here -- confirm). */
# define sleep(x) Sleep(1000*(x))
13
/* ARMCI is impartial to message-passing libs - we handle them with MP macros */
17
# define MPGROUP (char *)NULL
18
# define MP_INIT(arc,argv)
20
# define MPGROUP "mp_working_group"
21
# define MP_INIT(arc,argv) pvm_init(arc, argv)
23
# define MP_FINALIZE() pvm_exit()
24
# define MP_TIMER armci_timer
25
# define MP_BARRIER() pvm_barrier(MPGROUP,-1)
26
/* Store this task's instance number within MPGROUP into *pid.
 * The whole expansion is parenthesized so the assignment cannot be split by
 * an operator written next to the macro call (e.g. MP_MYID(&x) < 0). */
# define MP_MYID(pid) (*(pid) = pvm_getinst(MPGROUP,pvm_mytid()))
27
/* Store the current size of MPGROUP into *pproc.
 * The whole expansion is parenthesized so the assignment cannot be split by
 * an operator written next to the macro call. */
# define MP_PROCS(pproc) (*(pproc) = (int)pvm_gsize(MPGROUP))
28
void pvm_init(int argc, char *argv[]);
32
# define MP_BARRIER() SYNCH_(&tcg_tag)
33
/* Start the message-passing layer via PBEGIN_.
 * The first parameter is named argc so that the expansion uses the macro
 * argument instead of silently capturing a variable called argc from the
 * caller's scope. */
# define MP_INIT(argc,argv) PBEGIN_((argc),(argv))
34
# define MP_FINALIZE() PEND_()
35
/* Store the value of NODEID_() (converted to int) into *pid.
 * The whole expansion is parenthesized so the assignment cannot be split by
 * an operator written next to the macro call. */
# define MP_MYID(pid) (*(pid) = (int)NODEID_())
36
/* Store the value of NNODES_() (converted to int) into *pproc.
 * The whole expansion is parenthesized so the assignment cannot be split by
 * an operator written next to the macro call. */
# define MP_PROCS(pproc) (*(pproc) = (int)NNODES_())
37
# define MP_TIMER TCGTIME_
40
# define MP_BARRIER() MPI_Barrier(MPI_COMM_WORLD)
41
# define MP_FINALIZE() MPI_Finalize()
42
/* Start MPI. MPI_Init takes the addresses of argc/argv, so both arguments
 * must be lvalues. The first parameter is named argc so that the expansion
 * uses the macro argument instead of silently capturing a variable called
 * argc from the caller's scope. */
# define MP_INIT(argc,argv) MPI_Init(&(argc),&(argv))
43
# define MP_MYID(pid) MPI_Comm_rank(MPI_COMM_WORLD, (pid))
44
/* Store the size of MPI_COMM_WORLD into *pproc.
 * No trailing semicolon: the caller supplies it, as with the other MP_PROCS
 * variants, so the macro stays usable in if/else without braces. */
# define MP_PROCS(pproc) MPI_Comm_size(MPI_COMM_WORLD, (pproc))
45
# define MP_TIMER MPI_Wtime
53
/* Solaris has shared memory shortages in the default system configuration */
57
#elif defined(__alpha__)
71
/* EDIMn is DIMn enlarged by OFF, for n = 1..7 (DIMn and OFF are defined
 * outside this excerpt). */
#define EDIM1 (DIM1+OFF)
72
#define EDIM2 (DIM2+OFF)
73
#define EDIM3 (DIM3+OFF)
74
#define EDIM4 (DIM4+OFF)
75
#define EDIM5 (DIM5+OFF)
76
#define EDIM6 (DIM6+OFF)
77
#define EDIM7 (DIM7+OFF)
81
#define MAX_DIM_VAL 50
95
/***************************** macros ************************/
96
/* Copy `bytes` bytes from src to dst. Note the (src, dst) argument order is
 * the reverse of memcpy's (dst, src). */
#define COPY(src, dst, bytes) memcpy((dst),(src),(bytes))
97
/* MAX, MIN and ABS expand an argument more than once, so do not pass
 * expressions with side effects (e.g. i++). */
#define MAX(a,b) (((a) >= (b)) ? (a) : (b))
98
#define MIN(a,b) (((a) <= (b)) ? (a) : (b))
99
#define ABS(a) (((a) <0) ? -(a) : (a))
101
/***************************** global data *******************/
103
void* work[MAXPROC]; /* work array for propagating addresses */
108
/*
 * PVM start-up used by MP_INIT in the PVM configuration.
 *
 * argc/argv: the program's command line; argv[1], when present, is the
 * number of processes to run.
 *
 * Visible steps: validate the argument count, take np from argv[1] (np is 1
 * when no argument is given), reject np < 1 or np > MAXPROC via the usage
 * path, join MPGROUP, spawn np-1 more copies of argv[0], wait until the
 * group has np members, and synchronize on a group barrier.
 * NOTE(review): several lines of this function (braces, the `usage:` label
 * and whatever condition guards the spawn) are not visible in this excerpt.
 */
void pvm_init(int argc, char *argv[])
110
int mytid, mygid, ctid[MAXPROC];
114
/* only "prog" or "prog <nproc>" are accepted */
if((argc != 2) && (argc != 1)) goto usage;
115
if(argc == 1) np = 1;
117
if((np = atoi(argv[1])) < 1) goto usage;
118
if(np > MAXPROC) goto usage;
120
mygid = pvm_joingroup(MPGROUP);
124
/* ctid[] receives the task ids of the np-1 spawned copies */
i = pvm_spawn(argv[0], argv+1, 0, "", np-1, ctid);
126
/* poll once per second until every task has joined the group */
while(pvm_gsize(MPGROUP) < np) sleep(1);
129
pvm_barrier(MPGROUP, np);
131
printf("PVM initialization done!\n");
136
fprintf(stderr, "usage: %s <nproc>\n", argv[0]);
142
/*
 * Allocate an ndim-dimensional array through ARMCI_Malloc.
 *
 * a:         pointer table handed to ARMCI_Malloc (presumably filled with
 *            one base address per process -- confirm against the ARMCI docs)
 * elem_size: size of one element in bytes
 * ndim:      number of dimensions; must not exceed MAXDIMS
 * dims:      extent of each dimension
 *
 * The request size is elem_size * dims[0] * ... * dims[ndim-1].
 * NOTE(review): the product is accumulated in an int, so very large arrays
 * could overflow it.
 */
void create_array(void *a[], int elem_size, int ndim, int dims[])
144
int bytes=elem_size, i, rc;
146
assert(ndim<=MAXDIMS);
147
for(i=0;i<ndim;i++)bytes*=dims[i];
149
rc = ARMCI_Malloc(a, bytes);
156
void destroy_array(void *ptr[])
160
assert(!ARMCI_Free(ptr[me]));
163
#define MAXELEMS 1000
164
#define MAX_REQUESTS MAXELEMS
166
/*
 * Time several ways of moving MAXELEMS individual doubles between this
 * process and every process i in 1..nproc-1, then check the data.
 *
 * Put variants timed: value put, vector put, regular non-blocking put and
 * aggregate put. Get variants timed: vector get, regular non-blocking get
 * and aggregate get.
 *
 * dryrun: when non-zero, all printf output of this routine is suppressed
 *         (main uses this for an initial "cold start" pass).
 *
 * NOTE(review): a number of lines of this function (closing braces,
 * continuation lines of some calls, the guard behind "only proc 0 does the
 * work") are not visible in this excerpt.
 */
void test_aggregate(int dryrun) {
168
int i, j, rc, bytes, elems[2] = {MAXPROC, MAXELEMS};
169
double *ddst_put[MAXPROC];
170
double *ddst_get[MAXPROC];
171
double *dsrc[MAXPROC];
172
armci_hdl_t aggr_hdl_put[MAXPROC];
173
armci_hdl_t aggr_hdl_get[MAXPROC];
174
armci_hdl_t hdl_put[MAXELEMS];
175
armci_hdl_t hdl_get[MAXELEMS];
177
void *src_ptr[MAX_REQUESTS], *dst_ptr[MAX_REQUESTS];
178
int start = 0, end = 0;
181
/* destination arrays are MAXPROC x MAXELEMS doubles; the source array is
 * MAXELEMS doubles */
create_array((void**)ddst_put, sizeof(double),2, elems);
182
create_array((void**)ddst_get, sizeof(double),2, elems);
183
create_array((void**)dsrc, sizeof(double),1, &elems[1]);
185
/* source pattern: element i on process `me` is i*1.001*(me+1); the checks
 * at the end of this function rely on it */
for(i=0; i<elems[1]; i++) dsrc[me][i]=i*1.001*(me+1);
186
for(i=0; i<elems[0]*elems[1]; i++) {
193
/* only proc 0 does the work */
195
if(!dryrun)printf("Transferring %d doubles (Not an array of %d doubles)\n", MAXELEMS, MAXELEMS);
197
/* initializing non-blocking handles */
198
for(i=0; i<elems[1]; i++) ARMCI_INIT_HANDLE(&hdl_put[i]);
199
for(i=0; i<elems[1]; i++) ARMCI_INIT_HANDLE(&hdl_get[i]);
201
/* aggregate handles */
202
for(i=0; i<nproc; i++) ARMCI_INIT_HANDLE(&aggr_hdl_put[i]);
203
for(i=0; i<nproc; i++) ARMCI_INIT_HANDLE(&aggr_hdl_get[i]);
204
for(i=0; i<nproc; i++) ARMCI_SET_AGGREGATE_HANDLE(&aggr_hdl_put[i]);
205
for(i=0; i<nproc; i++) ARMCI_SET_AGGREGATE_HANDLE(&aggr_hdl_get[i]);
207
/* every individual request moves exactly one double */
bytes = sizeof(double);
209
/* **************** PUT **************** */
211
/* value put: one ARMCI_NbPutValueDouble call per element */
start_time=MP_TIMER();
212
start = 0; end = elems[1];
213
for(i=1; i<nproc; i++) {
214
for(j=start; j<end; j++) {
215
ARMCI_NbPutValueDouble(dsrc[me][j], &ddst_put[i][me*elems[1]+j], i,
218
for(j=start; j<end; j++) ARMCI_Wait(&hdl_put[j]);
220
if(!dryrun)printf("%d: Value Put time = %.2es\n", me, MP_TIMER()-start_time);
223
/* vector put: the per-element pointers are collected in src_ptr/dst_ptr and
 * sent with one ARMCI_NbPutV per destination, tracked by hdl_put[i] */
start_time=MP_TIMER();
224
for(i=1; i<nproc; i++) {
225
for(j=start; j<end; j++) {
226
src_ptr[j] = (void *)&dsrc[me][j];
227
dst_ptr[j] = (void *)&ddst_put[i][me*elems[1]+j];
229
darr.src_ptr_array = src_ptr;
230
darr.dst_ptr_array = dst_ptr;
231
darr.bytes = sizeof(double);
232
darr.ptr_array_len = elems[1];
233
if((rc=ARMCI_NbPutV(&darr, 1, i, &hdl_put[i])))
234
ARMCI_Error("armci_nbputv failed\n",rc);
236
for(i=1; i<nproc; i++) ARMCI_Wait(&hdl_put[i]);
237
if(!dryrun)printf("%d: Vector Put time = %.2es\n", me, MP_TIMER()-start_time);
240
/* regular put: one ARMCI_NbPut per element, waited on through hdl_put[j] */
start_time=MP_TIMER();
241
for(i=1; i<nproc; i++) {
242
for(j=start; j<end; j++) {
243
if((rc=ARMCI_NbPut(&dsrc[me][j], &ddst_put[i][me*elems[1]+j], bytes,
245
ARMCI_Error("armci_nbput failed\n",rc);
247
for(j=start; j<end; j++) ARMCI_Wait(&hdl_put[j]);
249
if(!dryrun)printf("%d: Regular Put time = %.2es\n", me, MP_TIMER()-start_time);
252
/* aggregate put: every request for destination i is issued on the same
 * aggregate handle aggr_hdl_put[i]; a single wait per destination follows */
start_time=MP_TIMER();
253
for(i=1; i<nproc; i++) {
254
for(j=start; j<end; j++) {
255
if((rc=ARMCI_NbPut(&dsrc[me][j], &ddst_put[i][me*elems[1]+j], bytes,
256
i, &aggr_hdl_put[i])))
257
ARMCI_Error("armci_nbput failed\n",rc);
260
for(i=1; i<nproc; i++) ARMCI_Wait(&aggr_hdl_put[i]);
261
if(!dryrun)printf("%d: Aggregate Put time = %.2es\n\n", me, MP_TIMER()-start_time);
264
/* **************** GET **************** */
267
/* vector get: one ARMCI_NbGetV per source process, waited on immediately */
start_time=MP_TIMER();
268
for(i=1; i<nproc; i++) {
269
for(j=start; j<end; j++) {
270
src_ptr[j] = (void *)&dsrc[i][j];
271
dst_ptr[j] = (void *)&ddst_get[me][i*elems[1]+j];
273
darr.src_ptr_array = src_ptr;
274
darr.dst_ptr_array = dst_ptr;
275
darr.bytes = sizeof(double);
276
darr.ptr_array_len = elems[1];
277
if((rc=ARMCI_NbGetV(&darr, 1, i, &hdl_get[i])))
278
ARMCI_Error("armci_nbgetv failed\n",rc);
279
ARMCI_Wait(&hdl_get[i]);
281
if(!dryrun)printf("%d: Vector Get time = %.2es\n", me, MP_TIMER()-start_time);
284
/* regular get: one ARMCI_NbGet per element, waited on through hdl_get[j] */
start_time=MP_TIMER();
285
for(i=1; i<nproc; i++) {
286
for(j=start; j<end; j++) {
287
if((rc=ARMCI_NbGet(&dsrc[i][j], &ddst_get[me][i*elems[1]+j], bytes,
289
ARMCI_Error("armci_nbget failed\n",rc);
291
for(j=start; j<end; j++) ARMCI_Wait(&hdl_get[j]);
293
if(!dryrun)printf("%d: Regular Get time = %.2es\n", me, MP_TIMER()-start_time);
296
/* aggregate get: all requests for source i share aggr_hdl_get[i].
 * NOTE(review): unlike the other variants, the return code of ARMCI_NbGet
 * is not checked here. */
start_time=MP_TIMER();
297
for(i=1; i<nproc; i++) {
298
for(j=start; j<end; j++) {
299
ARMCI_NbGet(&dsrc[i][j], &ddst_get[me][i*elems[1]+j], bytes,
300
i, &aggr_hdl_get[i]);
303
for(i=1; i<nproc; i++) ARMCI_Wait(&aggr_hdl_get[i]);
304
if(!dryrun)printf("%d: Aggregate Get time = %.2es\n", me, MP_TIMER()-start_time);
313
/* verify the put data: the first elems[1] entries of ddst_put[me] must match
 * j*1.001 within 0.1 */
for(j=0; j<elems[1]; j++) {
314
if( ABS(ddst_put[me][j]-j*1.001) > 0.1) {
315
ARMCI_Error("aggregate put failed...1", 0);
319
/* note: only the printf is conditional; fflush(stdout) always runs */
if(!dryrun)if(me==0) printf("\n aggregate put ..O.K.\n"); fflush(stdout);
322
/* verify the get data: the block fetched from process i must match that
 * process's source pattern j*1.001*(i+1) within 0.1 */
for(i=1; i<nproc; i++) {
323
for(j=0; j<elems[1]; j++) {
324
if( ABS(ddst_get[me][i*elems[1]+j]-j*1.001*(i+1)) > 0.1) {
325
ARMCI_Error("aggregate get failed...1", 0);
331
/* note: only the printf is conditional; fflush(stdout) always runs */
if(!dryrun)if(me==0) printf(" aggregate get ..O.K.\n"); fflush(stdout);
337
if(!dryrun)if(me==0){printf("O.K.\n"); fflush(stdout);}
338
destroy_array((void **)ddst_put);
339
destroy_array((void **)ddst_get);
340
destroy_array((void **)dsrc);
344
/* we need to rename main if linking with frt compiler */
349
/*
 * Test driver: reports the process count, runs test_aggregate twice (first
 * with dryrun=1 so nothing is printed, then with dryrun=0) and prints a
 * success message on process 0.
 * NOTE(review): the start-up and shut-down calls of this function are not
 * visible in this excerpt.
 */
int main(int argc, char* argv[])
356
/* printf("nproc = %d, me = %d\n", nproc, me);*/
358
/* NOTE(review): the message below contains a %d conversion while MAXPROC is
 * passed as ARMCI_Error's integer argument -- confirm that ARMCI_Error
 * formats it; otherwise "%d" is printed literally. */
if(nproc>MAXPROC && me==0)
359
ARMCI_Error("Test works for up to %d processors\n",MAXPROC);
362
printf("ARMCI test program (%d processes)\n",nproc);
370
printf("\nAggregate put/get requests\n\n");
373
test_aggregate(1); /* cold start */
374
test_aggregate(0); /* warm start */
378
if(me==0){printf("\nSuccess!!\n"); fflush(stdout);}