8
# define sleep(x) Sleep(1000*(x))
16
# define MPGROUP (char *)NULL
17
# define MP_INIT(arc,argv)
19
# define MPGROUP "mp_working_group"
20
# define MP_INIT(arc,argv) pvm_init(arc, argv)
22
# define MP_FINALIZE() pvm_exit()
23
# define MP_TIMER armci_timer
24
# define MP_BARRIER() pvm_barrier(MPGROUP,-1)
25
# define MP_MYID(pid) *(pid) = pvm_getinst(MPGROUP,pvm_mytid())
26
# define MP_PROCS(pproc) *(pproc) = (int)pvm_gsize(MPGROUP)
27
void pvm_init(int argc, char *argv[]);
31
# define MP_BARRIER() SYNCH_(&tcg_tag)
32
# define MP_INIT(arc,argv) PBEGIN_((argc),(argv))
33
# define MP_FINALIZE() PEND_()
34
# define MP_MYID(pid) *(pid) = (int)NODEID_()
35
# define MP_PROCS(pproc) *(pproc) = (int)NNODES_()
36
# define MP_TIMER TCGTIME_
39
# define MP_BARRIER() MPI_Barrier(MPI_COMM_WORLD)
40
# define MP_FINALIZE() MPI_Finalize()
41
# define MP_INIT(arc,argv) MPI_Init(&(argc),&(argv))
42
# define MP_MYID(pid) MPI_Comm_rank(MPI_COMM_WORLD, (pid))
43
# define MP_PROCS(pproc) MPI_Comm_size(MPI_COMM_WORLD, (pproc));
44
# define MP_TIMER MPI_Wtime
53
/* Solaris has shared memory shortages in the default system configuration */
57
#elif defined(__alpha__)
71
#define EDIM1 (DIM1+OFF)
72
#define EDIM2 (DIM2+OFF)
73
#define EDIM3 (DIM3+OFF)
74
#define EDIM4 (DIM4+OFF)
75
#define EDIM5 (DIM5+OFF)
76
#define EDIM6 (DIM6+OFF)
77
#define EDIM7 (DIM7+OFF)
81
#define MAX_DIM_VAL 50
94
/***************************** macros ************************/
95
#define COPY(src, dst, bytes) memcpy((dst),(src),(bytes))
96
#define MAX(a,b) (((a) >= (b)) ? (a) : (b))
97
#define MIN(a,b) (((a) <= (b)) ? (a) : (b))
98
#define ABS(a) (((a) <0) ? -(a) : (a))
101
#define COL ROW /* square matrices only for the time being */
103
/***************************** global data *******************/
105
short int fortran_indexing=0;
106
static int proc_row_list[MAXPROC];/*no of rows owned by each process - accumulated*/
107
static int proc_nz_list[MAXPROC]; /*no of non-zeros owned by each process */
110
void pvm_init(int argc, char *argv[])
112
int mytid, mygid, ctid[MAXPROC];
116
if((argc != 2) && (argc != 1)) goto usage;
117
if(argc == 1) np = 1;
119
if((np = atoi(argv[1])) < 1) goto usage;
120
if(np > MAXPROC) goto usage;
122
mygid = pvm_joingroup(MPGROUP);
126
i = pvm_spawn(argv[0], argv+1, 0, "", np-1, ctid);
128
while(pvm_gsize(MPGROUP) < np) sleep(1);
131
pvm_barrier(MPGROUP, np);
133
printf("PVM initialization done!\n");
138
fprintf(stderr, "usage: %s <nproc>\n", argv[0]);
144
void create_array(void *a[], int elem_size, int ndim, int dims[])
146
int bytes=elem_size, i, rc;
148
assert(ndim<=MAXDIMS);
149
for(i=0;i<ndim;i++)bytes*=dims[i];
151
rc = ARMCI_Malloc(a, bytes);
158
void destroy_array(void *ptr[])
162
assert(!ARMCI_Free(ptr[me]));
165
static void verify_list(int *proc_row_list) {
167
printf("\nVERIFY: %d: No of rows = %d\n\n", 0, proc_row_list[0]);
168
for(i=1; i<nproc; i++)
169
printf("\nVERIFY: %d: No of rows = %d\n\n", i, proc_row_list[i]-proc_row_list[i-1]);
173
static void load_balance(int n, int non_zero, int *row_ind_tmp) {
175
int proc_id, i, local_nz, local_nz_acc, A, B;
177
local_nz = local_nz_acc = non_zero/nproc;
179
/* number of rows owned by each process is stored in proc_row_list. This
180
is supposed to be well load balanced, so that each process has almost
181
same number of non-zero elements */
183
if(me==0) printf("local_nz = %d\n", local_nz);
184
for(i=0; i<n; i++) { /* as # of entries in row_ind_tmp = n+1 */
185
if(row_ind_tmp[i] < local_nz_acc && row_ind_tmp[i+1] >= local_nz_acc) {
186
proc_row_list[proc_id++] = i+1;
187
local_nz_acc = local_nz*(proc_id+1);
188
if(proc_id == nproc-1) local_nz_acc = non_zero;
189
if(me==0 && proc_id<nproc) printf("local_nz = %d\n", local_nz_acc);
193
proc_row_list[nproc-1] = n;
195
for(i=0; i<nproc; i++) {
196
A = (i==0) ? 0: proc_row_list[i-1];/* # of entries in row_ind_tmp is n+1*/
197
B = proc_row_list[i];
198
proc_nz_list[i] = row_ind_tmp[B]-row_ind_tmp[A];
202
ARMCI_Error("Error while preparing Process Row list", proc_id-1);
205
if(me==0) verify_list(proc_row_list);
210
static int sparse_initialize(int *n, int *non_zero, int **row_ind,
211
int **col_ind, double **values, double **vec,
214
int i, j, rc, max, *row_ind_tmp, *tmp_indices;
219
/* Broadcast order of matrix */
221
if((fp=fopen("Sparse-MPI/av41092.rua.data", "r")) == NULL)
222
ARMCI_Error("Error: Input file not found", me);
223
fortran_indexing = 1; /* This is 1 for Harwell-Boeing format matrices */
226
ARMCI_Error("# of rows is not divisible by # of processors", nproc);
228
ARMCI_Error("order is greater than defined variable ROW", ROW);
231
armci_msg_brdcst(n, len, 0);
233
/* Broad cast number of non_zeros */
234
if(me==0) fscanf(fp, "%d", non_zero);
235
armci_msg_brdcst(non_zero, len, 0);
237
/* Broadcast row indices */
238
len = (*n+1)*sizeof(int);
239
row_ind_tmp = (int *)malloc(len);
240
if(me==0)for(i=0; i<*n+1; i++) {
241
fscanf(fp, "%d", &row_ind_tmp[i]);
242
if(fortran_indexing) --row_ind_tmp[i];
244
armci_msg_brdcst(row_ind_tmp, len, 0);
246
load_balance(*n, *non_zero, row_ind_tmp);
248
/* find how much temporary storage is needed at the maximum */
250
for(max=-1,j=0;j<nproc;j++) if(max<proc_nz_list[j]) max=proc_nz_list[j];
251
if(max<0) ARMCI_Error(" max cannot be negative", max);
254
/* Broadcast the maximum number of elements */
256
armci_msg_brdcst(&max, len, 0);
258
/* create the Sparse MAtrix Array */
259
if(me==0) printf(" Creating ValueArray (CompressedSparseMatrix) ...\n\n");
260
create_array((void**)col_ind, sizeof(int), 1, &max);
262
/* create the column subscript array */
263
if(me==0) printf(" Creating Column Subscript Array ... \n\n");
264
create_array((void**)values, sizeof(double), 1, &max);
266
/* create the x-vector and the solution vector */
267
if(me==0) printf(" Creating Vectors ... \n\n");
268
create_array((void**)vec, sizeof(double),1, &max);
269
create_array((void**)svec, sizeof(double),1, &max);
273
/* Process 0 distributes the column indices and non_zero values to
274
respective processors*/
276
tmp_indices = (int *)malloc(max*sizeof(int));
277
tmp_values = (double *)malloc(max*sizeof(double));
279
for(j=0; j<nproc; j++) {
280
for(i=0; i<proc_nz_list[j]; i++) {
281
fscanf(fp, "%d", &tmp_indices[i]);
282
if(fortran_indexing) --tmp_indices[i];
284
/* rc = fread(tmp_indices, sizeof(int), proc_nz_list[j], fp); */
285
if((rc=ARMCI_Put(tmp_indices, col_ind[j], proc_nz_list[j]*sizeof(int), j)))
286
ARMCI_Error("armci_nbput failed\n",rc);
288
for(j=0; j<nproc; j++) {
289
for(i=0; i<proc_nz_list[j]; i++) fscanf(fp, "%lf", &tmp_values[i]);
290
if((rc=ARMCI_Put(tmp_values, values[j], proc_nz_list[j]*sizeof(double), j)))
291
ARMCI_Error("armci_nbput failed\n",rc);
294
ARMCI_AllFence(); MP_BARRIER();ARMCI_AllFence();
296
/* initializing x-vector */
297
if(me==0) for(i=0;i<proc_nz_list[me]; i++) vec[me][i] = (i+1);
298
else for(i=0;i<proc_nz_list[me];i++) vec[me][i]=me*proc_nz_list[me-1]+(i+1);
302
printf("max = %d\n", max);
303
for(i=0; i<max; i++) printf("%.1lf ", values[me][i]);
308
*row_ind = row_ind_tmp;
317
static int compare(const void *p1, const void *p2) {
318
int i = *((int *)p1);
319
int j = *((int *)p2);
321
if (i > j) return (1);
322
if (i < j) return (-1);
326
static int count = -1;
327
static armci_hdl_t gHandle[MAXPROC];
328
static int prev_proc = -1;
330
static void get_data(int n, int start, int end, double *vec_local,
332
int i, j, rc, bytes, offset;
333
int proc_start, proc_end, idx_start, idx_end;
335
proc_start = proc_end = -1;
336
for(i=0; i<nproc; i++) {
337
if(proc_start<0 && proc_row_list[i]>start) proc_start = i;
338
if(proc_end<0 && proc_row_list[i]>end) proc_end = i;
340
if(proc_start<0 || proc_end<0) ARMCI_Error("Invalid Process Ids", -1);
342
for(i=proc_start; i<=proc_end; i++) {
343
if(i==proc_start) idx_start = start;
344
else { if(i==0) idx_start=0; else idx_start = proc_row_list[i-1];}
345
if(i==proc_end) idx_end = end;
346
else idx_end = proc_row_list[i]-1;
349
++count; prev_proc = i;
350
ARMCI_INIT_HANDLE(&gHandle[count]);
351
ARMCI_SET_AGGREGATE_HANDLE(&gHandle[count]);
354
if(i==0) offset=0; else offset = proc_row_list[i-1];
355
if(i==me) { /* local */
356
for(j=idx_start; j<=idx_end; j++) vec_local[j] = vec[me][j-offset];
359
bytes = (idx_end-idx_start+1)*sizeof(double);
360
vec_local[idx_start] = -1;
362
if((rc=ARMCI_Get(&vec[i][idx_start-offset], &vec_local[idx_start],
365
if((rc=ARMCI_NbGet(&vec[i][idx_start-offset], &vec_local[idx_start],
366
bytes, i, &gHandle[count])))
368
ARMCI_Error("armci_nbget failed\n",rc);
373
static void sparse_multiply(int n, int non_zero, int *row_ind, int **col_ind,
374
double **values, double **vec, double **svec) {
376
int i, j, k, num_elements, offset, *tmp_indices;
377
double start_time, comm_time, comp_time, v, vec_local[COL];
378
int start, end, prev, nrows, idx;
381
/* ---- Sequential Case ---- */
384
for(k=row_ind[i]; k<row_ind[i+1]; k++) {
387
svec[me][i] += v*vec[me][j];
388
printf("%.1lf %.1lf\n", v, vec[me][j]);
391
for(i=0; i<n; i++) printf("%.1lf ", svec[me][i]);
395
num_elements = proc_nz_list[me];
396
printf("num_elements = %d\n", num_elements);
397
tmp_indices = (int *)malloc(num_elements*sizeof(int));
398
for(i=0; i<num_elements; i++) tmp_indices[i] = col_ind[me][i];
399
qsort(tmp_indices, num_elements, sizeof(int), compare);
401
start_time = MP_TIMER();
403
/* get the required portion of vector you need to local array */
404
start = prev = tmp_indices[0];
405
for(i=1; i<num_elements; i++) {
406
if(tmp_indices[i]>prev+1) {
408
get_data(n, start, end, vec_local, vec);
409
start = prev = tmp_indices[i];
411
else prev = tmp_indices[i];
413
get_data(n, start, prev, vec_local, vec);
416
if(count>=0) for(i=0; i<=count; i++) ARMCI_Wait(&gHandle[i]);
419
comm_time = MP_TIMER() - start_time;
420
start_time = MP_TIMER();
422
/* Perform Matrix-Vector multiply and store the result in
423
solution vector - "svec[]" */
425
if(me==0) { nrows = proc_row_list[me]; offset = row_ind[0]; }
427
nrows = proc_row_list[me]-proc_row_list[me-1];
428
offset = row_ind[proc_row_list[me-1]];
430
/* printf("%d: My total Work = %d\n", me, nrows); */
432
for(i=0; i<nrows; i++) { /* loop over rows owned by me */
434
if(me==0) idx = i; else idx = proc_row_list[me-1] + i;
435
for(k=row_ind[idx]; k<row_ind[idx+1]; k++) {
436
j = col_ind[me][k-offset];
437
v = values[me][k-offset];
438
svec[me][i] += v*vec_local[j];
441
comp_time = MP_TIMER()-start_time;
442
printf("%d: %lf + %lf = %lf (count = %d)\n", me, comm_time, comp_time,
443
comm_time+comp_time, count+1);
447
static void gather_solution_vector(double **svec) {
450
if((rc=ARMCI_Get(&vec[i][idx_start-offset], &vec_local[idx_start],
452
ARMCI_Error("armci_nbget failed\n",rc);
456
static void test_sparse() {
458
int *col_ind[MAXPROC];
459
double *values[MAXPROC], *vec[MAXPROC], *svec[MAXPROC], start_time;
460
int n, non_zero, *row_ind;
462
sparse_initialize(&n, &non_zero, &row_ind, col_ind, values, vec, svec);
465
start_time = MP_TIMER();
466
sparse_multiply(n, non_zero, row_ind, col_ind, values, vec, svec);
467
/* printf("%d: Timetaken = %lf\n", me, MP_TIMER()-start_time); */
470
if(me==0) gather_solution_vector(svec);
472
if(me==0){printf("O.K.\n"); fflush(stdout);}
473
destroy_array((void **)vec);
477
/* we need to rename main if linking with frt compiler */
482
int main(int argc, char* argv[])
489
/* printf("nproc = %d, me = %d\n", nproc, me);*/
491
if(nproc>MAXPROC && me==0)
492
ARMCI_Error("Test works for up to %d processors\n",MAXPROC);
495
printf("ARMCI test program (%d processes)\n",nproc);
503
printf("\n Performing Sparse Matrix-Vector Multiplication ...\n\n");
510
if(me==0){printf("\nSuccess!!\n"); fflush(stdout);}