5
/* $Id: vector.c,v 1.32.6.4 2007-08-29 17:32:32 manoj Exp $ */
14
/* Protocol codes for the non-blocking entry points.  SERVER_NBPUT and
 * SERVER_NBGET are passed to DO_FENCE by the PARMCI_Nb* functions in this
 * file; DIRECT_NBGET and DIRECT_NBPUT are tested inside DO_FENCE. */
#define SERVER_NBGET 2
16
#define DIRECT_NBGET 4
18
#define SERVER_NBPUT 6
20
#define DIRECT_NBPUT 8
23
/*
 * DO_FENCE(__proc, __prot): fence bookkeeping done before an operation that
 * uses protocol __prot towards process __proc.
 *  - SERVER_GET and SERVER_PUT: the visible branches are empty statements.
 *  - DIRECT_GET/DIRECT_NBGET and DIRECT_PUT/DIRECT_NBPUT: ARMCI_DoFence(__proc)
 *    is called when armci_prot_switch_fence[__proc] holds SERVER_PUT.
 * The last line stores __prot into armci_prot_switch_fence[__proc].
 * NOTE(review): the expansion is a bare if/else chain followed by an
 * assignment, not a do { } while(0) block, and callers in this file invoke
 * it under an unbraced `if(!SAMECLUSNODE(proc))` -- confirm how the trailing
 * assignment is meant to bind in that context.
 */
# define DO_FENCE(__proc,__prot) if(__prot==SERVER_GET);\
24
else if(__prot==SERVER_PUT);\
25
else if(__prot==DIRECT_GET || __prot==DIRECT_NBGET){\
26
if(armci_prot_switch_fence[__proc]==SERVER_PUT)\
27
ARMCI_DoFence(__proc);\
29
else if(__prot==DIRECT_PUT || __prot==DIRECT_NBPUT){\
30
if(armci_prot_switch_fence[__proc]==SERVER_PUT)\
31
ARMCI_DoFence(__proc);\
34
armci_prot_switch_fence[__proc]=__prot
49
/*
 * Integer scaled accumulate: for j in [0, elems) performs
 * dst[j] += (*scale) * src[j], with scale, src and dst all read as int.
 *   scale - pointer to the int multiplier
 *   elems - number of int elements to process
 *   src   - input vector (read only by this routine)
 *   dst   - vector updated in place
 * NOTE(review): the declaration of j is not among the lines shown here.
 */
void I_ACCUMULATE(void* scale, int elems, void*src, void* dst)
52
int *a=(int*)dst, *b=(int*)src;
53
int alpha = *(int*)scale;
55
for(j=0;j<elems;j++) a[j] += alpha*b[j];
60
/*
 * ACCUMULATE(DTYPE, scale, elems, src, dst): scaled accumulate on elements
 * of type DTYPE: dst[j] += (*scale) * src[j] for j in [0, elems).
 * The macro opens its own brace block for a, b and alpha; the loop index j
 * is not declared in the visible lines (presumably supplied by the caller's
 * scope -- confirm).
 */
#define ACCUMULATE( DTYPE, scale, elems, src, dst) {\
62
DTYPE *a =(DTYPE *)(dst);\
63
DTYPE *b =(DTYPE *)(src);\
64
DTYPE alpha = *(DTYPE *)(scale);\
65
for(j=0;j<(elems);j++)a[j] += alpha*b[j];\
68
/*
 * ACCUMULATE_RA(DTYPE, elems, src, dst): unscaled element-wise combine that
 * XORs each source element into the destination: dst[j] ^= src[j] for j in
 * [0, elems).  DTYPE must therefore be an integer type.
 */
#define ACCUMULATE_RA( DTYPE, elems, src, dst) {\
70
DTYPE *a =(DTYPE *)(dst);\
71
DTYPE *b =(DTYPE *)(src);\
72
for(j=0;j<(elems);j++)a[j] ^= b[j];\
75
/*
 * CPL_ACCUMULATE(DTYPE, scale, elems, src, dst): complex scaled accumulate.
 * DTYPE must expose .real and .imag members.  For each j in [0, elems) the
 * complex product alpha*src[j] is added to dst[j]:
 *   real += alpha.real*src.real - alpha.imag*src.imag
 *   imag += alpha.imag*src.real + alpha.real*src.imag
 * where alpha = *scale.
 */
#define CPL_ACCUMULATE( DTYPE, scale, elems, src, dst) {\
77
DTYPE *a =(DTYPE *)(dst);\
78
DTYPE *b =(DTYPE *)(src);\
79
DTYPE alpha = *(DTYPE *)(scale);\
80
for(j=0;j<(elems);j++){\
81
a[j].real += alpha.real*b[j].real - alpha.imag*b[j].imag;\
82
a[j].imag += alpha.imag*b[j].real + alpha.real*b[j].imag;\
86
/* Protocol-switch state defined in another translation unit.
 * armci_prot_switch_fence is indexed by process id inside DO_FENCE; the
 * other two variables are not referenced in the visible part of this file. */
extern int* armci_prot_switch_fence;
87
extern int armci_prot_switch_preproc;
88
extern int armci_prot_switch_preop;
91
/*\ compute address range for memory to lock \*/
93
/*
 * Lock the address range that covers every block named in ptr_array.
 *   ptr_array - start addresses of the blocks
 *   len       - number of entries in ptr_array
 *   bytes     - size of each block
 *   proc      - process passed through to ARMCI_LOCKMEM
 * The loop tracks the lowest and highest start address; the upper bound is
 * then advanced by bytes-1 so it points at the last byte of the highest
 * block before ARMCI_LOCKMEM(pmin, pmax, proc) is issued.
 * NOTE(review): the declarations and initial values of i, pmin and pmax are
 * not among the lines shown here.
 */
void armci_lockmem_scatter(void *ptr_array[], int len, int bytes, int proc)
101
for(i = 0; i< len; i++){
102
pmin = ARMCI_MIN(ptr_array[i],pmin);
103
pmax = ARMCI_MAX(ptr_array[i],pmax);
105
pmax = bytes-1 + (char*)pmax;
106
ARMCI_LOCKMEM(pmin, pmax, proc);
107
/* printf("%d: locked %ld-%ld bytes=%d\n",armci_me,pmin,pmax,
108
1+(char*)pmax -(char*)pmin);fflush(stdout); */
113
/*
 * Accumulate every (src, dst) block pair described by dsc.
 *   op     - accumulate operation code; an unsupported value aborts through
 *            armci_die("... operation not supported")
 *   scale  - pointer to the scale factor handed to the ACCUMULATE macros
 *   dsc    - descriptor: src_ptr_array / dst_ptr_array of ptr_array_len
 *            entries, each block dsc.bytes long
 *   proc   - process passed to ARMCI_UNLOCKMEM
 *   lockit - when nonzero, ARMCI_UNLOCKMEM(proc) is called at the end
 * Each visible branch computes elems = dsc.bytes/size, aborts via armci_die
 * when dsc.bytes is not a multiple of the element size, and then applies an
 * ACCUMULATE-family macro to dsc.src_ptr_array[i] / dsc.dst_ptr_array[i].
 * NOTE(review): the case labels, the condition guarding the
 * armci_lockmem_scatter call and the uses of ITERATOR are not among the
 * lines shown here.
 */
void armci_scatter_acc(int op, void *scale, armci_giov_t dsc,
114
int proc, int lockit)
116
# define ITERATOR for(i = 0; i< dsc.ptr_array_len; i++)
119
armci_lockmem_scatter(dsc.dst_ptr_array, dsc.ptr_array_len,
124
/* int elements */
elems = dsc.bytes/size;
125
if(dsc.bytes%size) armci_die("ARMCI vector accumulate: bytes not consistent with datatype",dsc.bytes);
127
ACCUMULATE(int, scale, elems, dsc.src_ptr_array[i], dsc.dst_ptr_array[i])
133
/* long elements */
elems = dsc.bytes/size;
134
if(dsc.bytes%size) armci_die("ARMCI vector accumulate: bytes not consistent with datatype",dsc.bytes);
136
ACCUMULATE(long, scale, elems, dsc.src_ptr_array[i], dsc.dst_ptr_array[i])
141
/* double elements */
size = sizeof(double);
142
elems = dsc.bytes/size;
143
if(dsc.bytes%size) armci_die("ARMCI vector accumulate: bytes not consistent with datatype",dsc.bytes);
145
ACCUMULATE(double, scale, elems, dsc.src_ptr_array[i], dsc.dst_ptr_array[i])
150
/* double-precision complex elements (two doubles each) */
size = 2*sizeof(double);
151
elems = dsc.bytes/size;
152
if(dsc.bytes%size) armci_die("ARMCI vector accumulate: bytes not consistent with datatype",dsc.bytes);
154
CPL_ACCUMULATE(dcomplex_t, scale, elems, dsc.src_ptr_array[i], dsc.dst_ptr_array[i])
159
/* single-precision complex elements (two floats each) */
size = 2*sizeof(float);
160
elems = dsc.bytes/size;
161
if(dsc.bytes %size) armci_die("ARMCI vector accumulate: bytes not consistent with datatype",dsc.bytes);
163
CPL_ACCUMULATE(complex_t, scale, elems, dsc.src_ptr_array[i], dsc.dst_ptr_array[i])
168
/* float elements */
size = sizeof(float);
169
elems = dsc.bytes/size;
170
if(dsc.bytes%size) armci_die("ARMCI vector accumulate: bytes not consistent with datatype",dsc.bytes);
172
ACCUMULATE(float, scale, elems, dsc.src_ptr_array[i], dsc.dst_ptr_array[i])
177
/* long elements combined with ACCUMULATE_RA (XOR, scale is not used) */
elems = dsc.bytes/size;
178
if(dsc.bytes%size) armci_die("ARMCI vector accumulate: bytes not consistent with datatype",dsc.bytes);
180
ACCUMULATE_RA(long,elems, dsc.src_ptr_array[i], dsc.dst_ptr_array[i])
183
default: armci_die("ARMCI vector accumulate: operation not supported",op);
186
if(lockit) ARMCI_UNLOCKMEM(proc);
191
/* PWORKLEN caps the number of blocks handled per chunk in armci_acc_vector
 * (nb = ARMCI_MIN(PWORKLEN, ...)).  pwork holds, for each block of the
 * current chunk, a pointer into armci_internal_buffer.  It is a single
 * file-scope array shared by every call. */
# define PWORKLEN 2048
192
static void *pwork[PWORKLEN]; /* work array of pointers */
195
/*
 * Vector accumulate of the descriptors darr[0..len-1] into process proc.
 * Visible paths:
 *  - a loop that calls armci_scatter_acc(..., lockit=1) on each descriptor;
 *  - under ACC_COPY, a copy-based path per descriptor:
 *      * blocks larger than BUFSIZE/2 go through armci_acc_copy_strided,
 *        one call per (src, dst) pair;
 *      * otherwise the destination range is locked, then in chunks of at
 *        most nb = ARMCI_MIN(PWORKLEN, BUFSIZE/dr.bytes) blocks the data is
 *        fetched into armci_internal_buffer (GET), accumulated locally with
 *        armci_scatter_acc (no lock), and written back (PUT); the lock is
 *        released at the end or as soon as a copy returns nonzero.
 * Returns the nonzero rc of a failing armci_copy_vector call on those paths.
 * NOTE(review): the #else/#endif structure, the declarations of i, j, k, nb,
 * rc and dl, the assignment of dl.bytes and the final return are not among
 * the lines shown here.
 */
int armci_acc_vector(int op, /* operation code */
196
void *scale, /* pointer to scale factor in accumulate */
197
armci_giov_t darr[], /* descriptor array */
198
int len, /* length of descriptor array */
199
int proc /* remote process(or) ID */
204
#if defined(ACC_COPY)
205
if(proc == armci_me ){
207
for(i = 0; i< len; i++) armci_scatter_acc(op, scale, darr[i], proc, 1);
208
#if defined(ACC_COPY)
210
for(i = 0; i< len; i++){
211
armci_giov_t dr = darr[i];
213
if(dr.bytes > BUFSIZE/2){
214
/* for large segments use strided implementation */
215
for(j=0; j< dr.ptr_array_len; j++){
216
rc = armci_acc_copy_strided(op, scale,proc,
217
dr.src_ptr_array[j], NULL, dr.dst_ptr_array[j],NULL,
223
/*lock memory:should optimize it to lock only a chunk at a time*/
224
armci_lockmem_scatter(dr.dst_ptr_array, dr.ptr_array_len, dr.bytes, proc);
225
/* copy as many blocks as possible into the local buffer */
227
nb = ARMCI_MIN(PWORKLEN,BUFSIZE/dr.bytes);
228
for(j=0; j< dr.ptr_array_len; j+= nb){
229
/* the last chunk may hold fewer than nb blocks */
int nblocks = ARMCI_MIN(nb, dr.ptr_array_len -j);
231
/* setup vector descriptor for remote memory copy
232
to bring data into buffer*/
233
dl.ptr_array_len = nblocks;
234
dl.src_ptr_array = dr.dst_ptr_array + j; /* GET destination becomes source for copy */
235
/* block k of the chunk lands at offset k*dl.bytes in the internal buffer */
for(k=0; k< nblocks; k++) pwork[k] = k*dl.bytes + (char*)armci_internal_buffer;
236
dl.dst_ptr_array = pwork;
237
/* get data to the local buffer */
238
rc = armci_copy_vector(GET, &dl, 1, proc);
239
if(rc){ ARMCI_UNLOCKMEM(proc); return(rc);}
240
/* update source array for accumulate */
241
dl.src_ptr_array = dr.src_ptr_array +j;
242
/* do scatter accumulate updating copy of data in buffer */
243
armci_scatter_acc(op, scale, dl, armci_me, 0);
244
/* modify descriptor-now source becomes destination for PUT*/
245
dl.dst_ptr_array = dr.dst_ptr_array + j;
246
dl.src_ptr_array = pwork;
248
rc = armci_copy_vector(PUT, &dl, 1, proc);
250
if(rc){ ARMCI_UNLOCKMEM(proc); return(rc);}
252
ARMCI_UNLOCKMEM(proc);
264
/*
 * Copy every block pair of the descriptors darr[0..len-1], one block at a
 * time.  Three transfer loops are visible:
 *  - local/shared memory: armci_copy(src, dst, bytes);
 *  - PUT: UPDATE_FENCE_STATE(proc, PUT, ptr_array_len) once per descriptor,
 *    then armci_put(src, dst, bytes, proc) per block;
 *  - GET: armci_get(src, dst, ...) per block.
 * An unrecognised op aborts through armci_die("armci_copy_vector: wrong
 * optype").
 * NOTE(review): the tests that choose among the loops (presumably on shmem
 * and op) and the return statement are not among the lines shown here.
 */
int armci_copy_vector(int op, /* operation code */
265
armci_giov_t darr[], /* descriptor array */
266
int len, /* length of descriptor array */
267
int proc /* remote process(or) ID */
270
int i,s,shmem= SAMECLUSNODE(proc);
271
int armci_th_idx = ARMCI_THREAD_IDX;
274
/* local/shared memory copy */
275
for(i = 0; i< len; i++){
276
for( s=0; s< darr[i].ptr_array_len; s++){
277
armci_copy(darr[i].src_ptr_array[s],darr[i].dst_ptr_array[s],darr[i].bytes);
285
for(i = 0; i< len; i++){
287
UPDATE_FENCE_STATE(proc, PUT, darr[i].ptr_array_len);
289
for( s=0; s< darr[i].ptr_array_len; s++){
290
armci_put(darr[i].src_ptr_array[s],darr[i].dst_ptr_array[s],
291
darr[i].bytes, proc);
296
for(i = 0; i< len; i++){
297
for( s=0; s< darr[i].ptr_array_len; s++){
298
armci_get(darr[i].src_ptr_array[s],darr[i].dst_ptr_array[s],
304
armci_die("armci_copy_vector: wrong optype",op);
312
/*
 * Pack the source blocks of darr[0..len-1] back to back into buf.
 * Blocks are copied in descriptor order, then in pointer-array order, and
 * the write cursor advances by darr[i].bytes after each block.  No size is
 * passed for buf, so the caller must provide room for the sum of
 * ptr_array_len*bytes over all descriptors.
 */
void armci_vector_to_buf(armci_giov_t darr[], int len, void* buf)
315
char *ptr = (char*)buf;
316
for(i = 0; i< len; i++){
317
for( s=0; s< darr[i].ptr_array_len; s++){
318
armci_copy(darr[i].src_ptr_array[s],ptr,darr[i].bytes);
319
ptr += darr[i].bytes;
325
/*
 * Unpack buf into the destination blocks of darr[0..len-1].
 * Reads consecutive darr[i].bytes-sized pieces from buf in the same order
 * armci_vector_to_buf writes them (descriptor order, then pointer-array
 * order) and copies each piece to darr[i].dst_ptr_array[s].
 */
void armci_vector_from_buf(armci_giov_t darr[], int len, void* buf)
328
char *ptr = (char*)buf;
330
for(i = 0; i< len; i++){
331
for( s=0; s< darr[i].ptr_array_len; s++){
332
armci_copy(ptr, darr[i].dst_ptr_array[s],darr[i].bytes);
333
ptr += darr[i].bytes;
338
/*
 * PARMCI_PutV: blocking vector put of the descriptors darr[0..len-1] to
 * process proc.
 * Visible argument checks and their return codes:
 *   len < 1 -> FAIL; NULL src/dst pointer array -> FAIL2; bytes < 1 -> FAIL3;
 *   ptr_array_len < 1 -> FAIL4; proc outside [0, armci_nproc) -> FAIL5.
 * After ORDER(PUT,proc), two transfer paths are visible: DO_FENCE with
 * DIRECT_PUT (only when proc is not on the same cluster node) followed by
 * armci_copy_vector, and DO_FENCE with SERVER_PUT followed by
 * armci_pack_vector.
 * NOTE(review): the loop over i, the test that selects between the two
 * paths (presumably on `direct`) and the final return are not among the
 * lines shown here.
 */
int PARMCI_PutV( armci_giov_t darr[], /* descriptor array */
339
int len, /* length of descriptor array */
340
int proc /* remote process(or) ID */
343
int rc=0, i,direct=1;
344
if(len<1) return FAIL;
346
if(darr[i].src_ptr_array == NULL || darr[i].dst_ptr_array ==NULL) return FAIL2;
347
if(darr[i].bytes<1)return FAIL3;
348
if(darr[i].ptr_array_len <1) return FAIL4;
351
if(proc<0 || proc >= armci_nproc)return FAIL5;
353
ORDER(PUT,proc); /* ensure ordering */
354
direct=SAMECLUSNODE(proc);
357
if(!SAMECLUSNODE(proc))DO_FENCE(proc,DIRECT_PUT);
358
rc = armci_copy_vector(PUT, darr, len, proc);
361
DO_FENCE(proc,SERVER_PUT);
362
rc = armci_pack_vector(PUT, NULL, darr, len, proc,NULL);
371
/*
 * PARMCI_GetV: blocking vector get of the descriptors darr[0..len-1] from
 * process proc.
 * Visible argument checks and their return codes:
 *   len < 1 -> FAIL; NULL src/dst pointer array -> FAIL2; bytes < 1 -> FAIL3;
 *   ptr_array_len < 1 -> FAIL4; proc outside [0, armci_nproc) -> FAIL5.
 * After ORDER(GET,proc), two transfer paths are visible: DO_FENCE with
 * DIRECT_GET (only when proc is not on the same cluster node) followed by
 * armci_copy_vector, and DO_FENCE with SERVER_GET followed by
 * armci_pack_vector.
 * NOTE(review): the loop over i, the test that selects between the two
 * paths (presumably on `direct`) and the final return are not among the
 * lines shown here.
 */
int PARMCI_GetV( armci_giov_t darr[], /* descriptor array */
372
int len, /* length of descriptor array */
373
int proc /* remote process(or) ID */
376
int rc=0, i,direct=1;
378
if(len<1) return FAIL;
380
if(darr[i].src_ptr_array==NULL ||darr[i].dst_ptr_array==NULL)return FAIL2;
381
if(darr[i].bytes<1)return FAIL3;
382
if(darr[i].ptr_array_len <1) return FAIL4;
385
if(proc<0 || proc >= armci_nproc)return FAIL5;
387
ORDER(GET,proc); /* ensure ordering */
389
direct=SAMECLUSNODE(proc);
393
if(!SAMECLUSNODE(proc))DO_FENCE(proc,DIRECT_GET);
394
rc = armci_copy_vector(GET, darr, len, proc);
397
DO_FENCE(proc,SERVER_GET);
398
rc = armci_pack_vector(GET, NULL, darr, len, proc,NULL);
408
/*
 * PARMCI_AccV: blocking vector accumulate of the descriptors darr[0..len-1]
 * into process proc, using operation op and scale factor *scale.
 * Visible argument checks and their return codes:
 *   len < 1 -> FAIL; NULL src/dst pointer array -> FAIL2; bytes < 1 -> FAIL3;
 *   ptr_array_len < 1 -> FAIL4; proc outside [0, armci_nproc) -> FAIL5.
 * direct is set from SAMECLUSNODE(proc); when ACC_COPY is defined without
 * ACC_SMP it is cleared again for any proc other than armci_me.
 * Two transfer paths are visible: armci_acc_vector, and DO_FENCE with
 * SERVER_PUT followed by armci_pack_vector.
 * NOTE(review): the loop over i, the test that selects between the two
 * paths (presumably on `direct`) and the final return are not among the
 * lines shown here.
 */
int PARMCI_AccV( int op, /* operation code */
409
void *scale, /*scaling factor for accumulate */
410
armci_giov_t darr[], /* descriptor array */
411
int len, /* length of descriptor array */
412
int proc /* remote process(or) ID */
415
int rc=0, i,direct=0;
417
if(len<1) return FAIL;
419
if(darr[i].src_ptr_array==NULL ||darr[i].dst_ptr_array==NULL)return FAIL2;
420
if(darr[i].bytes<1)return FAIL3;
421
if(darr[i].ptr_array_len <1) return FAIL4;
424
if(proc<0 || proc >= armci_nproc)return FAIL5;
426
ORDER(op,proc); /* ensure ordering */
427
direct=SAMECLUSNODE(proc);
428
# if defined(ACC_COPY) && !defined(ACC_SMP)
429
if(armci_me != proc) direct=0;
433
rc = armci_acc_vector( op, scale, darr, len, proc);
435
DO_FENCE(proc,SERVER_PUT);
436
rc = armci_pack_vector(op, scale, darr, len, proc,NULL);
444
/*****************************************************************************/
446
/*\ Non-blocking vector API \*/
448
/*
 * PARMCI_NbPutV: non-blocking vector put of the descriptors darr[0..len-1]
 * to process proc; usr_hdl is the caller's request handle, used internally
 * as an armci_ihdl_t.
 * Visible argument checks and their return codes:
 *   len < 1 -> FAIL; NULL src/dst pointer array -> FAIL2; bytes < 1 -> FAIL3;
 *   ptr_array_len < 1 -> FAIL4; proc outside [0, armci_nproc) -> FAIL5.
 * When a handle is given and its agg_flag is SET, the request is recorded
 * with armci_agg_save_giov_descriptor.  Otherwise the visible lines call
 * UPDATE_FENCE_INFO(proc), fill in the handle's tag, proc and bufid
 * (NB_NONE), or obtain a handle from armci_set_implicit_handle(PUT, proc).
 * Two transfer paths are visible: DO_FENCE with DIRECT_PUT (only when proc
 * is not on the same cluster node) followed by armci_copy_vector, and
 * DO_FENCE with SERVER_NBPUT followed by armci_pack_vector with the handle.
 * NOTE(review): the loop over i, the conditions around the handle setup
 * and path selection, and the final return are not among the lines shown
 * here.
 */
int PARMCI_NbPutV( armci_giov_t darr[], /* descriptor array */
449
int len, /* length of descriptor array */
450
int proc, /* remote process(or) ID */
451
armci_hdl_t* usr_hdl /*non-blocking request handle*/
454
armci_ihdl_t nb_handle = (armci_ihdl_t)usr_hdl;
455
int rc=0, i,direct=1;
457
if(len<1) return FAIL;
459
if(darr[i].src_ptr_array == NULL || darr[i].dst_ptr_array ==NULL) return FAIL2;
460
if(darr[i].bytes<1)return FAIL3;
461
if(darr[i].ptr_array_len <1) return FAIL4;
464
if(proc<0 || proc >= armci_nproc)return FAIL5;
466
direct=SAMECLUSNODE(proc);
468
if(nb_handle && nb_handle->agg_flag == SET) {
470
rc=armci_agg_save_giov_descriptor(darr, len, proc, PUT, nb_handle);
476
/*ORDER(PUT,proc); ensure ordering */
477
UPDATE_FENCE_INFO(proc);
479
/*set tag and op in the nb handle*/
481
nb_handle->tag = GET_NEXT_NBTAG();
483
nb_handle->proc= proc;
484
nb_handle->bufid=NB_NONE;
487
nb_handle = armci_set_implicit_handle(PUT, proc);
491
if(!SAMECLUSNODE(proc))DO_FENCE(proc,DIRECT_PUT);
492
rc = armci_copy_vector(PUT, darr, len, proc);
495
DO_FENCE(proc,SERVER_NBPUT);
496
rc = armci_pack_vector(PUT, NULL, darr, len, proc,nb_handle);
503
/*
 * PARMCI_NbGetV: non-blocking vector get of the descriptors darr[0..len-1]
 * from process proc; usr_hdl is the caller's request handle, used
 * internally as an armci_ihdl_t.
 * Visible argument checks and their return codes:
 *   len < 1 -> FAIL; NULL src/dst pointer array -> FAIL2; bytes < 1 -> FAIL3;
 *   ptr_array_len < 1 -> FAIL4; proc outside [0, armci_nproc) -> FAIL5.
 * When a handle is given and its agg_flag is SET, the request is recorded
 * with armci_agg_save_giov_descriptor.  Otherwise the visible lines fill in
 * the handle's tag, proc and bufid (NB_NONE), or obtain a handle from
 * armci_set_implicit_handle(GET, proc).
 * Two transfer paths are visible: DO_FENCE with DIRECT_GET (only when proc
 * is not on the same cluster node) followed by armci_copy_vector, and
 * DO_FENCE with SERVER_NBGET followed by armci_pack_vector with the handle.
 * NOTE(review): the loop over i, the conditions around the handle setup
 * and path selection, and the final return are not among the lines shown
 * here.
 */
int PARMCI_NbGetV( armci_giov_t darr[], /* descriptor array */
504
int len, /* length of descriptor array */
505
int proc, /* remote process(or) ID */
506
armci_hdl_t* usr_hdl /*non-blocking request handle*/
509
armci_ihdl_t nb_handle = (armci_ihdl_t)usr_hdl;
510
int rc=0, i,direct=1;
512
if(len<1) return FAIL;
514
if(darr[i].src_ptr_array==NULL ||darr[i].dst_ptr_array==NULL)return FAIL2;
515
if(darr[i].bytes<1)return FAIL3;
516
if(darr[i].ptr_array_len <1) return FAIL4;
519
if(proc<0 || proc >= armci_nproc)return FAIL5;
521
direct=SAMECLUSNODE(proc);
524
if(nb_handle && nb_handle->agg_flag == SET) {
526
rc=armci_agg_save_giov_descriptor(darr, len, proc, GET, nb_handle);
531
/* ORDER(GET,proc); ensure ordering */
533
nb_handle->tag = GET_NEXT_NBTAG();
535
nb_handle->proc= proc;
536
nb_handle->bufid=NB_NONE;
539
nb_handle = armci_set_implicit_handle(GET, proc);
543
if(!SAMECLUSNODE(proc))DO_FENCE(proc,DIRECT_GET);
544
rc = armci_copy_vector(GET, darr, len, proc);
547
DO_FENCE(proc,SERVER_NBGET);
548
rc = armci_pack_vector(GET, NULL, darr, len, proc,nb_handle);
556
/*
 * PARMCI_NbAccV: non-blocking vector accumulate of the descriptors
 * darr[0..len-1] into process proc, using operation op and scale factor
 * *scale; usr_hdl is the caller's request handle, used internally as an
 * armci_ihdl_t.
 * Visible argument checks and their return codes:
 *   len < 1 -> FAIL; NULL src/dst pointer array -> FAIL2; bytes < 1 -> FAIL3;
 *   ptr_array_len < 1 -> FAIL4; proc outside [0, armci_nproc) -> FAIL5.
 * After UPDATE_FENCE_INFO(proc) the visible lines fill in the handle's tag,
 * proc and bufid (NB_NONE), or obtain a handle from
 * armci_set_implicit_handle(op, proc).  direct is set from
 * SAMECLUSNODE(proc); when ACC_COPY is defined without ACC_SMP it is
 * cleared again for any proc other than armci_me.
 * Two transfer paths are visible: armci_acc_vector, and DO_FENCE with
 * SERVER_NBPUT followed by armci_pack_vector with the handle.
 * NOTE(review): the loop over i, the conditions around the handle setup
 * (including any NULL check before nb_handle is dereferenced) and path
 * selection, and the final return are not among the lines shown here.
 */
int PARMCI_NbAccV( int op, /* operation code */
557
void *scale, /*scaling factor for accumulate */
558
armci_giov_t darr[], /* descriptor array */
559
int len, /* length of descriptor array */
560
int proc, /* remote process(or) ID */
561
armci_hdl_t* usr_hdl /*non-blocking request handle*/
564
armci_ihdl_t nb_handle = (armci_ihdl_t)usr_hdl;
565
int rc=0, i,direct=1;
567
if(len<1) return FAIL;
570
if(darr[i].src_ptr_array==NULL ||darr[i].dst_ptr_array==NULL)return FAIL2;
571
if(darr[i].bytes<1)return FAIL3;
572
if(darr[i].ptr_array_len <1) return FAIL4;
575
if(proc<0 || proc >= armci_nproc)return FAIL5;
577
UPDATE_FENCE_INFO(proc);
578
direct=SAMECLUSNODE(proc);
581
nb_handle->tag = GET_NEXT_NBTAG();
583
nb_handle->proc= proc;
584
nb_handle->bufid=NB_NONE;
587
nb_handle = armci_set_implicit_handle(op, proc);
589
# if defined(ACC_COPY) && !defined(ACC_SMP)
590
if(armci_me != proc) direct=0;
594
rc = armci_acc_vector( op, scale, darr, len, proc);
596
DO_FENCE(proc,SERVER_NBPUT);
597
rc = armci_pack_vector(op, scale, darr, len, proc,nb_handle);
603
/*****************************************************************************/