5
/* $Id: vector.c,v 1.32.6.4 2007-08-29 17:32:32 manoj Exp $ */
14
/* Protocol codes compared against / passed as the __prot argument of DO_FENCE.
 * Only the non-blocking (NB) codes appear in the lines shown here; SERVER_GET,
 * DIRECT_GET, SERVER_PUT and DIRECT_PUT are used below but their definitions
 * are not among the visible lines. */
#define SERVER_NBGET 2
16
#define DIRECT_NBGET 4
18
#define SERVER_NBPUT 6
20
#define DIRECT_NBPUT 8
23
/* DO_FENCE(__proc, __prot): protocol-switch fence bookkeeping for process
 * __proc.
 *  - __prot == SERVER_GET or SERVER_PUT: no fence action (empty statements).
 *  - __prot == DIRECT_GET, DIRECT_NBGET, DIRECT_PUT or DIRECT_NBPUT: calls
 *    ARMCI_DoFence(__proc) when armci_prot_switch_fence[__proc] currently
 *    holds SERVER_PUT.
 * The last line stores __prot into armci_prot_switch_fence[__proc].
 *
 * NOTE(review): the expansion is an if/else chain followed by an assignment
 * and is not wrapped as a single statement.  At call sites of the form
 * "if(cond) DO_FENCE(...);" only the if/else chain appears to be guarded by
 * cond while the assignment runs unconditionally -- confirm this is intended.
 * __proc and __prot are expanded unparenthesized and more than once, so pass
 * only simple, side-effect-free expressions. */
# define DO_FENCE(__proc,__prot) if(__prot==SERVER_GET);\
24
else if(__prot==SERVER_PUT);\
25
else if(__prot==DIRECT_GET || __prot==DIRECT_NBGET){\
26
if(armci_prot_switch_fence[__proc]==SERVER_PUT)\
27
ARMCI_DoFence(__proc);\
29
else if(__prot==DIRECT_PUT || __prot==DIRECT_NBPUT){\
30
if(armci_prot_switch_fence[__proc]==SERVER_PUT)\
31
ARMCI_DoFence(__proc);\
34
armci_prot_switch_fence[__proc]=__prot
36
/* defined in acc.h so don't redefine here
37
#ifndef ARMCI_COMPLEX_TYPES
51
/* Integer accumulate: dst[j] += (*scale) * src[j] for j in [0, elems).
 *   scale - points to the int scale factor
 *   elems - number of int elements to update
 *   src   - source array, read as int
 *   dst   - destination array, updated in place as int
 * NOTE(review): the comment preceding this function says these are defined in
 * acc.h and should not be redefined here -- confirm whether this definition
 * is actually compiled. */
void I_ACCUMULATE(void* scale, int elems, void*src, void* dst)
54
int *a=(int*)dst, *b=(int*)src;
55
int alpha = *(int*)scale;
57
for(j=0;j<elems;j++) a[j] += alpha*b[j];
62
/* ACCUMULATE(DTYPE, scale, elems, src, dst): scaled accumulate,
 * dst[j] += (*scale) * src[j] for j in [0, elems), with scale, src and dst
 * reinterpreted as pointers to DTYPE.  The loop index j is not a macro
 * argument; its declaration is not among the lines shown. */
#define ACCUMULATE( DTYPE, scale, elems, src, dst) {\
64
DTYPE *a =(DTYPE *)(dst);\
65
DTYPE *b =(DTYPE *)(src);\
66
DTYPE alpha = *(DTYPE *)(scale);\
67
for(j=0;j<(elems);j++)a[j] += alpha*b[j];\
70
/* ACCUMULATE_RA(DTYPE, elems, src, dst): unscaled variant that XORs every
 * source element into the destination, dst[j] ^= src[j] for j in [0, elems).
 * DTYPE must therefore be an integer type. */
#define ACCUMULATE_RA( DTYPE, elems, src, dst) {\
72
DTYPE *a =(DTYPE *)(dst);\
73
DTYPE *b =(DTYPE *)(src);\
74
for(j=0;j<(elems);j++)a[j] ^= b[j];\
77
/* CPL_ACCUMULATE(DTYPE, scale, elems, src, dst): complex scaled accumulate,
 * dst[j] += (*scale) * src[j] for j in [0, elems) using complex
 * multiplication.  DTYPE must provide .real and .imag members. */
#define CPL_ACCUMULATE( DTYPE, scale, elems, src, dst) {\
79
DTYPE *a =(DTYPE *)(dst);\
80
DTYPE *b =(DTYPE *)(src);\
81
DTYPE alpha = *(DTYPE *)(scale);\
82
for(j=0;j<(elems);j++){\
83
a[j].real += alpha.real*b[j].real - alpha.imag*b[j].imag;\
84
a[j].imag += alpha.imag*b[j].real + alpha.real*b[j].imag;\
88
/* Protocol-switch state declared extern here.  armci_prot_switch_fence is
 * indexed by process id in DO_FENCE; the two ints below are not referenced in
 * the lines shown. */
extern int* armci_prot_switch_fence;
89
extern int armci_prot_switch_preproc;
90
extern int armci_prot_switch_preop;
93
/*\ compute address range for memory to lock
95
/* Lock the address range spanned by a scatter list on process proc.
 *   ptr_array - base addresses of the segments
 *   len       - number of entries in ptr_array
 *   bytes     - size of each segment in bytes
 *   proc      - process passed to ARMCI_LOCKMEM
 * Every pointer is folded into pmin/pmax with ARMCI_MIN/ARMCI_MAX, pmax is
 * then advanced to the last byte of its segment (pmax + bytes - 1) and
 * ARMCI_LOCKMEM(pmin, pmax, proc) is called.  The initial values of pmin and
 * pmax are not among the lines shown. */
void armci_lockmem_scatter(void *ptr_array[], int len, int bytes, int proc)
103
for(i = 0; i< len; i++){
104
pmin = ARMCI_MIN(ptr_array[i],pmin);
105
pmax = ARMCI_MAX(ptr_array[i],pmax);
107
pmax = bytes-1 + (char*)pmax;
108
ARMCI_LOCKMEM(pmin, pmax, proc);
109
/* printf("%d: locked %ld-%ld bytes=%d\n",armci_me,pmin,pmax,
110
1+(char*)pmax -(char*)pmin);fflush(stdout); */
115
/* Apply an accumulate operation to every (source, destination) segment pair
 * of one descriptor.
 *   op     - accumulate operation code; presumably selects one of the typed
 *            branches below (the dispatch labels are not among the lines shown)
 *   scale  - pointer to the scale factor (not used by the ACCUMULATE_RA branch)
 *   dsc    - descriptor: src_ptr_array / dst_ptr_array hold ptr_array_len
 *            segment addresses, each segment being dsc.bytes long
 *   proc   - process whose memory is unlocked at the end
 *   lockit - when nonzero ARMCI_UNLOCKMEM(proc) is called after the update;
 *            the condition guarding the armci_lockmem_scatter call is not
 *            among the lines shown
 * In every branch the element count is dsc.bytes/size and armci_die is called
 * when dsc.bytes is not a multiple of the element size.  An unsupported op
 * also ends in armci_die. */
void armci_scatter_acc(int op, void *scale, armci_giov_t dsc,
116
int proc, int lockit)
118
/* ITERATOR: loop header visiting every segment index of dsc */
# define ITERATOR for(i = 0; i< dsc.ptr_array_len; i++)
121
armci_lockmem_scatter(dsc.dst_ptr_array, dsc.ptr_array_len,
126
/* int elements */
elems = dsc.bytes/size;
127
if(dsc.bytes%size) armci_die("ARMCI vector accumulate: bytes not consistent with datatype",dsc.bytes);
129
ACCUMULATE(int, scale, elems, dsc.src_ptr_array[i], dsc.dst_ptr_array[i])
135
/* long elements */
elems = dsc.bytes/size;
136
if(dsc.bytes%size) armci_die("ARMCI vector accumulate: bytes not consistent with datatype",dsc.bytes);
138
ACCUMULATE(long, scale, elems, dsc.src_ptr_array[i], dsc.dst_ptr_array[i])
143
/* double elements */
size = sizeof(double);
144
elems = dsc.bytes/size;
145
if(dsc.bytes%size) armci_die("ARMCI vector accumulate: bytes not consistent with datatype",dsc.bytes);
147
ACCUMULATE(double, scale, elems, dsc.src_ptr_array[i], dsc.dst_ptr_array[i])
152
/* double-precision complex elements (two doubles each) */
size = 2*sizeof(double);
153
elems = dsc.bytes/size;
154
if(dsc.bytes%size) armci_die("ARMCI vector accumulate: bytes not consistent with datatype",dsc.bytes);
156
CPL_ACCUMULATE(dcomplex_t, scale, elems, dsc.src_ptr_array[i], dsc.dst_ptr_array[i])
161
/* single-precision complex elements (two floats each) */
size = 2*sizeof(float);
162
elems = dsc.bytes/size;
163
if(dsc.bytes %size) armci_die("ARMCI vector accumulate: bytes not consistent with datatype",dsc.bytes);
165
CPL_ACCUMULATE(complex_t, scale, elems, dsc.src_ptr_array[i], dsc.dst_ptr_array[i])
170
/* float elements */
size = sizeof(float);
171
elems = dsc.bytes/size;
172
if(dsc.bytes%size) armci_die("ARMCI vector accumulate: bytes not consistent with datatype",dsc.bytes);
174
ACCUMULATE(float, scale, elems, dsc.src_ptr_array[i], dsc.dst_ptr_array[i])
179
/* long elements combined with XOR, no scale factor */
elems = dsc.bytes/size;
180
if(dsc.bytes%size) armci_die("ARMCI vector accumulate: bytes not consistent with datatype",dsc.bytes);
182
ACCUMULATE_RA(long,elems, dsc.src_ptr_array[i], dsc.dst_ptr_array[i])
185
default: armci_die("ARMCI vector accumulate: operation not supported",op);
188
if(lockit) ARMCI_UNLOCKMEM(proc);
193
/* Scratch array of PWORKLEN pointers; armci_acc_vector fills it with
 * addresses inside armci_internal_buffer.
 * NOTE(review): this is a single file-scope static array, so it is not
 * private to a caller -- confirm it is never used concurrently. */
# define PWORKLEN 2048
194
static void *pwork[PWORKLEN]; /* work array of pointers */
197
/* Accumulate a vector of descriptors into the memory of process proc.
 *
 * Visible paths:
 *  - direct: armci_scatter_acc is called on every descriptor with locking
 *    enabled; when ACC_COPY is defined this is wrapped in a proc == armci_me
 *    test.
 *  - ACC_COPY path, per descriptor:
 *      * segments larger than BUFSIZE/2 are handled one at a time by
 *        armci_acc_copy_strided;
 *      * otherwise the destination range is locked, then in chunks of at most
 *        nb = min(PWORKLEN, BUFSIZE/bytes) segments the remote data is fetched
 *        (GET) into armci_internal_buffer, the accumulate is applied to that
 *        local copy, and the result is written back (PUT); the lock is
 *        released afterwards.
 *
 * Returns the nonzero code of a failing armci_copy_vector call (the lock is
 * released first).  Other return statements are not among the lines shown. */
int armci_acc_vector(int op, /* operation code */
198
void *scale, /* pointer to scale factor in accumulate */
199
armci_giov_t darr[], /* descriptor array */
200
int len, /* length of descriptor array */
201
int proc /* remote process(or) ID */
206
#if defined(ACC_COPY)
207
if(proc == armci_me ){
209
for(i = 0; i< len; i++) armci_scatter_acc(op, scale, darr[i], proc, 1);
210
#if defined(ACC_COPY)
212
for(i = 0; i< len; i++){
213
armci_giov_t dr = darr[i];
215
if(dr.bytes > BUFSIZE/2){
216
/* for large segments use strided implementation */
217
for(j=0; j< dr.ptr_array_len; j++){
218
rc = armci_acc_copy_strided(op, scale,proc,
219
dr.src_ptr_array[j], NULL, dr.dst_ptr_array[j],NULL,
225
/*lock memory:should optimize it to lock only a chunk at a time*/
226
armci_lockmem_scatter(dr.dst_ptr_array, dr.ptr_array_len, dr.bytes, proc);
227
/* copy as many blocks as possible into the local buffer */
229
nb = ARMCI_MIN(PWORKLEN,BUFSIZE/dr.bytes);
230
for(j=0; j< dr.ptr_array_len; j+= nb){
231
int nblocks = ARMCI_MIN(nb, dr.ptr_array_len -j);
233
/* setup vector descriptor for remote memory copy
234
to bring data into buffer*/
235
dl.ptr_array_len = nblocks;
236
dl.src_ptr_array = dr.dst_ptr_array + j; /* GET destination becomes source for copy */
237
/* pwork[k] addresses the k-th dl.bytes-sized slot of the internal buffer */
for(k=0; k< nblocks; k++) pwork[k] = k*dl.bytes + (char*)armci_internal_buffer;
238
dl.dst_ptr_array = pwork;
239
/* get data to the local buffer */
240
rc = armci_copy_vector(GET, &dl, 1, proc);
241
if(rc){ ARMCI_UNLOCKMEM(proc); return(rc);}
242
/* update source array for accumulate */
243
dl.src_ptr_array = dr.src_ptr_array +j;
244
/* do scatter accumulate updating copy of data in buffer */
245
armci_scatter_acc(op, scale, dl, armci_me, 0);
246
/* modify descriptor-now source becomes destination for PUT*/
247
dl.dst_ptr_array = dr.dst_ptr_array + j;
248
dl.src_ptr_array = pwork;
250
rc = armci_copy_vector(PUT, &dl, 1, proc);
252
if(rc){ ARMCI_UNLOCKMEM(proc); return(rc);}
254
ARMCI_UNLOCKMEM(proc);
266
/* Copy every segment of every descriptor between this process and proc.
 *
 * Three transfer loops are visible:
 *  - a local/shared-memory loop using armci_copy,
 *  - a PUT loop that calls UPDATE_FENCE_STATE once per descriptor and then
 *    armci_put per segment,
 *  - a GET loop using armci_get per segment.
 * Any other op ends in armci_die("armci_copy_vector: wrong optype").
 * The conditions selecting between the loops (presumably shmem and op) and
 * the return statement are not among the lines shown. */
int armci_copy_vector(int op, /* operation code */
267
armci_giov_t darr[], /* descriptor array */
268
int len, /* length of descriptor array */
269
int proc /* remote process(or) ID */
272
int i,s,shmem= SAMECLUSNODE(proc);
273
int armci_th_idx = ARMCI_THREAD_IDX;
276
/* local/shared memory copy */
277
for(i = 0; i< len; i++){
278
for( s=0; s< darr[i].ptr_array_len; s++){
279
armci_copy(darr[i].src_ptr_array[s],darr[i].dst_ptr_array[s],darr[i].bytes);
287
for(i = 0; i< len; i++){
289
UPDATE_FENCE_STATE(proc, PUT, darr[i].ptr_array_len);
291
for( s=0; s< darr[i].ptr_array_len; s++){
292
armci_put(darr[i].src_ptr_array[s],darr[i].dst_ptr_array[s],
293
darr[i].bytes, proc);
298
for(i = 0; i< len; i++){
299
for( s=0; s< darr[i].ptr_array_len; s++){
300
armci_get(darr[i].src_ptr_array[s],darr[i].dst_ptr_array[s],
306
armci_die("armci_copy_vector: wrong optype",op);
314
/* Pack: copy the source segments of all descriptors, in descriptor and
 * segment order, back to back into buf.
 *   darr - descriptor array
 *   len  - number of descriptors
 *   buf  - destination buffer; no bounds check is made, so it must hold the
 *          sum of ptr_array_len * bytes over all descriptors */
void armci_vector_to_buf(armci_giov_t darr[], int len, void* buf)
317
char *ptr = (char*)buf;
318
for(i = 0; i< len; i++){
319
for( s=0; s< darr[i].ptr_array_len; s++){
320
armci_copy(darr[i].src_ptr_array[s],ptr,darr[i].bytes);
321
ptr += darr[i].bytes;
327
/* Unpack: copy consecutive chunks of buf into the destination segments of
 * all descriptors, in descriptor and segment order.
 *   darr - descriptor array
 *   len  - number of descriptors
 *   buf  - source buffer; it is read for the sum of ptr_array_len * bytes
 *          over all descriptors, with no bounds check */
void armci_vector_from_buf(armci_giov_t darr[], int len, void* buf)
330
char *ptr = (char*)buf;
332
for(i = 0; i< len; i++){
333
for( s=0; s< darr[i].ptr_array_len; s++){
334
armci_copy(ptr, darr[i].dst_ptr_array[s],darr[i].bytes);
335
ptr += darr[i].bytes;
340
/* Vector put: transfer the segments described by darr to process proc.
 *
 * Argument checks and their return codes:
 *   FAIL  - len < 1
 *   FAIL2 - a descriptor has a NULL src_ptr_array or dst_ptr_array
 *   FAIL3 - a descriptor has bytes < 1
 *   FAIL4 - a descriptor has ptr_array_len < 1
 *   FAIL5 - proc is outside [0, armci_nproc)
 * After ORDER(PUT, proc) two transfer paths are visible: armci_copy_vector
 * (preceded by DO_FENCE(proc, DIRECT_PUT) when proc is not on the same
 * cluster node) and DO_FENCE(proc, SERVER_PUT) followed by armci_pack_vector.
 * The condition selecting between them (presumably `direct`) and the final
 * return are not among the lines shown.
 * NOTE(review): the descriptor checks index darr with i, but the statement
 * that initializes and advances i is not visible here -- confirm every
 * descriptor in [0, len) is validated. */
int PARMCI_PutV( armci_giov_t darr[], /* descriptor array */
341
int len, /* length of descriptor array */
342
int proc /* remote process(or) ID */
345
int rc=0, i,direct=1;
346
if(len<1) return FAIL;
348
if(darr[i].src_ptr_array == NULL || darr[i].dst_ptr_array ==NULL) return FAIL2;
349
if(darr[i].bytes<1)return FAIL3;
350
if(darr[i].ptr_array_len <1) return FAIL4;
353
if(proc<0 || proc >= armci_nproc)return FAIL5;
355
ORDER(PUT,proc); /* ensure ordering */
356
direct=SAMECLUSNODE(proc);
359
if(!SAMECLUSNODE(proc))DO_FENCE(proc,DIRECT_PUT);
360
rc = armci_copy_vector(PUT, darr, len, proc);
363
DO_FENCE(proc,SERVER_PUT);
364
rc = armci_pack_vector(PUT, NULL, darr, len, proc,NULL);
373
/* Vector get: transfer the segments described by darr from process proc.
 *
 * Argument checks and their return codes:
 *   FAIL  - len < 1
 *   FAIL2 - a descriptor has a NULL src_ptr_array or dst_ptr_array
 *   FAIL3 - a descriptor has bytes < 1
 *   FAIL4 - a descriptor has ptr_array_len < 1
 *   FAIL5 - proc is outside [0, armci_nproc)
 * After ORDER(GET, proc) two transfer paths are visible: armci_copy_vector
 * (preceded by DO_FENCE(proc, DIRECT_GET) when proc is not on the same
 * cluster node) and DO_FENCE(proc, SERVER_GET) followed by armci_pack_vector.
 * The condition selecting between them (presumably `direct`) and the final
 * return are not among the lines shown.
 * NOTE(review): the descriptor checks index darr with i, but the statement
 * that initializes and advances i is not visible here -- confirm every
 * descriptor in [0, len) is validated. */
int PARMCI_GetV( armci_giov_t darr[], /* descriptor array */
374
int len, /* length of descriptor array */
375
int proc /* remote process(or) ID */
378
int rc=0, i,direct=1;
380
if(len<1) return FAIL;
382
if(darr[i].src_ptr_array==NULL ||darr[i].dst_ptr_array==NULL)return FAIL2;
383
if(darr[i].bytes<1)return FAIL3;
384
if(darr[i].ptr_array_len <1) return FAIL4;
387
if(proc<0 || proc >= armci_nproc)return FAIL5;
389
ORDER(GET,proc); /* ensure ordering */
391
direct=SAMECLUSNODE(proc);
395
if(!SAMECLUSNODE(proc))DO_FENCE(proc,DIRECT_GET);
396
rc = armci_copy_vector(GET, darr, len, proc);
399
DO_FENCE(proc,SERVER_GET);
400
rc = armci_pack_vector(GET, NULL, darr, len, proc,NULL);
410
/* Vector accumulate: combine the source segments described by darr into the
 * destination segments on process proc using operation op and *scale.
 *
 * Argument checks and their return codes:
 *   FAIL  - len < 1
 *   FAIL2 - a descriptor has a NULL src_ptr_array or dst_ptr_array
 *   FAIL3 - a descriptor has bytes < 1
 *   FAIL4 - a descriptor has ptr_array_len < 1
 *   FAIL5 - proc is outside [0, armci_nproc)
 * After ORDER(op, proc), direct is set from SAMECLUSNODE(proc); when ACC_COPY
 * is defined and ACC_SMP is not, it is cleared for any proc other than
 * armci_me.  Two transfer paths are visible: armci_acc_vector, and
 * DO_FENCE(proc, SERVER_PUT) followed by armci_pack_vector.  The condition
 * selecting between them (presumably `direct`) and the final return are not
 * among the lines shown.
 * NOTE(review): the descriptor checks index darr with i, but the statement
 * that initializes and advances i is not visible here -- confirm every
 * descriptor in [0, len) is validated. */
int PARMCI_AccV( int op, /* operation code */
411
void *scale, /*scaling factor for accumulate */
412
armci_giov_t darr[], /* descriptor array */
413
int len, /* length of descriptor array */
414
int proc /* remote process(or) ID */
417
int rc=0, i,direct=0;
419
if(len<1) return FAIL;
421
if(darr[i].src_ptr_array==NULL ||darr[i].dst_ptr_array==NULL)return FAIL2;
422
if(darr[i].bytes<1)return FAIL3;
423
if(darr[i].ptr_array_len <1) return FAIL4;
426
if(proc<0 || proc >= armci_nproc)return FAIL5;
428
ORDER(op,proc); /* ensure ordering */
429
direct=SAMECLUSNODE(proc);
430
# if defined(ACC_COPY) && !defined(ACC_SMP)
431
if(armci_me != proc) direct=0;
435
rc = armci_acc_vector( op, scale, darr, len, proc);
437
DO_FENCE(proc,SERVER_PUT);
438
rc = armci_pack_vector(op, scale, darr, len, proc,NULL);
446
/*****************************************************************************/
448
/*\ Non-blocking vector API
450
/* Non-blocking vector put to process proc.
 *
 * usr_hdl is used as the internal handle nb_handle.  Argument checks and
 * their return codes:
 *   FAIL  - len < 1
 *   FAIL2 - a descriptor has a NULL src_ptr_array or dst_ptr_array
 *   FAIL3 - a descriptor has bytes < 1
 *   FAIL4 - a descriptor has ptr_array_len < 1
 *   FAIL5 - proc is outside [0, armci_nproc)
 * When a handle is supplied and its agg_flag is SET, the descriptors are
 * handed to armci_agg_save_giov_descriptor.  Otherwise UPDATE_FENCE_INFO is
 * invoked and the handle receives a fresh tag, proc and bufid = NB_NONE, or
 * an implicit handle is obtained with armci_set_implicit_handle (presumably
 * when usr_hdl is NULL; the test is not among the lines shown).  The data
 * then moves through armci_copy_vector or, after
 * DO_FENCE(proc, SERVER_NBPUT), through armci_pack_vector with the handle.
 * NOTE(review): the descriptor checks index darr with i, but the statement
 * that initializes and advances i is not visible here -- confirm every
 * descriptor in [0, len) is validated. */
int PARMCI_NbPutV( armci_giov_t darr[], /* descriptor array */
451
int len, /* length of descriptor array */
452
int proc, /* remote process(or) ID */
453
armci_hdl_t* usr_hdl /*non-blocking request handle*/
456
armci_ihdl_t nb_handle = (armci_ihdl_t)usr_hdl;
457
int rc=0, i,direct=1;
459
if(len<1) return FAIL;
461
if(darr[i].src_ptr_array == NULL || darr[i].dst_ptr_array ==NULL) return FAIL2;
462
if(darr[i].bytes<1)return FAIL3;
463
if(darr[i].ptr_array_len <1) return FAIL4;
466
if(proc<0 || proc >= armci_nproc)return FAIL5;
468
direct=SAMECLUSNODE(proc);
470
if(nb_handle && nb_handle->agg_flag == SET) {
472
rc=armci_agg_save_giov_descriptor(darr, len, proc, PUT, nb_handle);
478
/*ORDER(PUT,proc); ensure ordering */
479
UPDATE_FENCE_INFO(proc);
481
/*set tag and op in the nb handle*/
483
nb_handle->tag = GET_NEXT_NBTAG();
485
nb_handle->proc= proc;
486
nb_handle->bufid=NB_NONE;
489
nb_handle = armci_set_implicit_handle(PUT, proc);
493
if(!SAMECLUSNODE(proc))DO_FENCE(proc,DIRECT_PUT);
494
rc = armci_copy_vector(PUT, darr, len, proc);
497
DO_FENCE(proc,SERVER_NBPUT);
498
rc = armci_pack_vector(PUT, NULL, darr, len, proc,nb_handle);
505
/* Non-blocking vector get from process proc.
 *
 * usr_hdl is used as the internal handle nb_handle.  Argument checks and
 * their return codes:
 *   FAIL  - len < 1
 *   FAIL2 - a descriptor has a NULL src_ptr_array or dst_ptr_array
 *   FAIL3 - a descriptor has bytes < 1
 *   FAIL4 - a descriptor has ptr_array_len < 1
 *   FAIL5 - proc is outside [0, armci_nproc)
 * When a handle is supplied and its agg_flag is SET, the descriptors are
 * handed to armci_agg_save_giov_descriptor.  Otherwise the handle receives a
 * fresh tag, proc and bufid = NB_NONE, or an implicit handle is obtained with
 * armci_set_implicit_handle (presumably when usr_hdl is NULL; the test is not
 * among the lines shown).  The data then moves through armci_copy_vector or,
 * after DO_FENCE(proc, SERVER_NBGET), through armci_pack_vector with the
 * handle.
 * NOTE(review): the descriptor checks index darr with i, but the statement
 * that initializes and advances i is not visible here -- confirm every
 * descriptor in [0, len) is validated. */
int PARMCI_NbGetV( armci_giov_t darr[], /* descriptor array */
506
int len, /* length of descriptor array */
507
int proc, /* remote process(or) ID */
508
armci_hdl_t* usr_hdl /*non-blocking request handle*/
511
armci_ihdl_t nb_handle = (armci_ihdl_t)usr_hdl;
512
int rc=0, i,direct=1;
514
if(len<1) return FAIL;
516
if(darr[i].src_ptr_array==NULL ||darr[i].dst_ptr_array==NULL)return FAIL2;
517
if(darr[i].bytes<1)return FAIL3;
518
if(darr[i].ptr_array_len <1) return FAIL4;
521
if(proc<0 || proc >= armci_nproc)return FAIL5;
523
direct=SAMECLUSNODE(proc);
526
if(nb_handle && nb_handle->agg_flag == SET) {
528
rc=armci_agg_save_giov_descriptor(darr, len, proc, GET, nb_handle);
533
/* ORDER(GET,proc); ensure ordering */
535
nb_handle->tag = GET_NEXT_NBTAG();
537
nb_handle->proc= proc;
538
nb_handle->bufid=NB_NONE;
541
nb_handle = armci_set_implicit_handle(GET, proc);
545
if(!SAMECLUSNODE(proc))DO_FENCE(proc,DIRECT_GET);
546
rc = armci_copy_vector(GET, darr, len, proc);
549
DO_FENCE(proc,SERVER_NBGET);
550
rc = armci_pack_vector(GET, NULL, darr, len, proc,nb_handle);
558
/* Non-blocking vector accumulate into process proc using operation op and
 * *scale.
 *
 * usr_hdl is used as the internal handle nb_handle.  Argument checks and
 * their return codes:
 *   FAIL  - len < 1
 *   FAIL2 - a descriptor has a NULL src_ptr_array or dst_ptr_array
 *   FAIL3 - a descriptor has bytes < 1
 *   FAIL4 - a descriptor has ptr_array_len < 1
 *   FAIL5 - proc is outside [0, armci_nproc)
 * After UPDATE_FENCE_INFO(proc) the handle receives a fresh tag, proc and
 * bufid = NB_NONE, or an implicit handle is obtained with
 * armci_set_implicit_handle.  direct is set from SAMECLUSNODE(proc) and, when
 * ACC_COPY is defined and ACC_SMP is not, cleared for any proc other than
 * armci_me.  The data then moves through armci_acc_vector or, after
 * DO_FENCE(proc, SERVER_NBPUT), through armci_pack_vector with the handle.
 * NOTE(review): nb_handle is dereferenced below and may come from a NULL
 * usr_hdl; the guarding condition is not among the lines shown -- confirm it
 * exists.  Likewise the statement that initializes and advances i for the
 * descriptor checks is not visible. */
int PARMCI_NbAccV( int op, /* operation code */
559
void *scale, /*scaling factor for accumulate */
560
armci_giov_t darr[], /* descriptor array */
561
int len, /* length of descriptor array */
562
int proc, /* remote process(or) ID */
563
armci_hdl_t* usr_hdl /*non-blocking request handle*/
566
armci_ihdl_t nb_handle = (armci_ihdl_t)usr_hdl;
567
int rc=0, i,direct=1;
569
if(len<1) return FAIL;
572
if(darr[i].src_ptr_array==NULL ||darr[i].dst_ptr_array==NULL)return FAIL2;
573
if(darr[i].bytes<1)return FAIL3;
574
if(darr[i].ptr_array_len <1) return FAIL4;
577
if(proc<0 || proc >= armci_nproc)return FAIL5;
579
UPDATE_FENCE_INFO(proc);
580
direct=SAMECLUSNODE(proc);
583
nb_handle->tag = GET_NEXT_NBTAG();
585
nb_handle->proc= proc;
586
nb_handle->bufid=NB_NONE;
589
nb_handle = armci_set_implicit_handle(op, proc);
591
# if defined(ACC_COPY) && !defined(ACC_SMP)
592
if(armci_me != proc) direct=0;
596
rc = armci_acc_vector( op, scale, darr, len, proc);
598
DO_FENCE(proc,SERVER_NBPUT);
599
rc = armci_pack_vector(op, scale, darr, len, proc,nb_handle);
605
/*****************************************************************************/