15
/* Data-server configuration switches.  DATA_SERVER_GET_ receives two
 * alternative definitions (1 and 0) around ORNL_USE_DS_FOR_REMOTE_GETS;
 * NOTE(review): the directives separating the two alternatives are not
 * present in this listing. */
#define DATA_SERVER_ 1
17
#ifdef ORNL_USE_DS_FOR_REMOTE_GETS
18
#define DATA_SERVER_GET_ 1
20
#define DATA_SERVER_GET_ 0
23
/* ARMCI_OP_2D: perform one 2-D patch operation.
 *   - GET and PUT are routed to armci_copy_2D;
 *   - any other op with count==1 is routed to armci_acc_1D;
 *   - armci_acc_2D is the remaining alternative.
 * NOTE(review): the expansion is a bare if/else chain whose arguments are
 * not parenthesized, so it is fragile as the body of an unbraced if/else
 * at a call site. */
#define ARMCI_OP_2D(op, scale, proc, src, dst, bytes, count, src_stride, dst_stride,lockit)\
24
if(op == GET || op ==PUT)\
25
armci_copy_2D(op, proc, src, dst, bytes, count, src_stride,dst_stride);\
26
else if(count==1) armci_acc_1D(op, scale, proc, src, dst, bytes,lockit);\
28
armci_acc_2D(op, scale, proc, src, dst, bytes, count, src_stride,dst_stride,lockit)
30
/* CAN_REQUEST_DIRECTLY supports run-time selection of the request sending
 * scheme: with CLIENT_BUF_BYPASS it is the variable _armci_bypass; the other
 * two definitions below fix it to 0 or 1 (the conditions selecting between
 * them are not visible in this listing). */
31
#if defined(CLIENT_BUF_BYPASS)
32
#define CAN_REQUEST_DIRECTLY _armci_bypass
35
# define CAN_REQUEST_DIRECTLY 0
37
# define CAN_REQUEST_DIRECTLY 1
41
/* PREPROCESS_STRIDED / POSTPROCESS_STRIDED bracket a strided operation and
 * act on the caller's locals stride_levels, count, seg_count, src_stride_arr
 * and dst_stride_arr.
 *
 * PREPROCESS_STRIDED (visible statements only):
 *   - lowers stride_levels while count[stride_levels] is not greater than 1;
 *   - if levels remain and count[0] equals both src_stride_arr[0] and
 *     dst_stride_arr[0], saves seg_count[1] in tmp_count, replaces
 *     seg_count[1] with seg_count[0]*seg_count[1], and advances both stride
 *     array pointers by one entry.
 * POSTPROCESS_STRIDED writes tmp_count back to seg_count[1] when tmp_count
 * is non-zero, undoing the in-place change to the caller's array.
 * NOTE(review): POSTPROCESS_STRIDED expands to a bare `if`. */
#define PREPROCESS_STRIDED(tmp_count) {\
44
for(;stride_levels;stride_levels--)if(count[stride_levels]>1)break;\
45
if(stride_levels&&(count[0]==src_stride_arr[0]&&count[0]==dst_stride_arr[0])){\
46
tmp_count=seg_count[1];\
48
seg_count[1] = seg_count[0] * seg_count[1];\
50
src_stride_arr ++; dst_stride_arr++ ;\
53
#define POSTPROCESS_STRIDED(tmp_count) if(tmp_count)seg_count[1]=tmp_count
56
/* Protocol identifiers for the non-blocking transfer paths; they are passed
 * as the __prot argument of DO_FENCE.
 *
 * DO_FENCE(__proc, __prot):
 *   - SERVER_GET and SERVER_PUT take no action;
 *   - a direct get or direct put (blocking or non-blocking) calls
 *     ARMCI_DoFence(__proc) when the value stored in
 *     armci_prot_switch_fence[__proc] is SERVER_PUT;
 *   - the last line stores __prot in armci_prot_switch_fence[__proc].
 * NOTE(review): the expansion is more than one statement, so in
 * `if(c) DO_FENCE(p,x);` the trailing store presumably is not guarded by
 * `c` -- confirm against the complete macro text. */
#define SERVER_NBGET 2
58
#define DIRECT_NBGET 4
60
#define SERVER_NBPUT 6
62
#define DIRECT_NBPUT 8
65
# define DO_FENCE(__proc,__prot) if(__prot==SERVER_GET);\
66
else if(__prot==SERVER_PUT);\
67
else if(__prot==DIRECT_GET || __prot==DIRECT_NBGET){\
68
if(armci_prot_switch_fence[__proc]==SERVER_PUT)\
69
ARMCI_DoFence(__proc);\
71
else if(__prot==DIRECT_PUT || __prot==DIRECT_NBPUT){\
72
if(armci_prot_switch_fence[__proc]==SERVER_PUT)\
73
ARMCI_DoFence(__proc);\
76
armci_prot_switch_fence[__proc]=__prot
78
/* Memory-handle plumbing.
 *   - Without REGIONS_REQUIRE_MEMHDL, ARMCI_MEMHDL_T is defined as void.
 *   - mhloc / mhrem are file-scope handle pointers, initialized to NULL;
 *     the ARMCI_*REM_GET macros and ARMCI_REGION_BOTH_FOUND reference them.
 *   - ARMCI_REGION_BOTH_FOUND has two alternative definitions: one forwards
 *     to armci_region_both_found_hndl and passes &mhloc and &mhrem, the
 *     other forwards to armci_region_both_found with the four arguments. */
#ifndef REGIONS_REQUIRE_MEMHDL
79
# define ARMCI_MEMHDL_T void
82
ARMCI_MEMHDL_T *mhloc=NULL,*mhrem=NULL;
84
#ifdef REGIONS_REQUIRE_MEMHDL
85
int armci_region_both_found_hndl(void *loc, void *rem, int size, int node,
86
ARMCI_MEMHDL_T **loc_memhdl,ARMCI_MEMHDL_T **rem_memhdl);
87
# define ARMCI_REGION_BOTH_FOUND(_s,_d,_b,_p) \
88
armci_region_both_found_hndl((_s),(_d),(_b),(_p),&mhloc,&mhrem)
90
# define ARMCI_REGION_BOTH_FOUND(_s,_d,_b,_p) \
91
armci_region_both_found((_s),(_d),(_b),(_p))
96
/* Remote-get entry points.
 * armci_client_direct_get is declared twice with the same leading
 * parameters; the two prototypes differ only in the type of the two trailing
 * handle parameters (ARMCI_MEMHDL_T* versus void*).
 *
 * ARMCI_NBREM_GET / ARMCI_REM_GET, first pair: call armci_client_direct_get
 * with the first count entry as the length; the non-blocking form passes the
 * handle's cmpl_info address and tag, the blocking form passes NULL and 0.
 * Second pair: ARMCI_REM_GET forwards all arguments to armci_rem_get and
 * ARMCI_NBREM_GET is an alias of ARMCI_REM_GET.
 * Both pairs pass the file-scope mhloc / mhrem handles.
 * NOTE(review): the conditions selecting between the pairs are not visible
 * in this listing. */
# ifdef REGIONS_REQUIRE_MEMHDL
97
void armci_client_direct_get(int p, void *src_buf, void *dst_buf, int len,
98
void** cptr,int nbtag,ARMCI_MEMHDL_T *lochdl,ARMCI_MEMHDL_T *remhdl);
100
void armci_client_direct_get(int p, void *src_buf, void *dst_buf, int len,
101
void** contextptr,int nbtag,void *mhdl,void *mhdl1);
103
# define ARMCI_NBREM_GET(_p,_s,_sst,_d,_dst,_cou,_lev,_hdl) \
104
armci_client_direct_get((_p),(_s),(_d),(_cou)[0],&((_hdl)->cmpl_info),(_hdl)->tag,(void *)mhloc,(void *)mhrem); \
106
# define ARMCI_REM_GET(_p,_s,_sst,_d,_dst,_cou,_lev,_hdl) \
107
armci_client_direct_get((_p),(_s),(_d),(_cou)[0],NULL,0,(void *)mhloc,(void *)mhrem); \
111
# define ARMCI_REM_GET(_p,_s,_sst,_d,_dst,_cou,_lev,_hdl) \
112
armci_rem_get((_p),(_s),(_sst),(_d),(_dst),(_cou),(_lev),(_hdl),(void *)mhloc,(void *)mhrem)
113
# define ARMCI_NBREM_GET ARMCI_REM_GET
117
/* State shared with other translation units.  armci_prot_switch_fence is
 * indexed by process in DO_FENCE and holds the protocol id last stored for
 * that process. */
extern int* armci_prot_switch_fence;
118
extern int armci_prot_switch_preproc;
119
extern int armci_prot_switch_preop;
122
/* Scratch stride array; armci_acc_copy_strided uses it as the stride array
 * of the internal buffer.  NOTE(review): file-scope and shared, so that use
 * is presumably not re-entrant -- confirm. */
int armci_iwork[MAX_STRIDE_LEVEL];
124
/*\ 2-dimensional array copy \*/
126
/* Copy a 2-D patch for a GET or PUT.
 *
 *   op          operation selector (ARMCI_OP_2D routes GET and PUT here)
 *   proc        process that owns the remote side
 *   src_ptr     first source segment
 *   dst_ptr     first destination segment
 *   bytes       length of one segment
 *   count       number of segments
 *   src_stride  distance between source segments
 *   dst_stride  distance between destination segments
 *
 * When SAMECLUSNODE(proc) holds the data is moved with armci_copy; otherwise
 * the put side uses armci_put / armci_put2D and the get side uses
 * armci_get / armci_get2D, with fence-state and counter updates beforehand.
 * NOTE(review): the branch lines selecting among these calls are only partly
 * visible in this listing. */
static void armci_copy_2D(int op, int proc, void *src_ptr, void *dst_ptr,
127
int bytes, int count, int src_stride, int dst_stride)
129
int armci_th_idx = ARMCI_THREAD_IDX;
140
int shmem = SAMECLUSNODE(proc);
143
/* data is in local/shared memory -- can use memcpy */
144
// printf("%s: shmem==true; count==%d\n",Portals_ID(),count);
146
armci_copy(src_ptr, dst_ptr, bytes);
147
// printf("%s: shmem==true; finished\n",Portals_ID(),count);
149
char *ps=(char*)src_ptr;
150
char *pd=(char*)dst_ptr;
152
for (j = 0; j < count; j++){
160
/* data not in local/shared memory-access through global address space*/
164
/* NOTE(review): the two printf calls around UPDATE_FENCE_STATE write to
 * stdout on this path; they look like leftover debugging -- confirm. */
printf("%s: pre UPDATE_FENCE_STATE\n",Portals_ID());
165
UPDATE_FENCE_STATE(proc, PUT, COUNT);
166
printf("%s: post UPDATE_FENCE_STATE\n",Portals_ID());
168
SET_COUNTER(ack_cntr[armci_th_idx],COUNT);
171
armci_put(src_ptr, dst_ptr, bytes, proc);
173
armci_put2D(proc, bytes, count, src_ptr, src_stride,
174
dst_ptr, dst_stride);
180
SET_COUNTER(get_cntr[armci_th_idx], COUNT);
183
armci_get(src_ptr, dst_ptr, bytes, proc);
185
armci_get2D(proc, bytes, count, src_ptr, src_stride,
186
dst_ptr, dst_stride);
193
/* DAXPY names the vector update routine called by daxpy_2d_; the visible
 * definition maps it to daxpy_.  THRESH_ACC is the row count below which
 * daxpy_2d_ updates elements directly instead of calling DAXPY. */
#if (defined(CRAY) && !defined(__crayx1)) || defined(FUJITSU)
197
# define DAXPY daxpy_
201
#define THRESH_ACC 32
203
/* A += Alpha * B on two arrays of doubles, column by column.
 *   alpha      points to the double scale factor
 *   rows/cols  point to the patch extents
 *   a, ald     updated array and the element distance between its columns
 *   (b and bld are used the same way for the array that is added)
 * Element (r, c) is addressed as base[c*ld + r].  With fewer than
 * THRESH_ACC rows each element is updated directly; otherwise DAXPY is
 * called once per column with unit increments. */
static void daxpy_2d_(void* alpha, int *rows, int *cols, void *a, int *ald,
207
double *A = (double*)a;
208
double *B = (double*)b;
209
double Alpha = *(double*)alpha;
211
if(*rows < THRESH_ACC)
214
A[c* *ald+ r] += Alpha * B[c* *bld+r];
215
else for(c=0;c<*cols;c++)
216
DAXPY(rows, alpha, B + c* *bld, &ONE, A + c* *ald, &ONE);
221
/* Accumulate one contiguous segment.
 *
 *   op      accumulate operation; selects the element type and kernel
 *   scale   scale factor, passed through to the kernel
 *   proc    process owning dst (argument of the lock/unlock macros)
 *   src     source segment
 *   dst     destination segment
 *   bytes   segment length in bytes
 *   lockit  when non-zero the memory lock is released after the kernel call
 *
 * `rows` is bytes divided by the element size (two components per element
 * for the Z_ and C_ kernels).  An unsupported op ends in armci_die.  The
 * kernel is invoked as func(scale, dst, src, &rows).
 * NOTE(review): the case labels pairing each kernel with its op code, and
 * any condition on the ARMCI_LOCKMEM call, are not visible in this listing. */
void armci_acc_1D(int op, void *scale, int proc, void *src, void *dst, int bytes, int lockit)
224
void (ATR *func)(void*, void*, void*, int*);
225
ARMCI_PR_DBG("enter",0);
228
rows = bytes/sizeof(int);
229
func = I_ACCUMULATE_1D;
232
rows = bytes/sizeof(long);
233
func = L_ACCUMULATE_1D;
236
rows = bytes/sizeof(double);
237
func = D_ACCUMULATE_1D;
240
rows = bytes/(2*sizeof(double));
241
func = Z_ACCUMULATE_1D;
244
rows = bytes/(2*sizeof(float));
245
func = C_ACCUMULATE_1D;
248
rows = bytes/sizeof(float);
249
func = F_ACCUMULATE_1D;
251
default: armci_die("ARMCI accumulate: operation not supported",op);
252
func = F_ACCUMULATE_1D; /*avoid compiler whining */
257
ARMCI_LOCKMEM(dst, bytes + (char*)dst, proc);
259
func(scale, dst, src, &rows);
260
if(lockit)ARMCI_UNLOCKMEM(proc);
261
ARMCI_PR_DBG("exit",0);
264
/*\ 2-dimensional accumulate \*/
266
/* Accumulate a 2-D patch.
 *
 *   op          accumulate operation; selects the element type and kernel
 *   scale       scale factor, passed through to the kernel
 *   proc        process owning dst_ptr (argument of the lock/unlock macros)
 *   src_ptr     first source column
 *   dst_ptr     first destination column
 *   bytes       length of one column in bytes
 *   cols        number of columns
 *   src_stride  byte distance between source columns
 *   dst_stride  byte distance between destination columns
 *   lockit      when non-zero the memory lock is released after the kernel
 *
 * Both pointers must be ALIGN-aligned, otherwise armci_die is called.
 * rows, ldd and lds are bytes, dst_stride and src_stride expressed in
 * elements of the selected type (two components per element for the Z_ and
 * C_ kernels).  The locked range is cols*dst_stride bytes from dst_ptr.  The
 * kernel is invoked as func(scale, &rows, &cols, dst_ptr, &ldd, src_ptr,
 * &lds).
 * NOTE(review): the case labels pairing each kernel with its op code, and
 * any condition on the ARMCI_LOCKMEM call, are not visible in this listing. */
void armci_acc_2D(int op, void* scale, int proc, void *src_ptr, void *dst_ptr,
267
int bytes, int cols, int src_stride, int dst_stride, int lockit)
269
int rows, lds, ldd, span;
270
void (ATR *func)(void*, int*, int*, void*, int*, void*, int*);
272
ARMCI_PR_DBG("enter",0);
275
if((long)src_ptr%ALIGN)armci_die("src not aligned",(long)src_ptr);
276
/* name the destination in the message when dst_ptr is the misaligned one */
if((long)dst_ptr%ALIGN)armci_die("dst not aligned",(long)dst_ptr);
281
rows = bytes/sizeof(int);
282
ldd = dst_stride/sizeof(int);
283
lds = src_stride/sizeof(int);
284
func = I_ACCUMULATE_2D;
287
rows = bytes/sizeof(long);
288
ldd = dst_stride/sizeof(long);
289
lds = src_stride/sizeof(long);
290
func = L_ACCUMULATE_2D;
293
rows = bytes/sizeof(double);
294
ldd = dst_stride/sizeof(double);
295
lds = src_stride/sizeof(double);
296
func = D_ACCUMULATE_2D;
299
rows = bytes/(2*sizeof(double));
300
ldd = dst_stride/(2*sizeof(double));
301
lds = src_stride/(2*sizeof(double));
302
func = Z_ACCUMULATE_2D;
305
rows = bytes/(2*sizeof(float));
306
ldd = dst_stride/(2*sizeof(float));
307
lds = src_stride/(2*sizeof(float));
308
func = C_ACCUMULATE_2D;
311
rows = bytes/sizeof(float);
312
ldd = dst_stride/sizeof(float);
313
lds = src_stride/sizeof(float);
314
func = F_ACCUMULATE_2D;
317
rows = bytes/sizeof(long);
318
ldd = dst_stride/sizeof(long);
319
lds = src_stride/sizeof(long);
320
func = RA_ACCUMULATE_2D;
322
default: armci_die("ARMCI accumulate: operation not supported",op);
323
func = F_ACCUMULATE_2D; /*avoid compiler whining */
328
span = cols*dst_stride;
329
ARMCI_LOCKMEM(dst_ptr, span + (char*)dst_ptr, proc);
331
func(scale, &rows, &cols, dst_ptr, &ldd, src_ptr, &lds);
332
if(lockit)ARMCI_UNLOCKMEM(proc);
333
ARMCI_PR_DBG("exit",0);
338
/*\ compute range of strided data AND lock it \*/
341
/* Compute the byte range covered by a strided destination patch and lock it.
 *
 *   dst_ptr         start of the patch
 *   dst_stride_arr  destination strides; entry stride_levels-1 is read
 *   count           segment counts; entry stride_levels is read
 *   stride_levels   number of stride levels
 *   proc            process owning the memory (argument of ARMCI_LOCKMEM)
 *
 * The range is count[stride_levels] * dst_stride_arr[stride_levels-1] bytes
 * from dst_ptr.  A patch with no stride levels has no stride entry to
 * multiply by, so its range is count[0] bytes. */
armci_lockmem_patch(void* dst_ptr, int dst_stride_arr[], int count[], int stride_levels, int proc)
343
long span = count[stride_levels];
344
ARMCI_PR_DBG("enter",0);
345
/* guard: with stride_levels==0 the index below would be -1 */
if(stride_levels > 0) span *= dst_stride_arr[stride_levels-1];
347
/* lock region of remote memory */
348
ARMCI_LOCKMEM(dst_ptr, span + (char*)dst_ptr, proc);
349
ARMCI_PR_DBG("exit",0);
353
/*\ strided accumulate on top of remote memory copy:
354
* copies remote data to local buffer, accumulates, puts it back
355
* Note: if we are here then remote patch must fit in the ARMCI buffer \*/
357
/* Strided accumulate built from get + local accumulate + put.
 *
 *   optype          accumulate operation
 *   scale           scale factor
 *   proc            process owning the destination patch
 *   src_ptr         local source patch and its strides (src_stride_arr)
 *   dst_ptr         remote destination patch and its strides (dst_stride_arr)
 *   count           segment counts per level (count[0] is in bytes)
 *   stride_levels   number of stride levels
 *
 * Steps: lock the destination range; fetch it into armci_internal_buffer
 * using a packed stride array built in armci_iwork; accumulate src into the
 * buffer with proc=armci_me and lockit=0; put the buffer back; fence the
 * node; unlock.  A non-zero rc from the get or the accumulate unlocks and
 * is returned immediately.
 * NOTE(review): the loop fills buf_stride_arr[1..stride_levels] while
 * armci_iwork has MAX_STRIDE_LEVEL entries -- verify the bound when
 * stride_levels == MAX_STRIDE_LEVEL. */
int armci_acc_copy_strided(int optype, void* scale, int proc,
358
void* src_ptr, int src_stride_arr[],
359
void* dst_ptr, int dst_stride_arr[],
360
int count[], int stride_levels)
362
void *buf_ptr = armci_internal_buffer;
363
int rc, i, *buf_stride_arr = armci_iwork;
364
ARMCI_PR_DBG("enter",0);
365
armci_lockmem_patch(dst_ptr,dst_stride_arr, count, stride_levels, proc);
367
/* setup stride array for internal buffer */
368
buf_stride_arr[0]=count[0];
369
for(i=0; i< stride_levels; i++) {
370
buf_stride_arr[i+1]= buf_stride_arr[i]*count[i+1];
373
/* get remote data to local buffer */
374
rc = armci_op_strided(GET, scale, proc, dst_ptr, dst_stride_arr, buf_ptr,
375
buf_stride_arr, count, stride_levels, 0,NULL);
377
if(rc) { ARMCI_UNLOCKMEM(proc); return(rc); }
379
/* call local accumulate with lockit=0 (we locked it already) and proc=me */
380
rc = armci_op_strided(optype, scale, armci_me, src_ptr, src_stride_arr,
381
buf_ptr,buf_stride_arr, count, stride_levels,0,NULL);
382
if(rc) { ARMCI_UNLOCKMEM(proc); return(rc); }
384
/* put data back from the buffer to remote location */
385
rc = armci_op_strided(PUT, scale, proc, buf_ptr, buf_stride_arr, dst_ptr,
386
dst_stride_arr, count, stride_levels,0,NULL);
388
FENCE_NODE(proc); /* make sure put completes before unlocking */
389
ARMCI_UNLOCKMEM(proc); /* release memory lock */
390
ARMCI_PR_DBG("exit",0);
397
/*\ Strided operation \*/
399
/* Apply `op` (GET, PUT or an accumulate) to a strided patch by breaking it
 * into 2-D pieces handed to ARMCI_OP_2D.
 *
 *   op, scale        operation and scale factor
 *   proc             process owning the remote side
 *   src_ptr/dst_ptr  first segment of source / destination
 *   src_stride_arr   source strides
 *   dst_stride_arr   destination strides
 *   count            segment counts per level (count[0] is in bytes)
 *   stride_levels    number of stride levels
 *   lockit           forwarded to the accumulate routines; with more than
 *                    two levels one patch-wide lock is taken here instead
 *   nb_handle        non-blocking handle, forwarded to armci_network_strided
 *
 * With ACC_COPY an accumulate to another process/node is delegated to
 * armci_acc_copy_strided.  Levels 0 to 3 are written out explicitly; deeper
 * patches walk dimensions 2..stride_levels with the index[]/unit[] counters
 * and issue one ARMCI_OP_2D per combination.
 * NOTE(review): index[] and unit[] have MAX_STRIDE_LEVEL entries but are
 * indexed up to stride_levels, and the public entry points accept
 * stride_levels == MAX_STRIDE_LEVEL -- verify the bound. */
int armci_op_strided(int op, void* scale, int proc,void *src_ptr,
400
int src_stride_arr[], void* dst_ptr, int dst_stride_arr[],
401
int count[], int stride_levels, int lockit,
402
armci_ihdl_t nb_handle)
404
char *src = (char*)src_ptr, *dst=(char*)dst_ptr;
405
int s2, s3, i,j, unlockit=0;
407
int index[MAX_STRIDE_LEVEL], unit[MAX_STRIDE_LEVEL];
408
ARMCI_PR_DBG("enter",op);
409
# if defined(ACC_COPY)
412
if(ARMCI_ACC(op) && !(SAMECLUSNODE(proc)) )
414
if ( ARMCI_ACC(op) && proc!=armci_me)
416
/* copy remote data, accumulate, copy back*/
417
return (armci_acc_copy_strided(op,scale, proc, src_ptr, src_stride_arr,
418
dst_ptr, dst_stride_arr, count, stride_levels));
420
else; /* do it directly through shared/local memory */
424
if(ARMCI_ACC(op) && (stride_levels>2) && lockit){
425
/* we need one lock operation only - must be done outside 2d acc */
426
armci_lockmem_patch(dst_ptr,dst_stride_arr, count, stride_levels, proc);
430
/* if(proc!=armci_me) INTR_OFF;*/
431
if(armci_me>=0 && !SAMECLUSNODE(proc)) {
432
printf("%s network_strided not supported (in op_strided)\n",Portals_ID());
434
armci_network_strided(op,scale,proc,src_ptr,src_stride_arr,dst_ptr,
435
dst_stride_arr,count,stride_levels,nb_handle);
438
// printf("%s in large switch stmt in op_strided (stride_levels=%d)\n",Portals_ID(),stride_levels);
439
switch (stride_levels) {
440
case 0: /* 1D copy */
442
ARMCI_OP_2D(op, scale, proc, src_ptr, dst_ptr, count[0], 1,
443
count[0], count[0], lockit);
448
ARMCI_OP_2D(op, scale, proc, src_ptr, dst_ptr, count[0], count[1],
449
src_stride_arr[0], dst_stride_arr[0], lockit);
453
for (s2= 0; s2 < count[2]; s2++){ /* 2D copy */
454
ARMCI_OP_2D(op, scale, proc, src+s2*src_stride_arr[1],
455
dst+s2*dst_stride_arr[1], count[0], count[1],
456
src_stride_arr[0], dst_stride_arr[0], lockit );
461
for(s3=0; s3< count[3]; s3++){
462
src = (char*)src_ptr + src_stride_arr[2]*s3;
463
dst = (char*)dst_ptr + dst_stride_arr[2]*s3;
464
for (s2= 0; s2 < count[2]; s2++){ /* 3D copy */
465
ARMCI_OP_2D(op, scale, proc, src+s2*src_stride_arr[1],
466
dst+s2*dst_stride_arr[1],
467
count[0], count[1],src_stride_arr[0],
468
dst_stride_arr[0],lockit);
473
default: /* N-dimensional */
475
/* stride_levels is not the same as ndim. it is ndim-1
476
* For example a 10x10x10... array, suppose the datatype is byte
477
* the stride_arr is 10, 10x10, 10x10x10 ....
479
index[2] = 0; unit[2] = 1; total_of_2D = count[2];
480
for(j=3; j<=stride_levels; j++) {
481
index[j] = 0; unit[j] = unit[j-1] * count[j-1];
482
total_of_2D *= count[j];
485
for(i=0; i<total_of_2D; i++) {
486
src = (char *)src_ptr; dst = (char *)dst_ptr;
487
for(j=2; j<=stride_levels; j++) {
488
src += index[j] * src_stride_arr[j-1];
489
dst += index[j] * dst_stride_arr[j-1];
491
if(((i+1) % unit[j]) == 0) index[j]++;
492
if(index[j] >= count[j]) index[j] = 0;
495
ARMCI_OP_2D(op, scale, proc, src, dst, count[0], count[1],
496
src_stride_arr[0], dst_stride_arr[0], lockit);
503
// printf("%s after switch stmt; prior to fence/lock\n",Portals_ID());
506
# if defined(ACC_COPY)
509
ARMCI_UNLOCKMEM(proc); /* release memory lock */
512
// printf("%s after fence/lock; leaving op_strided\n",Portals_ID());
513
ARMCI_PR_DBG("exit",op);
518
/* Blocking strided put.
 *
 * Returns FAIL for a NULL source or destination, FAIL3 for a negative
 * seg_count[0], FAIL4 for stride_levels outside [0, MAX_STRIDE_LEVEL] and
 * FAIL5 for a negative proc.
 *
 * After ORDER and PREPROCESS_STRIDED the transfer goes either through
 * armci_pack_strided (preceded by DO_FENCE with SERVER_PUT) or through
 * armci_op_strided (preceded by DO_FENCE with DIRECT_PUT when proc is not
 * on the same cluster node).  POSTPROCESS_STRIDED then restores the caller's
 * seg_count.
 * NOTE(review): the two assignments to `direct` and the test choosing
 * between the two transfer paths presumably sit in alternative branches
 * that are not visible in this listing. */
int PARMCI_PutS( void *src_ptr, /* pointer to 1st segment at source*/
519
int src_stride_arr[], /* array of strides at source */
520
void* dst_ptr, /* pointer to 1st segment at destination*/
521
int dst_stride_arr[], /* array of strides at destination */
522
int seg_count[], /* number of segments at each stride
523
levels: count[0]=bytes*/
524
int stride_levels, /* number of stride levels */
525
int proc /* remote process(or) ID */
529
int *count=seg_count, tmp_count=0;
531
ARMCI_PR_DBG("enter",proc);
532
if(src_ptr == NULL || dst_ptr == NULL) return FAIL;
533
if(count[0]<0)return FAIL3;
534
if(stride_levels <0 || stride_levels > MAX_STRIDE_LEVEL) return FAIL4;
535
if(proc<0)return FAIL5;
537
ORDER(PUT,proc); /* ensure ordering */
538
PREPROCESS_STRIDED(tmp_count);
541
if(stride_levels) direct=SAMECLUSNODE(proc);
542
direct=SAMECLUSNODE(proc);
545
// printf("%s direct=%d, proc=%d\n",Portals_ID(),direct,proc);
548
DO_FENCE(proc,SERVER_PUT);
549
// printf("%s calling pack_strided in PARMCI_PutS\n",Portals_ID());
550
rc = armci_pack_strided(PUT, NULL, proc, src_ptr, src_stride_arr,dst_ptr,
551
dst_stride_arr, count, stride_levels, NULL, -1, -1, -1,NULL);
555
if(!SAMECLUSNODE(proc))DO_FENCE(proc,DIRECT_PUT);
556
// printf("%s calling op_strided in PARMCI_PutS\n",Portals_ID());
557
rc = armci_op_strided( PUT, NULL, proc, src_ptr, src_stride_arr,
558
dst_ptr, dst_stride_arr,count,stride_levels,
561
POSTPROCESS_STRIDED(tmp_count);
563
ARMCI_PR_DBG("exit",proc);
569
/* Strided put followed by a flag update on the same remote process:
 * PARMCI_PutS moves the data, then ARMCI_Put writes `val` to `flag` on
 * `proc`.  The result of PARMCI_PutS is not examined in the visible code.
 * NOTE(review): `bytes` (the size used for the flag put) is declared on a
 * line that is not visible in this listing. */
int PARMCI_PutS_flag(
570
void* src_ptr, /* pointer to 1st segment at source */
571
int src_stride_arr[], /* array of strides at source */
572
void* dst_ptr, /* pointer to 1st segment at destination */
573
int dst_stride_arr[], /* array of strides at destination */
574
int count[], /* number of units at each stride level,
576
int stride_levels, /* number of stride levels */
577
int *flag, /* pointer to remote flag */
578
int val, /* value to set flag upon completion of
580
int proc /* remote process(or) ID */
584
/* Put local data on remote processor */
585
PARMCI_PutS(src_ptr, src_stride_arr, dst_ptr, dst_stride_arr,
586
count, stride_levels, proc);
588
/* Send signal to remote processor that data transfer has
591
ARMCI_Put(&val, flag, bytes, proc);
596
/* Contiguous put with a completion flag: forwards to PARMCI_PutS_flag with
 * zero stride levels, NULL stride arrays and &bytes as the count array;
 * f and v are the remote flag and the value written to it. */
int PARMCI_Put_flag(void *src, void* dst,int bytes,int *f,int v,int proc) {
597
return PARMCI_PutS_flag(src, NULL, dst, NULL, &bytes, 0, f, v, proc);
601
/* Same arguments and result as PARMCI_PutS_flag; this entry point simply
 * forwards to it. */
int PARMCI_PutS_flag_dir(void *src_ptr, int src_stride_arr[],
602
void* dst_ptr, int dst_stride_arr[],
603
int seg_count[], int stride_levels,
604
int *flag, int val, int proc) {
605
return PARMCI_PutS_flag(src_ptr, src_stride_arr,dst_ptr,dst_stride_arr,
606
seg_count, stride_levels, flag, val, proc);
610
/* Blocking strided get.
 *
 * Returns FAIL for a NULL source or destination, FAIL3 for a negative
 * seg_count[0], FAIL4 for stride_levels outside [0, MAX_STRIDE_LEVEL] and
 * FAIL5 when proc is negative or not below armci_nproc (this case also
 * prints the caller's rank, FUNCTION_NAME and proc to stdout).
 *
 * After ORDER and PREPROCESS_STRIDED the transfer goes either through
 * armci_pack_strided (preceded by DO_FENCE with SERVER_GET) or through
 * armci_op_strided (preceded by DO_FENCE with DIRECT_GET when proc is not
 * on the same cluster node).  POSTPROCESS_STRIDED then restores the caller's
 * seg_count.
 * NOTE(review): the test choosing between the two transfer paths is not
 * visible in this listing. */
int PARMCI_GetS( void *src_ptr, /* pointer to 1st segment at source*/
611
int src_stride_arr[], /* array of strides at source */
612
void* dst_ptr, /* 1st segment at destination*/
613
int dst_stride_arr[], /* array of strides at destination */
614
int seg_count[], /* number of segments at each stride
615
levels: count[0]=bytes*/
616
int stride_levels, /* number of stride levels */
617
int proc /* remote process(or) ID */
621
int *count=seg_count, tmp_count=0;
622
ARMCI_PR_DBG("enter",proc);
624
if(src_ptr == NULL || dst_ptr == NULL) return FAIL;
625
if(seg_count[0]<0)return FAIL3;
626
if(stride_levels <0 || stride_levels > MAX_STRIDE_LEVEL) return FAIL4;
627
if(proc<0||proc>=armci_nproc){printf("\n%d:%s:proc=%d",armci_me,FUNCTION_NAME,proc);fflush(stdout);return FAIL5;}
629
ORDER(GET,proc); /* ensure ordering */
630
PREPROCESS_STRIDED(tmp_count);
633
if(stride_levels)direct=SAMECLUSNODE(proc);
634
direct=SAMECLUSNODE(proc);
637
DO_FENCE(proc,SERVER_GET);
638
rc = armci_pack_strided(GET, NULL, proc, src_ptr, src_stride_arr,
639
dst_ptr,dst_stride_arr,count,stride_levels,
643
if(!SAMECLUSNODE(proc))DO_FENCE(proc,DIRECT_GET);
644
rc = armci_op_strided(GET, NULL, proc, src_ptr, src_stride_arr, dst_ptr,
645
dst_stride_arr,count, stride_levels,0,NULL);
648
POSTPROCESS_STRIDED(tmp_count);
649
ARMCI_PR_DBG("exit",proc);
657
/* Blocking strided accumulate.
 *
 * Returns FAIL for a NULL source or destination, FAIL2 when either stride
 * array is NULL (checked regardless of stride_levels), FAIL3 for a negative
 * seg_count[0], FAIL4 for stride_levels outside [0, MAX_STRIDE_LEVEL] and
 * FAIL5 for a negative proc.
 *
 * After ORDER and PREPROCESS_STRIDED the operation goes either through
 * armci_op_strided with lockit=1 or through armci_pack_strided (preceded by
 * DO_FENCE with SERVER_PUT).  With ACC_COPY and without ACC_SMP, `direct`
 * is cleared whenever proc is not the calling process.
 * POSTPROCESS_STRIDED then restores the caller's seg_count.
 * NOTE(review): the test choosing between the two paths is not visible in
 * this listing. */
int PARMCI_AccS( int optype, /* operation */
658
void *scale, /* scale factor x += scale*y */
659
void *src_ptr, /* pointer to 1st segment at source*/
660
int src_stride_arr[], /* array of strides at source */
661
void* dst_ptr, /* 1st segment at destination*/
662
int dst_stride_arr[], /* array of strides at destination */
663
int seg_count[], /* number of segments at each stride
664
levels: count[0]=bytes*/
665
int stride_levels, /* number of stride levels */
666
int proc /* remote process(or) ID */
670
int *count=seg_count, tmp_count=0;
672
ARMCI_PR_DBG("enter",proc);
673
if(src_ptr == NULL || dst_ptr == NULL) return FAIL;
674
if(src_stride_arr == NULL || dst_stride_arr ==NULL) return FAIL2;
675
if(count[0]<0)return FAIL3;
676
if(stride_levels <0 || stride_levels > MAX_STRIDE_LEVEL) return FAIL4;
677
if(proc<0)return FAIL5;
679
ORDER(optype,proc); /* ensure ordering */
680
PREPROCESS_STRIDED(tmp_count);
683
direct=SAMECLUSNODE(proc);
686
# if defined(ACC_COPY) && !defined(ACC_SMP)
687
if(armci_me != proc) direct=0;
690
rc = armci_op_strided(optype,scale, proc, src_ptr, src_stride_arr,dst_ptr,
691
dst_stride_arr, count, stride_levels,1,NULL);
693
DO_FENCE(proc,SERVER_PUT);
694
rc = armci_pack_strided(optype,scale,proc,src_ptr, src_stride_arr,dst_ptr,
695
dst_stride_arr,count,stride_levels,NULL,-1,-1,-1,NULL);
697
POSTPROCESS_STRIDED(tmp_count);
698
ARMCI_PR_DBG("exit",proc);
705
/* Whatever original put and get functions were here have been
706
replaced with the proper ones from the main ARMCI branch.
707
The old functions were entirely responsible for causing the
708
test_vector_acc test to fail in test.x. */
711
/* Contiguous put: forwards to PARMCI_PutS with zero stride levels, NULL
 * stride arrays and &bytes as the count array.  The profiling hooks around
 * the call are commented out. */
int PARMCI_Put(void *src, void* dst, int bytes, int proc) {
713
//ARMCI_PROFILE_START_STRIDED(&bytes, 0, proc, ARMCI_PROF_PUT);
714
rc = PARMCI_PutS(src, NULL, dst, NULL, &bytes, 0, proc);
715
//ARMCI_PROFILE_STOP_STRIDED(ARMCI_PROF_PUT);
720
/* Contiguous get.  Two transfer statements are visible: a plain memcpy from
 * src to dst, and a call to PARMCI_GetS with zero stride levels, NULL stride
 * arrays and &bytes as the count array.
 * NOTE(review): the condition selecting between them is not visible in this
 * listing.  The profiling hooks are commented out. */
int PARMCI_Get(void *src, void* dst, int bytes, int proc) {
722
//ARMCI_PROFILE_START_STRIDED(&bytes, 0, proc, ARMCI_PROF_GET);
725
memcpy(dst,src,bytes);
727
rc = PARMCI_GetS(src, NULL, dst, NULL, &bytes, 0, proc);
729
//ARMCI_PROFILE_STOP_STRIDED(ARMCI_PROF_GET);
735
/* Contiguous accumulate: forwards to PARMCI_AccS with zero stride levels
 * and &bytes as the count array.
 * NOTE(review): both stride arrays are passed as NULL, while PARMCI_AccS as
 * written returns FAIL2 whenever either stride array is NULL -- confirm
 * that this call can succeed. */
int PARMCI_Acc(int optype, void *scale, void *src, void* dst, int bytes, int proc) {
737
rc = PARMCI_AccS(optype, scale, src, NULL, dst, NULL, &bytes, 0, proc);
744
/* Select which implementation is exported under the names
 * armci_read_strided / armci_write_strided: the first pair renames the
 * "...1" functions, the second pair renames the "...2" functions.
 * NOTE(review): the condition choosing between the pairs is not visible in
 * this listing. */
# define armci_read_strided1 armci_read_strided
745
# define armci_write_strided1 armci_write_strided
747
# define armci_read_strided2 armci_read_strided
748
# define armci_write_strided2 armci_write_strided
751
/* Copy a strided patch that starts at ptr into buf, one block of count[0]
 * bytes at a time.
 *
 *   ptr            start of the strided patch
 *   stride_levels  number of stride levels
 *   stride_arr     byte strides, one per level
 *   count          block counts per level (count[0] is the block size)
 *   buf            destination buffer
 *
 * bvalue[j] is the current position in dimension j and bunit[j] the number
 * of blocks after which that position advances; idx accumulates
 * bvalue[j]*stride_arr[j-1] over the levels to give the byte offset of the
 * current block.
 * NOTE(review): bvalue[] and bunit[] have MAX_STRIDE_LEVEL entries but are
 * indexed up to stride_levels -- verify the bound. */
void armci_write_strided1(void *ptr, int stride_levels, int stride_arr[],
752
int count[], char *buf)
755
long idx; /* index offset of current block position to ptr */
756
int n1dim; /* number of 1 dim block */
757
int bvalue[MAX_STRIDE_LEVEL], bunit[MAX_STRIDE_LEVEL];
758
int bytes = count[0];
759
ARMCI_PR_DBG("enter",stride_levels);
761
/* number of n-element of the first dimension */
763
for(i=1; i<=stride_levels; i++)
766
/* calculate the destination indices */
767
bvalue[0] = 0; bvalue[1] = 0; bunit[0] = 1; bunit[1] = 1;
768
for(i=2; i<=stride_levels; i++) {
770
bunit[i] = bunit[i-1] * count[i-1];
773
for(i=0; i<n1dim; i++) {
775
for(j=1; j<=stride_levels; j++) {
776
idx += bvalue[j] * stride_arr[j-1];
777
if((i+1) % bunit[j] == 0) bvalue[j]++;
778
if(bvalue[j] > (count[j]-1)) bvalue[j] = 0;
781
armci_copy( ((char*)ptr)+idx, buf, bytes);
784
ARMCI_PR_DBG("exit",stride_levels);
788
/* Copy a strided patch that starts at ptr into buf, using the DCOPY21 /
 * DCOPY31 kernels where possible.
 *
 *   - zero stride levels: a single armci_copy of count[0] bytes;
 *   - count[0] or ptr not a multiple of ALIGN_SIZE: delegate to
 *     armci_write_strided1;
 *   - otherwise dispatch on stride_levels: DCOPY21 and DCOPY31 are each
 *     called once in their own branch, and for deeper patches the
 *     index[]/unit[] counters select each 2-D slice, DCOPY21 copies it and
 *     buf advances by idx doubles.
 * ld is stride_arr[0]/8 (presumably the first stride in units of an 8-byte
 * double -- confirm).
 * NOTE(review): index[] and unit[] have MAX_STRIDE_LEVEL entries but are
 * indexed up to stride_levels -- verify the bound. */
void armci_write_strided2(void *ptr, int stride_levels, int stride_arr[],
789
int count[], char *buf)
792
int total; /* number of 2 dim block */
793
int index[MAX_STRIDE_LEVEL], unit[MAX_STRIDE_LEVEL];
794
ARMCI_PR_DBG("enter",stride_levels);
796
if(stride_levels == 0){
797
armci_copy( ptr, buf, count[0]);
798
}else if (count[0]%ALIGN_SIZE || (unsigned long)ptr%ALIGN_SIZE )
799
armci_write_strided1(ptr,stride_levels, stride_arr,count,buf);
801
int rows, ld, idx, ldd;
804
ld = stride_arr[0]/8;
805
switch(stride_levels){
807
DCOPY21(&rows, count+1, ptr, &ld, buf, &idx);
810
ldd = stride_arr[1]/stride_arr[0];
811
DCOPY31(&rows, count+1, count+2, ptr, &ld, &ldd, buf,&idx);
815
index[2] = 0; unit[2] = 1; total = count[2];
816
for(j=3; j<=stride_levels; j++) {
817
index[j] = 0; unit[j] = unit[j-1] * count[j-1];
820
for(i=0; i<total; i++) {
822
for(j=2; j<=stride_levels; j++) {
823
src += index[j] * stride_arr[j-1];
824
if(((i+1) % unit[j]) == 0) index[j]++;
825
if(index[j] >= count[j]) index[j] = 0;
827
DCOPY21(&rows, count+1,src, &ld, buf, &idx);
828
buf = (char*) ((double*)buf + idx);
832
ARMCI_PR_DBG("exit",stride_levels);
836
/* Copy from buf into a strided patch that starts at ptr, one block of
 * count[0] bytes at a time; the counterpart of armci_write_strided1 with
 * the copy direction reversed.
 *
 *   ptr            start of the strided patch
 *   stride_levels  number of stride levels
 *   stride_arr     byte strides, one per level
 *   count          block counts per level (count[0] is the block size)
 *   buf            source buffer
 *
 * bvalue[j] is the current position in dimension j and bunit[j] the number
 * of blocks after which that position advances; idx accumulates
 * bvalue[j]*stride_arr[j-1] over the levels to give the byte offset of the
 * current block.
 * NOTE(review): bvalue[] and bunit[] have MAX_STRIDE_LEVEL entries but are
 * indexed up to stride_levels -- verify the bound. */
void armci_read_strided1(void *ptr, int stride_levels, int stride_arr[],
837
int count[], char *buf)
840
long idx; /* index offset of current block position to ptr */
841
int n1dim; /* number of 1 dim block */
842
int bvalue[MAX_STRIDE_LEVEL], bunit[MAX_STRIDE_LEVEL];
843
int bytes = count[0];
845
ARMCI_PR_DBG("enter",stride_levels);
846
/* number of n-element of the first dimension */
848
for(i=1; i<=stride_levels; i++)
851
/* calculate the destination indices */
852
bvalue[0] = 0; bvalue[1] = 0; bunit[0] = 1; bunit[1] = 1;
853
for(i=2; i<=stride_levels; i++) {
855
bunit[i] = bunit[i-1] * count[i-1];
858
for(i=0; i<n1dim; i++) {
860
for(j=1; j<=stride_levels; j++) {
861
idx += bvalue[j] * stride_arr[j-1];
862
if((i+1) % bunit[j] == 0) bvalue[j]++;
863
if(bvalue[j] > (count[j]-1)) bvalue[j] = 0;
866
armci_copy(buf, ((char*)ptr)+idx,bytes);
869
ARMCI_PR_DBG("exit",stride_levels);
873
/* Copy from buf into a strided patch that starts at ptr, using the DCOPY12 /
 * DCOPY13 kernels where possible.
 *
 *   - zero stride levels: a single armci_copy of count[0] bytes;
 *   - count[0] or ptr not a multiple of ALIGN_SIZE: delegate to
 *     armci_read_strided1;
 *   - otherwise dispatch on stride_levels: DCOPY12 and DCOPY13 are each
 *     called once in their own branch, and for deeper patches the
 *     index[]/unit[] counters select each 2-D slice, DCOPY12 fills it and
 *     buf advances by idx doubles.
 * ld is stride_arr[0]/8 (presumably the first stride in units of an 8-byte
 * double -- confirm).
 * NOTE(review): index[] and unit[] have MAX_STRIDE_LEVEL entries but are
 * indexed up to stride_levels -- verify the bound. */
void armci_read_strided2(void *ptr, int stride_levels, int stride_arr[],
874
int count[], char *buf)
877
int total; /* number of 2 dim block */
878
int index[MAX_STRIDE_LEVEL], unit[MAX_STRIDE_LEVEL];
880
ARMCI_PR_DBG("enter",stride_levels);
881
if(stride_levels == 0){
882
armci_copy( buf, ptr, count[0]);
883
}else if (count[0]%ALIGN_SIZE || (unsigned long)ptr%ALIGN_SIZE)
884
armci_read_strided1(ptr,stride_levels, stride_arr,count,buf);
886
int rows, ld, idx, ldd;
889
ld = stride_arr[0]/8;
890
switch(stride_levels){
892
DCOPY12(&rows, count+1, ptr, &ld, buf, &idx);
895
ldd = stride_arr[1]/stride_arr[0];
896
DCOPY13(&rows, count+1, count+2, ptr, &ld, &ldd, buf,&idx);
899
index[2] = 0; unit[2] = 1; total = count[2];
900
for(j=3; j<=stride_levels; j++) {
901
index[j] = 0; unit[j] = unit[j-1] * count[j-1];
904
for(i=0; i<total; i++) {
906
for(j=2; j<=stride_levels; j++) {
907
src += index[j] * stride_arr[j-1];
908
if(((i+1) % unit[j]) == 0) index[j]++;
909
if(index[j] >= count[j]) index[j] = 0;
911
DCOPY12(&rows, count+1,src, &ld, buf, &idx);
912
buf = (char*) ((double*)buf + idx);
916
ARMCI_PR_DBG("exit",stride_levels);
921
/* Non-blocking strided put.
 *
 * Returns FAIL for a NULL source or destination, FAIL3 for a negative
 * seg_count[0], FAIL4 for stride_levels outside [0, MAX_STRIDE_LEVEL] and
 * FAIL5 for a negative proc.
 *
 * If the user handle has agg_flag == SET the request is recorded with
 * armci_agg_save_strided_descriptor and seg_count is restored.  Otherwise
 * fence information is updated and the handle's tag, proc and bufid are
 * filled in; armci_set_implicit_handle supplies a handle in the alternative
 * visible below.  The transfer then goes through armci_pack_strided
 * (DO_FENCE with SERVER_NBPUT) or armci_op_strided (DO_FENCE with
 * DIRECT_NBPUT when proc is not on the same cluster node), both receiving
 * nb_handle.
 * NOTE(review): the tests choosing between these alternatives are not
 * visible in this listing. */
int PARMCI_NbPutS( void *src_ptr, /* pointer to 1st segment at source*/
922
int src_stride_arr[], /* array of strides at source */
923
void* dst_ptr, /* pointer to 1st segment at destination*/
924
int dst_stride_arr[], /* array of strides at destination */
925
int seg_count[], /* number of segments at each stride
926
levels: count[0]=bytes*/
927
int stride_levels, /* number of stride levels */
928
int proc, /* remote process(or) ID */
929
armci_hdl_t* usr_hdl /* armci non-blocking call handle*/
932
armci_ihdl_t nb_handle = (armci_ihdl_t)usr_hdl;
933
int *count=seg_count, tmp_count=0;
935
ARMCI_PR_DBG("enter",proc);
936
if(src_ptr == NULL || dst_ptr == NULL) return FAIL;
937
if(count[0]<0)return FAIL3;
938
if(stride_levels <0 || stride_levels > MAX_STRIDE_LEVEL) return FAIL4;
939
if(proc<0)return FAIL5;
941
PREPROCESS_STRIDED(tmp_count);
944
if(stride_levels)direct=SAMECLUSNODE(proc);
945
direct=SAMECLUSNODE(proc);
949
if(nb_handle && nb_handle->agg_flag == SET) {
951
rc= armci_agg_save_strided_descriptor(src_ptr, src_stride_arr,
952
dst_ptr, dst_stride_arr,
953
count, stride_levels, proc,
955
POSTPROCESS_STRIDED(tmp_count);
960
UPDATE_FENCE_INFO(proc);
962
/*set tag and op in the nb handle*/
964
nb_handle->tag = GET_NEXT_NBTAG();
966
nb_handle->proc= proc;
967
nb_handle->bufid=NB_NONE;
970
nb_handle = armci_set_implicit_handle(PUT, proc);
974
DO_FENCE(proc,SERVER_NBPUT);
975
rc = armci_pack_strided(PUT, NULL, proc, src_ptr, src_stride_arr,dst_ptr,
976
dst_stride_arr, count, stride_levels,NULL,-1,-1,-1,nb_handle);
979
if(!SAMECLUSNODE(proc))DO_FENCE(proc,DIRECT_NBPUT);
980
rc = armci_op_strided( PUT, NULL, proc, src_ptr, src_stride_arr,
981
dst_ptr,dst_stride_arr,count,stride_levels, 0,nb_handle);
984
POSTPROCESS_STRIDED(tmp_count);
985
ARMCI_PR_DBG("exit",proc);
990
/* Non-blocking strided get.
 *
 * Returns FAIL for a NULL source or destination, FAIL3 for a negative
 * seg_count[0], FAIL4 for stride_levels outside [0, MAX_STRIDE_LEVEL],
 * FAIL5 for a negative proc, and FAIL6 when the transfer routine reports a
 * non-zero rc.
 *
 * If the user handle has agg_flag == SET the request is recorded with
 * armci_agg_save_strided_descriptor and seg_count is restored.  Otherwise
 * the handle's tag, op (GET), proc and bufid are filled in;
 * armci_set_implicit_handle supplies a handle in the alternative visible
 * below.  The transfer then goes through armci_pack_strided (DO_FENCE with
 * SERVER_NBGET) or armci_op_strided, both receiving nb_handle.
 * NOTE(review): the direct path fences with DIRECT_GET whereas
 * PARMCI_NbPutS uses DIRECT_NBPUT; DO_FENCE treats DIRECT_GET and
 * DIRECT_NBGET alike except for the value it stores -- confirm which one
 * is intended.  The tests choosing between the alternatives are not
 * visible in this listing. */
int PARMCI_NbGetS( void *src_ptr, /* pointer to 1st segment at source*/
991
int src_stride_arr[], /* array of strides at source */
992
void* dst_ptr, /* 1st segment at destination*/
993
int dst_stride_arr[], /* array of strides at destination */
994
int seg_count[], /* number of segments at each stride
995
levels: byte_count[0]=bytes*/
996
int stride_levels, /* number of stride levels */
997
int proc, /* remote process(or) ID */
998
armci_hdl_t* usr_hdl /* armci non-blocking call handle*/
1001
armci_ihdl_t nb_handle = (armci_ihdl_t)usr_hdl;
1003
int *count=seg_count, tmp_count=0;
1005
ARMCI_PR_DBG("enter",proc);
1006
if(src_ptr == NULL || dst_ptr == NULL) return FAIL;
1007
if(seg_count[0]<0)return FAIL3;
1008
if(stride_levels <0 || stride_levels > MAX_STRIDE_LEVEL) return FAIL4;
1009
if(proc<0)return FAIL5;
1011
#if DATA_SERVER_GET_
1012
if(stride_levels)direct=SAMECLUSNODE(proc);
1013
direct=SAMECLUSNODE(proc);
1016
PREPROCESS_STRIDED(tmp_count);
1018
if(nb_handle && nb_handle->agg_flag == SET) {
1020
rc= armci_agg_save_strided_descriptor(src_ptr, src_stride_arr,
1021
dst_ptr, dst_stride_arr,
1022
count, stride_levels, proc,
1024
POSTPROCESS_STRIDED(tmp_count);
1029
/* ORDER(GET,proc); ensure ordering */
1030
/*set tag and op in the nb handle*/
1032
nb_handle->tag = GET_NEXT_NBTAG();
1033
nb_handle->op = GET;
1034
nb_handle->proc= proc;
1035
nb_handle->bufid=NB_NONE;
1038
nb_handle = armci_set_implicit_handle(GET, proc);
1042
DO_FENCE(proc,SERVER_NBGET);
1043
rc = armci_pack_strided(GET, NULL, proc, src_ptr, src_stride_arr,
1044
dst_ptr,dst_stride_arr,count,stride_levels,
1045
NULL,-1,-1,-1,nb_handle);
1048
if(!SAMECLUSNODE(proc))DO_FENCE(proc,DIRECT_GET);
1049
rc = armci_op_strided(GET, NULL, proc, src_ptr, src_stride_arr, dst_ptr,
1050
dst_stride_arr,count, stride_levels,0,nb_handle);
1053
POSTPROCESS_STRIDED(tmp_count);
1055
ARMCI_PR_DBG("exit",proc);
1056
if(rc) return FAIL6;
1061
/* Non-blocking strided accumulate.
 *
 * Returns FAIL for a NULL source or destination, FAIL2 when either stride
 * array is NULL, FAIL3 for a negative seg_count[0], FAIL4 for stride_levels
 * outside [0, MAX_STRIDE_LEVEL], FAIL5 for a negative proc, and FAIL6 when
 * the transfer routine reports a non-zero rc.
 *
 * Fence information is updated, the handle's tag, op, proc and bufid are
 * filled in (armci_set_implicit_handle supplies a handle in the alternative
 * visible below), and the operation goes either through armci_op_strided
 * with lockit=1 and a NULL handle, or through armci_pack_strided with
 * nb_handle after DO_FENCE with SERVER_NBPUT.  With ACC_COPY and without
 * ACC_SMP, `direct` is cleared whenever proc is not the calling process.
 * NOTE(review): the tests choosing between these alternatives are not
 * visible in this listing. */
int PARMCI_NbAccS( int optype, /* operation */
1062
void *scale, /* scale factor x += scale*y */
1063
void *src_ptr, /* pointer to 1st segment at source*/
1064
int src_stride_arr[], /* array of strides at source */
1065
void* dst_ptr, /* 1st segment at destination*/
1066
int dst_stride_arr[], /* array of strides at destination */
1067
int seg_count[], /* number of segments at each stride
1068
levels: count[0]=bytes*/
1069
int stride_levels, /* number of stride levels */
1070
int proc, /* remote process(or) ID */
1071
armci_hdl_t* usr_hdl /* armci non-blocking call handle*/
1074
armci_ihdl_t nb_handle = (armci_ihdl_t)usr_hdl;
1075
int *count=seg_count, tmp_count=0;
1078
ARMCI_PR_DBG("enter",proc);
1079
if(src_ptr == NULL || dst_ptr == NULL) return FAIL;
1080
if(src_stride_arr == NULL || dst_stride_arr ==NULL) return FAIL2;
1081
if(count[0]<0)return FAIL3;
1082
if(stride_levels <0 || stride_levels > MAX_STRIDE_LEVEL) return FAIL4;
1083
if(proc<0)return FAIL5;
1085
UPDATE_FENCE_INFO(proc);
1086
PREPROCESS_STRIDED(tmp_count);
1089
direct=SAMECLUSNODE(proc);
1092
# if defined(ACC_COPY) && !defined(ACC_SMP)
1093
if(armci_me != proc) direct=0;
1096
/*set tag and op in the nb handle*/
1098
nb_handle->tag = GET_NEXT_NBTAG();
1099
nb_handle->op = optype;
1100
nb_handle->proc= proc;
1101
nb_handle->bufid=NB_NONE;
1104
nb_handle = armci_set_implicit_handle(optype, proc);
1108
rc = armci_op_strided(optype,scale, proc, src_ptr, src_stride_arr,dst_ptr,
1109
dst_stride_arr, count, stride_levels,1,NULL);
1112
DO_FENCE(proc,SERVER_NBPUT);
1113
rc = armci_pack_strided(optype,scale,proc,src_ptr, src_stride_arr,dst_ptr,
1114
dst_stride_arr,count,stride_levels,NULL,-1,-1,-1,nb_handle);
1117
POSTPROCESS_STRIDED(tmp_count);
1119
ARMCI_PR_DBG("exit",proc);
1120
if(rc) return FAIL6;
1125
/* INIT_NB_HANDLE(nb, o, p): for a non-NULL handle, stores the operation and
 * process and sets bufid to NB_NONE; for a NULL handle, obtains one from
 * armci_set_implicit_handle and sets its tag to 0.
 * NOTE(review): expands to a bare if/else, so it is fragile as the body of
 * an unbraced if/else at a call site.
 *
 * set_nbhandle: stores into *nbh either the caller's handle (cast to
 * armci_ihdl_t) or one from armci_set_implicit_handle; the condition
 * choosing between the two is not visible in this listing. */
#if !defined(ACC_COPY)&&!defined(CRAY_YMP)&&!defined(CYGNUS)&&!defined(CYGWIN) &&!defined(BGML)
1129
#define INIT_NB_HANDLE(nb,o,p) if(nb){\
1131
(nb)->op = (o); (nb)->proc= (p);\
1132
(nb)->bufid=NB_NONE;}\
1133
else { (nb)=armci_set_implicit_handle(o, p); (nb)->tag=0; }
1135
void set_nbhandle(armci_ihdl_t *nbh, armci_hdl_t *nb_handle, int op,
1140
*nbh=(armci_ihdl_t)nb_handle;
1144
*nbh=armci_set_implicit_handle(op, proc);
1149
/* Non-blocking contiguous put of `bytes` bytes from src to dst on proc.
 *
 * Returns FAIL when src or dst is NULL.
 *
 * Visible alternatives:
 *   - aggregate handle (agg_flag == SET): copy immediately when proc is on
 *     the same cluster node, otherwise record the request with
 *     armci_agg_save_descriptor;
 *   - direct: initialise the handle for PUT and copy with armci_copy;
 *   - ARMCI_NB_PUT available: initialise the handle, update the fence state
 *     and issue ARMCI_NB_PUT with the handle's cmpl_info;
 *   - otherwise forward to PARMCI_NbPutS with zero stride levels.
 * NOTE(review): the tests choosing between these alternatives are not
 * visible in this listing. */
int PARMCI_NbPut(void *src, void* dst, int bytes, int proc,armci_hdl_t* uhandle)
1153
armci_ihdl_t nb_handle = (armci_ihdl_t)uhandle;
1154
ARMCI_PR_DBG("enter",proc);
1156
if(src == NULL || dst == NULL) return FAIL;
1158
direct =SAMECLUSNODE(proc);
1161
if(nb_handle && nb_handle->agg_flag == SET) {
1162
if(direct) { armci_copy(src,dst,bytes); rc=0; }
1164
rc=armci_agg_save_descriptor(src,dst,bytes,proc,PUT,0,nb_handle);
1169
/*armci_wait needs proc to compute direct*/
1170
INIT_NB_HANDLE(nb_handle,PUT,proc);
1171
armci_copy(src,dst,bytes);
1175
rc=PARMCI_NbPutS(src, NULL,dst,NULL, &bytes,0,proc,uhandle);
1177
# ifdef ARMCI_NB_PUT
1178
INIT_NB_HANDLE(nb_handle,PUT,proc);
1179
UPDATE_FENCE_STATE(proc, PUT, 1);
1180
ARMCI_NB_PUT(src, dst, bytes, proc, &nb_handle->cmpl_info);
1182
rc=PARMCI_NbPutS(src, NULL,dst,NULL, &bytes,0,proc,uhandle);
1187
ARMCI_PR_DBG("exit",proc);
1192
/* Non-blocking contiguous get of `bytes` bytes from src on proc into dst.
 *
 * Returns FAIL when src or dst is NULL.
 *
 * Visible alternatives:
 *   - aggregate handle (agg_flag == SET): copy immediately when proc is on
 *     the same cluster node, otherwise record the request with
 *     armci_agg_save_descriptor;
 *   - direct: initialise the handle for GET and copy with armci_copy;
 *   - ARMCI_NB_GET available: initialise the handle and issue ARMCI_NB_GET
 *     with the handle's cmpl_info;
 *   - otherwise forward to PARMCI_NbGetS with zero stride levels.
 * NOTE(review): the tests choosing between these alternatives are not
 * visible in this listing. */
int PARMCI_NbGet(void *src, void* dst, int bytes, int proc,armci_hdl_t* uhandle)
1196
armci_ihdl_t nb_handle = (armci_ihdl_t)uhandle;
1197
ARMCI_PR_DBG("enter",proc);
1199
if(src == NULL || dst == NULL) return FAIL;
1201
direct =SAMECLUSNODE(proc);
1203
if(nb_handle && nb_handle->agg_flag == SET) {
1204
if(direct) { armci_copy(src,dst,bytes); rc=0; }
1206
rc=armci_agg_save_descriptor(src,dst,bytes,proc,GET,0,nb_handle);
1211
/*armci_wait needs proc to compute direct*/
1212
/* record GET, matching the ARMCI_NB_GET path of this function, so the
 * handle describes the operation that was actually issued */
INIT_NB_HANDLE(nb_handle,GET,proc);
1213
armci_copy(src,dst,bytes);
1217
rc=PARMCI_NbGetS(src, NULL,dst,NULL, &bytes,0,proc,uhandle);
1219
# ifdef ARMCI_NB_GET
1220
/*set tag and op in the nb handle*/
1221
INIT_NB_HANDLE(nb_handle,GET,proc);
1223
ARMCI_NB_GET(src, dst, bytes, proc, &nb_handle->cmpl_info);
1225
rc=PARMCI_NbGetS(src, NULL,dst,NULL, &bytes,0,proc,uhandle);
1229
ARMCI_PR_DBG("exit",proc);
1234
/* Blocking put or get of a single value to/from process proc.
 *
 * After ORDER, the visible alternatives are:
 *   - REMOTE_OP without QUADRICS: armci_rem_strided with zero stride levels;
 *     a non-zero result ends in armci_die;
 *   - put side: update fence state and the ack counter, then BG1S_Memput or
 *     armci_put;
 *   - get side: update the get counter, then BG1S_Memget or armci_get.
 * On LAPI / Elan builds the function then waits (WAIT_FOR_GETS /
 * WAIT_FOR_PUTS) when proc is not the calling process.
 * NOTE(review): the remaining parameter(s), the local declarations and the
 * tests choosing between these alternatives are not visible in this
 * listing. */
static void _armci_rem_value(int op, void *src, void *dst, int proc,
1237
int armci_th_idx = ARMCI_THREAD_IDX;
1239
ORDER(op,proc); /* ensure ordering */
1241
#if defined(REMOTE_OP) && !defined(QUADRICS)
1242
rc = armci_rem_strided(op, NULL, proc, src, NULL, dst, NULL,
1243
&bytes, 0, NULL, 0, NULL);
1244
if(rc) armci_die("ARMCI_Value: armci_rem_strided incomplete", FAIL6);
1248
UPDATE_FENCE_STATE(proc, PUT, 1);
1250
SET_COUNTER(ack_cntr[armci_th_idx], 1);
1253
/* fprintf(stderr,"bytes: %d\n",bytes); */
1254
/* this call is blocking, so local count is fine */
1257
BGML_Callback_t cb_wait={wait_callback, &count};
1258
BG1S_Memput(&req, proc, src, 0, dst, bytes, &cb_wait, 1);
1262
armci_put(src, dst, bytes, proc);
1267
SET_COUNTER(get_cntr[armci_th_idx], 1);
1270
/* fprintf(stderr,"before memget\n"); */
1273
BGML_Callback_t cb_wait={wait_callback, &count};
1274
BG1S_Memget(&req, proc, dst, 0, src, bytes, &cb_wait, 1);
1278
armci_get(src, dst, bytes, proc);
1282
/* deal with non-blocking loads and stores */
1283
# if defined(LAPI) || defined(_ELAN_PUTGET_H)
1284
if(proc != armci_me){
1286
WAIT_FOR_GETS; /* wait for data arrival */
1288
WAIT_FOR_PUTS; /* data must be copied out*/
1295
/* non-blocking remote value put/get operation
 *
 * With an aggregate handle (agg_flag == SET) the request is recorded with
 * armci_agg_save_descriptor.  Otherwise fence information is updated for a
 * PUT, the handle's tag, proc and bufid are filled in (or an implicit handle
 * is obtained from armci_set_implicit_handle), and the transfer uses
 * armci_rem_strided with zero stride levels (REMOTE_OP without QUADRICS),
 * or armci_put / armci_get after the fence-state and counter updates.
 * On LAPI / Elan builds the function waits (WAIT_FOR_GETS / WAIT_FOR_PUTS)
 * when proc is not the calling process.
 * NOTE(review): the tests choosing between these alternatives, and the
 * declaration of `pv`, are not visible in this listing. */
1296
static void _armci_nb_rem_value(int op, void *src, void *dst, int proc,
1297
int bytes, armci_ihdl_t nb_handle) {
1299
int armci_th_idx = ARMCI_THREAD_IDX;
1301
if(nb_handle && nb_handle->agg_flag == SET) {
1303
(void)armci_agg_save_descriptor(src,dst,bytes,proc,op,pv,nb_handle);
1307
if(op==PUT) UPDATE_FENCE_INFO(proc);
1309
/*set tag and op in the nb handle*/
1311
nb_handle->tag = GET_NEXT_NBTAG();
1313
nb_handle->proc= proc;
1314
nb_handle->bufid=NB_NONE;
1317
nb_handle = armci_set_implicit_handle(op, proc);
1320
#if defined(REMOTE_OP) && !defined(QUADRICS)
1321
rc = armci_rem_strided(op, NULL, proc, src, NULL, dst, NULL,
1322
&bytes, 0, NULL, 0, nb_handle);
1323
if(rc) armci_die("ARMCI_Value: armci_rem_strided incomplete", FAIL6);
1327
UPDATE_FENCE_STATE(proc, PUT, 1);
1329
SET_COUNTER(ack_cntr[armci_th_idx], 1);
1331
armci_put(src, dst, bytes, proc);
1335
SET_COUNTER(get_cntr[armci_th_idx], 1);
1337
armci_get(src, dst, bytes, proc);
1340
/* deal with non-blocking loads and stores */
1341
# if defined(LAPI) || defined(_ELAN_PUTGET_H)
1346
if(proc != armci_me){
1348
WAIT_FOR_GETS; /* wait for data arrival */
1350
WAIT_FOR_PUTS; /* data must be copied out*/
1359
/*
 * CHK_ERR(dst, proc): argument validation whose error messages are
 * attributed to PARMCI_PutValue.
 *
 *   dst  - destination pointer; armci_die(..., FAIL) is called if it is NULL
 *   proc - process rank; armci_die(..., proc) is called if it is negative
 *
 * The checks are wrapped in do { } while(0) so the macro expands to exactly
 * one statement: it can be used as the body of an if/else without the second
 * check escaping the guard or capturing a following `else`.  Arguments are
 * parenthesised so that expression arguments are evaluated as a unit.
 * Use it as a statement with a trailing semicolon: CHK_ERR(dst, proc);
 */
#define CHK_ERR(dst, proc) \
do { \
    if((dst)==NULL) armci_die("PARMCI_PutValue: NULL pointer passed",FAIL); \
    if((proc)<0) armci_die("PARMCI_PutValue: Invalid process rank", (proc)); \
} while(0)
1363
/*
 * CHK_ERR_GET(src, dst, proc, bytes): argument validation whose error
 * messages are attributed to PARMCI_GetValue.
 *
 *   src, dst - source / destination pointers; armci_die(..., FAIL) is called
 *              if either is NULL
 *   proc     - process rank; armci_die(..., proc) is called if it is negative
 *   bytes    - transfer size; armci_die(..., bytes) is called if it is
 *              negative (zero is accepted)
 *
 * The checks are wrapped in do { } while(0) so the macro expands to exactly
 * one statement: it can be used as the body of an if/else without the later
 * checks escaping the guard or capturing a following `else`.  Arguments are
 * parenthesised so that expression arguments are evaluated as a unit.
 * Use it as a statement with a trailing semicolon, as the callers do.
 */
#define CHK_ERR_GET(src, dst, proc, bytes) \
do { \
    if((src)==NULL || (dst)==NULL) armci_die("PARMCI_GetValue: NULL pointer passed",FAIL); \
    if((proc)<0) armci_die("PARMCI_GetValue: Invalid process rank", (proc)); \
    if((bytes)<0) armci_die("PARMCI_GetValue: Invalid size", (bytes)); \
} while(0)
1369
* Register-Originated Put.
1371
/*
 * PARMCI_PutValueInt: write the int value `src` to `dst` for target process
 * `proc`.
 *
 * When SAMECLUSNODE(proc) is true the value is stored directly through dst;
 * otherwise sizeof(int) bytes are sent from the by-value copy of src with
 * _armci_rem_value(PUT, ...).
 *
 * NOTE(review): the opening brace, any argument checks and the return
 * statement are on lines not visible in this view.
 */
int PARMCI_PutValueInt(int src, void *dst, int proc)
1374
if( SAMECLUSNODE(proc) ) *(int *)dst = src;
1375
else _armci_rem_value(PUT, &src, dst, proc, sizeof(int));
1379
/*
 * PARMCI_PutValueLong: write the long value `src` to `dst` for target
 * process `proc`.
 *
 * When SAMECLUSNODE(proc) is true the value is stored directly through dst;
 * otherwise sizeof(long) bytes are sent from the by-value copy of src with
 * _armci_rem_value(PUT, ...).
 *
 * NOTE(review): the opening brace, any argument checks and the return
 * statement are on lines not visible in this view.
 */
int PARMCI_PutValueLong(long src, void *dst, int proc)
1382
if( SAMECLUSNODE(proc) ) *(long *)dst = src;
1383
else _armci_rem_value(PUT, &src, dst, proc, sizeof(long));
1387
/*
 * PARMCI_PutValueFloat: write the float value `src` to `dst` for target
 * process `proc`.
 *
 * When SAMECLUSNODE(proc) is true the value is stored directly through dst;
 * otherwise sizeof(float) bytes are sent from the by-value copy of src with
 * _armci_rem_value(PUT, ...).
 *
 * NOTE(review): the opening brace, any argument checks and the return
 * statement are on lines not visible in this view.
 */
int PARMCI_PutValueFloat(float src, void *dst, int proc)
1390
if( SAMECLUSNODE(proc) ) *(float *)dst = src;
1391
else _armci_rem_value(PUT, &src, dst, proc, sizeof(float));
1395
/*
 * PARMCI_PutValueDouble: write the double value `src` to `dst` for target
 * process `proc`.
 *
 * When SAMECLUSNODE(proc) is true the value is stored directly through dst;
 * otherwise sizeof(double) bytes are sent from the by-value copy of src with
 * _armci_rem_value(PUT, ...).
 *
 * NOTE(review): the opening brace, any argument checks and the return
 * statement are on lines not visible in this view.
 */
int PARMCI_PutValueDouble(double src, void *dst, int proc)
1398
if( SAMECLUSNODE(proc) ) *(double *)dst = src;
1399
else _armci_rem_value(PUT, &src, dst, proc, sizeof(double));
1404
* Non-Blocking register-originated put.
1406
/*
 * PARMCI_NbPutValueInt: non-blocking write of the int value `src` to `dst`
 * for target process `proc`.
 *
 * When SAMECLUSNODE(proc) is true the value is stored directly through dst
 * and usr_hdl is not touched by the visible code; otherwise sizeof(int)
 * bytes are sent with _armci_nb_rem_value(PUT, ...), with usr_hdl cast to
 * armci_ihdl_t and forwarded as the non-blocking handle.
 *
 * NOTE(review): &src is the address of a by-value parameter, which stops
 * being valid when this function returns; this is only safe if the callee
 * has copied the value out before returning -- confirm.
 * NOTE(review): the opening brace, any argument checks and the return
 * statement are on lines not visible in this view.
 */
int PARMCI_NbPutValueInt(int src, void *dst, int proc, armci_hdl_t* usr_hdl)
1409
if( SAMECLUSNODE(proc) ) *(int *)dst = src;
1410
else _armci_nb_rem_value(PUT,&src,dst,proc,sizeof(int),(armci_ihdl_t)usr_hdl);
1414
/*
 * PARMCI_NbPutValueLong: non-blocking write of the long value `src` to `dst`
 * for target process `proc`.
 *
 * When SAMECLUSNODE(proc) is true the value is stored directly through dst
 * and usr_hdl is not touched by the visible code; otherwise sizeof(long)
 * bytes are sent with _armci_nb_rem_value(PUT, ...), with usr_hdl cast to
 * armci_ihdl_t and forwarded as the non-blocking handle.
 *
 * NOTE(review): &src is the address of a by-value parameter, which stops
 * being valid when this function returns; this is only safe if the callee
 * has copied the value out before returning -- confirm.
 * NOTE(review): the opening brace, any argument checks and the return
 * statement are on lines not visible in this view.
 */
int PARMCI_NbPutValueLong(long src, void *dst, int proc, armci_hdl_t* usr_hdl)
1417
if( SAMECLUSNODE(proc) ) *(long *)dst = src;
1418
else _armci_nb_rem_value(PUT,&src,dst,proc,sizeof(long),(armci_ihdl_t)usr_hdl);
1422
/*
 * PARMCI_NbPutValueFloat: non-blocking write of the float value `src` to
 * `dst` for target process `proc`.
 *
 * When SAMECLUSNODE(proc) is true the value is stored directly through dst
 * and usr_hdl is not touched by the visible code; otherwise sizeof(float)
 * bytes are sent with _armci_nb_rem_value(PUT, ...), with usr_hdl cast to
 * armci_ihdl_t and forwarded as the non-blocking handle.
 *
 * NOTE(review): &src is the address of a by-value parameter, which stops
 * being valid when this function returns; this is only safe if the callee
 * has copied the value out before returning -- confirm.
 * NOTE(review): the opening brace, any argument checks and the return
 * statement are on lines not visible in this view.
 */
int PARMCI_NbPutValueFloat(float src, void *dst, int proc, armci_hdl_t* usr_hdl)
1425
if( SAMECLUSNODE(proc) ) *(float *)dst = src;
1426
else _armci_nb_rem_value(PUT,&src,dst,proc,sizeof(float),(armci_ihdl_t)usr_hdl);
1430
/*
 * PARMCI_NbPutValueDouble: non-blocking write of the double value `src` to
 * `dst` for target process `proc`.
 *
 * When SAMECLUSNODE(proc) is true the value is stored directly through dst
 * and usr_hdl is not touched by the visible code; otherwise sizeof(double)
 * bytes are sent with _armci_nb_rem_value(PUT, ...), with usr_hdl cast to
 * armci_ihdl_t and forwarded as the non-blocking handle.
 *
 * NOTE(review): &src is the address of a by-value parameter, which stops
 * being valid when this function returns; this is only safe if the callee
 * has copied the value out before returning -- confirm.
 * NOTE(review): the opening brace, any argument checks and the return
 * statement are on lines not visible in this view.
 */
int PARMCI_NbPutValueDouble(double src, void *dst, int proc, armci_hdl_t* usr_hdl)
1433
if( SAMECLUSNODE(proc) ) *(double *)dst = src;
1434
else _armci_nb_rem_value(PUT,&src,dst,proc,sizeof(double),(armci_ihdl_t)usr_hdl);
1440
* Register-Originated Get.
1442
/*
 * PARMCI_GetValueInt: read an int from address `src` of process `proc` and
 * return it by value.
 *
 * When SAMECLUSNODE(proc) is true the value is read directly through src and
 * returned; otherwise sizeof(int) bytes are fetched into `dst` with
 * _armci_rem_value(GET, ...).
 *
 * NOTE(review): the declaration of dst, the opening brace and the final
 * return are on lines not visible in this view; dst is presumably a local
 * int that is returned after the remote get -- TODO confirm.
 */
int PARMCI_GetValueInt(void *src, int proc)
1445
if( SAMECLUSNODE(proc) ) return *(int *)src;
1446
else _armci_rem_value(GET, src, &dst, proc, sizeof(int));
1450
/*
 * PARMCI_GetValueLong: read a long from address `src` of process `proc` and
 * return it by value.
 *
 * When SAMECLUSNODE(proc) is true the value is read directly through src and
 * returned; otherwise sizeof(long) bytes are fetched into `dst` with
 * _armci_rem_value(GET, ...).
 *
 * NOTE(review): the declaration of dst, the opening brace and the final
 * return are on lines not visible in this view; dst is presumably a local
 * long that is returned after the remote get -- TODO confirm.
 */
long PARMCI_GetValueLong(void *src, int proc)
1453
if( SAMECLUSNODE(proc) ) return *(long *)src;
1454
else _armci_rem_value(GET, src, &dst, proc, sizeof(long));
1458
/*
 * PARMCI_GetValueFloat: read a float from address `src` of process `proc`
 * and return it by value.
 *
 * When SAMECLUSNODE(proc) is true the value is read directly through src and
 * returned; otherwise sizeof(float) bytes are fetched into `dst` with
 * _armci_rem_value(GET, ...).
 *
 * NOTE(review): the declaration of dst, the opening brace and the final
 * return are on lines not visible in this view; dst is presumably a local
 * float that is returned after the remote get -- TODO confirm.
 */
float PARMCI_GetValueFloat(void *src, int proc)
1461
if( SAMECLUSNODE(proc) ) return *(float *)src;
1462
else _armci_rem_value(GET, src, &dst, proc, sizeof(float));
1466
/*
 * PARMCI_GetValueDouble: read a double from address `src` of process `proc`
 * and return it by value.
 *
 * When SAMECLUSNODE(proc) is true the value is read directly through src and
 * returned; otherwise sizeof(double) bytes are fetched into `dst` with
 * _armci_rem_value(GET, ...).
 *
 * NOTE(review): the declaration of dst, the opening brace and the final
 * return are on lines not visible in this view; dst is presumably a local
 * double that is returned after the remote get -- TODO confirm.
 */
double PARMCI_GetValueDouble(void *src, int proc)
1469
if( SAMECLUSNODE(proc) ) return *(double *)src;
1470
else _armci_rem_value(GET, src, &dst, proc, sizeof(double));
1478
* Register-Originated Get.
1480
/*
 * PARMCI_GetValue: copy `bytes` bytes from address `src` of process `proc`
 * into the caller-supplied buffer `dst`.
 *
 * Arguments are validated first with CHK_ERR_GET, which calls armci_die for
 * a NULL src or dst, a negative proc or a negative bytes.  When
 * SAMECLUSNODE(proc) is true the data is copied with armci_copy; otherwise
 * it is fetched with _armci_rem_value(GET, ...).
 *
 * NOTE(review): the opening brace and the return statement are on lines not
 * visible in this view.
 */
int PARMCI_GetValue(void *src, void *dst, int proc, int bytes)
1482
CHK_ERR_GET(src, dst, proc, bytes);
1483
if( SAMECLUSNODE(proc) ) { armci_copy(src, dst, bytes); }
1484
else _armci_rem_value(GET, src, dst, proc, bytes);
1489
* Non-Blocking register-originated get.
1491
int PARMCI_NbGetValue(void *src, void *dst, int proc, int bytes, armci_hdl_t* usr_hdl)
1493
CHK_ERR_GET(src, dst, proc, bytes);
1494
if( SAMECLUSNODE(proc) ) { armci_copy(src, dst, bytes); }
1495
else _armci_nb_rem_value(GET, src, dst, proc, bytes, (armci_ihdl_t)usr_hdl);