14
#define DATA_SERVER_ 1
16
// by default use the ds remote gets
17
#define ORNL_USE_DS_FOR_REMOTE_GETS
19
#ifdef ORNL_USE_DS_FOR_REMOTE_GETS
20
#define DATA_SERVER_GET_ 1
22
#define DATA_SERVER_GET_ 0
25
#define ARMCI_OP_2D(op, scale, proc, src, dst, bytes, count, src_stride, dst_stride,lockit)\
26
if(op == GET || op ==PUT)\
27
armci_copy_2D(op, proc, src, dst, bytes, count, src_stride,dst_stride);\
28
else if(count==1) armci_acc_1D(op, scale, proc, src, dst, bytes,lockit);\
30
armci_acc_2D(op, scale, proc, src, dst, bytes, count, src_stride,dst_stride,lockit)
32
/* macro supports run-time selection of request sending scheme */
33
#if defined(CLIENT_BUF_BYPASS)
34
#define CAN_REQUEST_DIRECTLY _armci_bypass
37
# define CAN_REQUEST_DIRECTLY 0
39
# define CAN_REQUEST_DIRECTLY 1
43
#define PREPROCESS_STRIDED(tmp_count) {\
46
for(;stride_levels;stride_levels--)if(count[stride_levels]>1)break;\
47
if(stride_levels&&(count[0]==src_stride_arr[0]&&count[0]==dst_stride_arr[0])){\
48
tmp_count=seg_count[1];\
50
seg_count[1] = seg_count[0] * seg_count[1];\
52
src_stride_arr ++; dst_stride_arr++ ;\
55
/*
 * POSTPROCESS_STRIDED(tmp_count)
 *
 * Counterpart of PREPROCESS_STRIDED: when PREPROCESS_STRIDED saved the
 * caller's seg_count[1] into tmp_count (leaving it nonzero) before
 * overwriting seg_count[1], this writes the saved value back.  When
 * tmp_count is zero nothing is modified.
 *
 * The macro refers to `seg_count` by name, so it can only be used in a scope
 * where an int array/pointer of that name is visible.
 *
 * The body is wrapped in do { ... } while (0) so that the expansion is a
 * single statement: a bare `if` here would capture an `else` that follows
 * the macro at the call site.  Use it as a statement, with a trailing `;`.
 */
#define POSTPROCESS_STRIDED(tmp_count) \
    do { if (tmp_count) seg_count[1] = (tmp_count); } while (0)
58
/*
 * Protocol codes.  These are the values passed as the second argument of
 * DO_FENCE, which compares them against each other and records the most
 * recent one per target process in armci_prot_switch_fence[].
 * NOTE(review): SERVER_NBGET is not referenced in the visible part of this
 * file -- confirm where it is used before changing its value.
 */
#define SERVER_NBPUT 2
60
#define DIRECT_NBPUT 4
62
#define SERVER_NBGET 6
64
#define DIRECT_NBGET 8
65
#define ONESIDED_PUT 9
66
#define ONESIDED_GET 10
69
# define DO_FENCE(__proc,__prot) \
70
if(__prot==SERVER_GET); \
71
else if(__prot==SERVER_PUT); \
72
else if(__prot==DIRECT_GET || __prot==DIRECT_NBGET) { \
73
if(armci_prot_switch_fence[__proc]==SERVER_PUT) \
74
ARMCI_DoFence(__proc); \
76
else if(__prot==DIRECT_PUT || __prot==DIRECT_NBPUT) { \
77
if(armci_prot_switch_fence[__proc]==SERVER_PUT) \
78
ARMCI_DoFence(__proc); \
80
else if(__prot==ONESIDED_GET) { \
81
if(armci_prot_switch_fence[__proc]==SERVER_PUT) { \
82
ARMCI_DoFence(__proc); \
85
else if(__prot==ONESIDED_PUT); \
87
armci_prot_switch_fence[__proc]=__prot
89
#ifndef REGIONS_REQUIRE_MEMHDL
90
# define ARMCI_MEMHDL_T void
93
ARMCI_MEMHDL_T *mhloc=NULL,*mhrem=NULL;
95
#ifdef REGIONS_REQUIRE_MEMHDL
96
int armci_region_both_found_hndl(void *loc, void *rem, int size, int node,
97
ARMCI_MEMHDL_T **loc_memhdl,ARMCI_MEMHDL_T **rem_memhdl);
98
# define ARMCI_REGION_BOTH_FOUND(_s,_d,_b,_p) \
99
armci_region_both_found_hndl((_s),(_d),(_b),(_p),&mhloc,&mhrem)
101
# define ARMCI_REGION_BOTH_FOUND(_s,_d,_b,_p) \
102
armci_region_both_found((_s),(_d),(_b),(_p))
107
# ifdef REGIONS_REQUIRE_MEMHDL
108
void armci_client_direct_get(int p, void *src_buf, void *dst_buf, int len,
109
void** cptr,int nbtag,ARMCI_MEMHDL_T *lochdl,ARMCI_MEMHDL_T *remhdl);
111
void armci_client_direct_get(int p, void *src_buf, void *dst_buf, int len,
112
void** contextptr,int nbtag,void *mhdl,void *mhdl1);
114
# define ARMCI_NBREM_GET(_p,_s,_sst,_d,_dst,_cou,_lev,_hdl) \
115
armci_client_direct_get((_p),(_s),(_d),(_cou)[0],&((_hdl)->cmpl_info),(_hdl)->tag,(void *)mhloc,(void *)mhrem); \
117
# define ARMCI_REM_GET(_p,_s,_sst,_d,_dst,_cou,_lev,_hdl) \
118
armci_client_direct_get((_p),(_s),(_d),(_cou)[0],NULL,0,(void *)mhloc,(void *)mhrem); \
122
# define ARMCI_REM_GET(_p,_s,_sst,_d,_dst,_cou,_lev,_hdl) \
123
armci_rem_get((_p),(_s),(_sst),(_d),(_dst),(_cou),(_lev),(_hdl),(void *)mhloc,(void *)mhrem)
124
# define ARMCI_NBREM_GET ARMCI_REM_GET
128
/* Indexed by target process: DO_FENCE reads armci_prot_switch_fence[proc] to
 * decide whether a fence is needed and then stores the current protocol code
 * into it. */
extern int* armci_prot_switch_fence;
129
/* NOTE(review): the next two variables are not referenced in the visible part
 * of this file; presumably they belong to the same protocol-switch
 * bookkeeping -- confirm against their definition. */
extern int armci_prot_switch_preproc;
130
extern int armci_prot_switch_preop;
133
/* Scratch integer array.  armci_acc_copy_strided aliases it as
 * buf_stride_arr and fills it with the strides describing the layout of
 * armci_internal_buffer.
 * NOTE(review): that fill loop writes indices 0..stride_levels, while the
 * public entry points accept stride_levels == MAX_STRIDE_LEVEL -- confirm the
 * array is large enough for that case. */
int armci_iwork[MAX_STRIDE_LEVEL];
135
/*\ 2-dimensional array copy
137
static void armci_copy_2D(int op, int proc, void *src_ptr, void *dst_ptr,
138
int bytes, int count, int src_stride, int dst_stride)
140
int armci_th_idx = ARMCI_THREAD_IDX;
151
int shmem = SAMECLUSNODE(proc);
154
/* data is in local/shared memory -- can use memcpy */
155
// printf("%s: shmem==true; count==%d\n",Portals_ID(),count);
157
armci_copy(src_ptr, dst_ptr, bytes);
158
// printf("%s: shmem==true; finished\n",Portals_ID(),count);
160
char *ps=(char*)src_ptr;
161
char *pd=(char*)dst_ptr;
163
for (j = 0; j < count; j++){
171
/* data not in local/shared memory-access through global address space*/
175
// printf("%s: pre UPDATE_FENCE_STATE\n",Portals_ID());
176
UPDATE_FENCE_STATE(proc, PUT, COUNT);
177
// printf("%s: post UPDATE_FENCE_STATE\n",Portals_ID());
179
SET_COUNTER(ack_cntr[armci_th_idx],COUNT);
182
armci_put(src_ptr, dst_ptr, bytes, proc);
184
armci_put2D(proc, bytes, count, src_ptr, src_stride,
185
dst_ptr, dst_stride);
191
SET_COUNTER(get_cntr[armci_th_idx], COUNT);
194
armci_get(src_ptr, dst_ptr, bytes, proc);
196
armci_get2D(proc, bytes, count, src_ptr, src_stride,
197
dst_ptr, dst_stride);
204
#if (defined(CRAY) && !defined(__crayx1)) || defined(FUJITSU)
208
# define DAXPY daxpy_
212
#define THRESH_ACC 32
214
static void daxpy_2d_(void* alpha, int *rows, int *cols, void *a, int *ald,
218
double *A = (double*)a;
219
double *B = (double*)b;
220
double Alpha = *(double*)alpha;
222
if(*rows < THRESH_ACC)
225
A[c* *ald+ r] += Alpha * B[c* *bld+r];
226
else for(c=0;c<*cols;c++)
227
DAXPY(rows, alpha, B + c* *bld, &ONE, A + c* *ald, &ONE);
232
void armci_acc_1D(int op, void *scale, int proc, void *src, void *dst, int bytes, int lockit)
235
void (*func)(void*, void*, void*, int*);
236
ARMCI_PR_DBG("enter",0);
239
rows = bytes/sizeof(int);
240
func = I_ACCUMULATE_1D;
243
rows = bytes/sizeof(long);
244
func = L_ACCUMULATE_1D;
247
rows = bytes/sizeof(double);
248
func = D_ACCUMULATE_1D;
251
rows = bytes/(2*sizeof(double));
252
func = Z_ACCUMULATE_1D;
255
rows = bytes/(2*sizeof(float));
256
func = C_ACCUMULATE_1D;
259
rows = bytes/sizeof(float);
260
func = F_ACCUMULATE_1D;
262
default: armci_die("ARMCI accumulate: operation not supported",op);
263
func = F_ACCUMULATE_1D; /*avoid compiler whining */
268
ARMCI_LOCKMEM(dst, bytes + (char*)dst, proc);
270
func(scale, dst, src, &rows);
271
if(lockit)ARMCI_UNLOCKMEM(proc);
272
ARMCI_PR_DBG("exit",0);
275
/*\ 2-dimensional accumulate
277
void armci_acc_2D(int op, void* scale, int proc, void *src_ptr, void *dst_ptr,
278
int bytes, int cols, int src_stride, int dst_stride, int lockit)
280
int rows, lds, ldd, span;
281
void (*func)(void*, int*, int*, void*, int*, void*, int*);
283
ARMCI_PR_DBG("enter",0);
286
if((long)src_ptr%ALIGN)armci_die("src not aligned",(long)src_ptr);
287
if((long)dst_ptr%ALIGN)armci_die("src not aligned",(long)dst_ptr);
292
rows = bytes/sizeof(int);
293
ldd = dst_stride/sizeof(int);
294
lds = src_stride/sizeof(int);
295
func = I_ACCUMULATE_2D;
298
rows = bytes/sizeof(long);
299
ldd = dst_stride/sizeof(long);
300
lds = src_stride/sizeof(long);
301
func = L_ACCUMULATE_2D;
304
rows = bytes/sizeof(double);
305
ldd = dst_stride/sizeof(double);
306
lds = src_stride/sizeof(double);
307
func = D_ACCUMULATE_2D;
310
rows = bytes/(2*sizeof(double));
311
ldd = dst_stride/(2*sizeof(double));
312
lds = src_stride/(2*sizeof(double));
313
func = Z_ACCUMULATE_2D;
316
rows = bytes/(2*sizeof(float));
317
ldd = dst_stride/(2*sizeof(float));
318
lds = src_stride/(2*sizeof(float));
319
func = C_ACCUMULATE_2D;
322
rows = bytes/sizeof(float);
323
ldd = dst_stride/sizeof(float);
324
lds = src_stride/sizeof(float);
325
func = F_ACCUMULATE_2D;
328
rows = bytes/sizeof(long);
329
ldd = dst_stride/sizeof(long);
330
lds = src_stride/sizeof(long);
331
func = RA_ACCUMULATE_2D_;
333
default: armci_die("ARMCI accumulate: operation not supported",op);
334
func = F_ACCUMULATE_2D; /*avoid compiler whining */
339
span = cols*dst_stride;
340
ARMCI_LOCKMEM(dst_ptr, span + (char*)dst_ptr, proc);
342
func(scale, &rows, &cols, dst_ptr, &ldd, src_ptr, &lds);
343
if(lockit)ARMCI_UNLOCKMEM(proc);
344
ARMCI_PR_DBG("exit",0);
349
/*\ compute range of strided data AND lock it
352
armci_lockmem_patch(void* dst_ptr, int dst_stride_arr[], int count[], int stride_levels, int proc)
354
long span = count[stride_levels];
355
ARMCI_PR_DBG("enter",0);
356
span *= dst_stride_arr[stride_levels-1];
358
/* lock region of remote memory */
359
ARMCI_LOCKMEM(dst_ptr, span + (char*)dst_ptr, proc);
360
ARMCI_PR_DBG("exit",0);
364
/*\ strided accumulate on top of remote memory copy:
365
* copies remote data to local buffer, accumulates, puts it back
366
* Note: if we are here then remote patch must fit in the ARMCI buffer
368
int armci_acc_copy_strided(int optype, void* scale, int proc,
369
void* src_ptr, int src_stride_arr[],
370
void* dst_ptr, int dst_stride_arr[],
371
int count[], int stride_levels)
373
void *buf_ptr = armci_internal_buffer;
374
int rc, i, *buf_stride_arr = armci_iwork;
375
ARMCI_PR_DBG("enter",0);
376
armci_lockmem_patch(dst_ptr,dst_stride_arr, count, stride_levels, proc);
378
/* setup stride array for internal buffer */
379
buf_stride_arr[0]=count[0];
380
for(i=0; i< stride_levels; i++) {
381
buf_stride_arr[i+1]= buf_stride_arr[i]*count[i+1];
384
/* get remote data to local buffer */
385
rc = armci_op_strided(GET, scale, proc, dst_ptr, dst_stride_arr, buf_ptr,
386
buf_stride_arr, count, stride_levels, 0,NULL);
388
if(rc) { ARMCI_UNLOCKMEM(proc); return(rc); }
390
/* call local accumulate with lockit=0 (we locked it already) and proc=me */
391
rc = armci_op_strided(optype, scale, armci_me, src_ptr, src_stride_arr,
392
buf_ptr,buf_stride_arr, count, stride_levels,0,NULL);
393
if(rc) { ARMCI_UNLOCKMEM(proc); return(rc); }
395
/* put data back from the buffer to remote location */
396
rc = armci_op_strided(PUT, scale, proc, buf_ptr, buf_stride_arr, dst_ptr,
397
dst_stride_arr, count, stride_levels,0,NULL);
399
FENCE_NODE(proc); /* make sure put completes before unlocking */
400
ARMCI_UNLOCKMEM(proc); /* release memory lock */
401
ARMCI_PR_DBG("exit",0);
408
/*\ Strided operation
410
int armci_op_strided(int op, void* scale, int proc,void *src_ptr,
411
int src_stride_arr[], void* dst_ptr, int dst_stride_arr[],
412
int count[], int stride_levels, int lockit,
413
armci_ihdl_t nb_handle)
415
char *src = (char*)src_ptr, *dst=(char*)dst_ptr;
416
int s2, s3, i,j, unlockit=0;
418
int index[MAX_STRIDE_LEVEL], unit[MAX_STRIDE_LEVEL];
419
ARMCI_PR_DBG("enter",op);
420
# if defined(ACC_COPY)
423
if(ARMCI_ACC(op) && !(SAMECLUSNODE(proc)) )
425
if ( ARMCI_ACC(op) && proc!=armci_me)
427
/* copy remote data, accumulate, copy back*/
428
return (armci_acc_copy_strided(op,scale, proc, src_ptr, src_stride_arr,
429
dst_ptr, dst_stride_arr, count, stride_levels));
431
else; /* do it directly through shared/local memory */
435
if(ARMCI_ACC(op) && (stride_levels>2) && lockit){
436
/* we need one lock operation only - must be done outside 2d acc */
437
armci_lockmem_patch(dst_ptr,dst_stride_arr, count, stride_levels, proc);
441
/* if(proc!=armci_me) INTR_OFF;*/
442
if(armci_me>=0 && !SAMECLUSNODE(proc)) {
443
printf("%d network_strided not supported (in op_strided)\n",cos_me);
446
armci_network_strided(op,scale,proc,src_ptr,src_stride_arr,dst_ptr,
447
dst_stride_arr,count,stride_levels,nb_handle);
451
// printf("%s in large switch stmt in op_strided (stride_levels=%d)\n",Portals_ID(),stride_levels);
452
switch (stride_levels) {
453
case 0: /* 1D copy */
455
ARMCI_OP_2D(op, scale, proc, src_ptr, dst_ptr, count[0], 1,
456
count[0], count[0], lockit);
461
ARMCI_OP_2D(op, scale, proc, src_ptr, dst_ptr, count[0], count[1],
462
src_stride_arr[0], dst_stride_arr[0], lockit);
466
for (s2= 0; s2 < count[2]; s2++){ /* 2D copy */
467
ARMCI_OP_2D(op, scale, proc, src+s2*src_stride_arr[1],
468
dst+s2*dst_stride_arr[1], count[0], count[1],
469
src_stride_arr[0], dst_stride_arr[0], lockit );
474
for(s3=0; s3< count[3]; s3++){
475
src = (char*)src_ptr + src_stride_arr[2]*s3;
476
dst = (char*)dst_ptr + dst_stride_arr[2]*s3;
477
for (s2= 0; s2 < count[2]; s2++){ /* 3D copy */
478
ARMCI_OP_2D(op, scale, proc, src+s2*src_stride_arr[1],
479
dst+s2*dst_stride_arr[1],
480
count[0], count[1],src_stride_arr[0],
481
dst_stride_arr[0],lockit);
486
default: /* N-dimensional */
488
/* stride_levels is not the same as ndim. it is ndim-1
489
* For example a 10x10x10... array, suppose the datatype is byte
490
* the stride_arr is 10, 10x10, 10x10x10 ....
492
index[2] = 0; unit[2] = 1; total_of_2D = count[2];
493
for(j=3; j<=stride_levels; j++) {
494
index[j] = 0; unit[j] = unit[j-1] * count[j-1];
495
total_of_2D *= count[j];
498
for(i=0; i<total_of_2D; i++) {
499
src = (char *)src_ptr; dst = (char *)dst_ptr;
500
for(j=2; j<=stride_levels; j++) {
501
src += index[j] * src_stride_arr[j-1];
502
dst += index[j] * dst_stride_arr[j-1];
504
if(((i+1) % unit[j]) == 0) index[j]++;
505
if(index[j] >= count[j]) index[j] = 0;
508
ARMCI_OP_2D(op, scale, proc, src, dst, count[0], count[1],
509
src_stride_arr[0], dst_stride_arr[0], lockit);
516
// printf("%s after switch stmt; prior to fence/lock\n",Portals_ID());
519
# if defined(ACC_COPY)
522
ARMCI_UNLOCKMEM(proc); /* release memory lock */
525
// printf("%s after fence/lock; leaving op_strided\n",Portals_ID());
526
ARMCI_PR_DBG("exit",op);
531
int PARMCI_PutS( void *src_ptr, /* pointer to 1st segment at source*/
532
int src_stride_arr[], /* array of strides at source */
533
void* dst_ptr, /* pointer to 1st segment at destination*/
534
int dst_stride_arr[], /* array of strides at destination */
535
int seg_count[], /* number of segments at each stride
536
levels: count[0]=bytes*/
537
int stride_levels, /* number of stride levels */
538
int proc /* remote process(or) ID */
542
int *count=seg_count, tmp_count=0;
544
ARMCI_PR_DBG("enter",proc);
545
if(src_ptr == NULL || dst_ptr == NULL) return FAIL;
546
if(seg_count[0]<0)return FAIL3;
547
if(stride_levels <0 || stride_levels > MAX_STRIDE_LEVEL) return FAIL4;
548
if(proc<0)return FAIL5;
550
ORDER(PUT,proc); /* ensure ordering */
551
PREPROCESS_STRIDED(tmp_count);
554
if(stride_levels) direct=SAMECLUSNODE(proc);
555
direct=SAMECLUSNODE(proc);
558
// printf("%s direct=%d, proc=%d\n",Portals_ID(),direct,proc);
562
# ifdef CRAY_REGISTER_ARMCI_MALLOC
563
if (stride_levels == 0 && armci_onesided_direct_put_enabled)
565
// maybe move this to just before the operation happens, or even after it happens
566
// it's possible we may want to skip the ONESIDED_PUT and do a SERVER_PUT if we can't find the remote mdh
567
DO_FENCE(proc,SERVER_PUT);
569
// local variable within stride_level == 0 scope
570
cos_desc_t *comm_desc = &__global_1sided_direct_comm_desc;
571
onesided_hnd_t cp_hnd;
572
cos_mdesc_t local_mdh, remote_mdh;
575
armci_onesided_search_remote_mdh_list(dst_ptr, proc, &remote_mdh);
577
// register local memory -- this should use abhinav's dreg routines
578
cpMemRegister(src_ptr, count[0], &local_mdh);
579
// onesided_mem_register(cp_hnd, src_ptr, count[0], NULL, &local_mdh);
581
// get the onesided v2.0 api handle for the compute process
582
cpGetOnesidedHandle(&cp_hnd);
584
// initialize onesided communication descriptor
585
onesided_desc_init(cp_hnd, &local_mdh, &remote_mdh, 0, comm_desc);
588
onesided_put_nb(comm_desc);
590
// complete put [locally]
591
onesided_wait(comm_desc);
593
// deregister memory -- if we were using the dreg routines, we would let the
594
// dreg memory do this for us "on demand" = lazy mem deregistration
595
cpMemDeregister(&local_mdh);
596
// onesided_mem_deregister(cp_hnd, &local_mdh);
598
// issue a flushing get - does nothing to fix the fence problem
600
static long flushaddr = 911;
601
cpMemRegister(&flushaddr, sizeof(long), &local_mdh);
602
onesided_desc_init(cp_hnd, &local_mdh, &remote_mdh, NULL, comm_desc);
603
onesided_get_nb(comm_desc);
604
onesided_wait(comm_desc);
605
cpMemDeregister(&local_mdh);
614
DO_FENCE(proc,SERVER_PUT);
615
// printf("%s calling pack_strided in PARMCI_PutS\n",Portals_ID());
616
rc = armci_pack_strided(PUT, NULL, proc, src_ptr, src_stride_arr,dst_ptr,
617
dst_stride_arr, count, stride_levels, NULL, -1, -1, -1,NULL);
622
if(!SAMECLUSNODE(proc))DO_FENCE(proc,DIRECT_PUT);
623
// printf("%s calling op_strided in PARMCI_PutS\n",Portals_ID());
624
rc = armci_op_strided( PUT, NULL, proc, src_ptr, src_stride_arr,
625
dst_ptr, dst_stride_arr,count,stride_levels,
628
POSTPROCESS_STRIDED(tmp_count);
631
armci_profile_stop_strided(ARMCI_PROF_PUTS);
635
ARMCI_PR_DBG("exit",proc);
642
int PARMCI_PutS_flag(
643
void* src_ptr, /* pointer to 1st segment at source */
644
int src_stride_arr[], /* array of strides at source */
645
void* dst_ptr, /* pointer to 1st segment at destination */
646
int dst_stride_arr[], /* array of strides at destination */
647
int count[], /* number of units at each stride level,
649
int stride_levels, /* number of stride levels */
650
int *flag, /* pointer to remote flag */
651
int val, /* value to set flag upon completion of
653
int proc /* remote process(or) ID */
657
/* Put local data on remote processor */
658
PARMCI_PutS(src_ptr, src_stride_arr, dst_ptr, dst_stride_arr,
659
count, stride_levels, proc);
661
/* Send signal to remote processor that data transfer has
664
PARMCI_Put(&val, flag, bytes, proc);
669
int PARMCI_Put_flag(void *src, void* dst,int bytes,int *f,int v,int proc) {
670
return PARMCI_PutS_flag(src, NULL, dst, NULL, &bytes, 0, f, v, proc);
674
int PARMCI_PutS_flag_dir(void *src_ptr, int src_stride_arr[],
675
void* dst_ptr, int dst_stride_arr[],
676
int seg_count[], int stride_levels,
677
int *flag, int val, int proc) {
678
return PARMCI_PutS_flag(src_ptr, src_stride_arr,dst_ptr,dst_stride_arr,
679
seg_count, stride_levels, flag, val, proc);
683
int PARMCI_GetS( void *src_ptr, /* pointer to 1st segment at source*/
684
int src_stride_arr[], /* array of strides at source */
685
void* dst_ptr, /* 1st segment at destination*/
686
int dst_stride_arr[], /* array of strides at destination */
687
int seg_count[], /* number of segments at each stride
688
levels: count[0]=bytes*/
689
int stride_levels, /* number of stride levels */
690
int proc /* remote process(or) ID */
694
int *count=seg_count, tmp_count=0;
695
ARMCI_PR_DBG("enter",proc);
697
#ifdef ARMCI_ONESIDED_GETS_USES_NBGETS
698
armci_hdl_t nb_handle;
699
ARMCI_INIT_HANDLE(&nb_handle);
700
PARMCI_NbGetS(src_ptr, src_stride_arr, dst_ptr, dst_stride_arr, seg_count,
701
stride_levels, proc, &nb_handle);
702
rc = PARMCI_Wait(&nb_handle);
704
if(src_ptr == NULL || dst_ptr == NULL) return FAIL;
705
if(seg_count[0]<0)return FAIL3;
706
if(stride_levels <0 || stride_levels > MAX_STRIDE_LEVEL) return FAIL4;
707
if(proc<0||proc>=armci_nproc){printf("\n%d:%s:proc=%d",armci_me,__FUNCTION__,proc);fflush(stdout);return FAIL5;}
709
ORDER(GET,proc); /* ensure ordering */
710
PREPROCESS_STRIDED(tmp_count);
712
direct=SAMECLUSNODE(proc);
714
# ifdef CRAY_REGISTER_ARMCI_MALLOC
715
if(!direct && armci_onesided_direct_get_enabled)
718
onesided_hnd_t cp_hnd;
719
cpGetOnesidedHandle(&cp_hnd);
720
cos_mdesc_t local_mdh, remote_mdh, *mdh = NULL;
721
int node = armci_clus_id(proc);
723
if(stride_levels == 0)
726
// if a strided put/acc is outstanding to proc, then we need to ensure that is completed
727
// we allow the maximum possible overlap for strided puts/acc. that means they are not fully blocking
728
// calls. they are however, guaranteed to be complete prior to another request being sent.
729
DO_FENCE(proc,ONESIDED_GET);
732
cos_desc_t *comm_desc = &__global_1sided_direct_get_comm_desc;
733
// printf("[cp %d]: direct remote get - src=%p; dst=%p; tgt_rank=%d; tgt_node=%d\n",armci_me,src_ptr,dst_ptr,proc,node);
736
armci_onesided_search_remote_mdh_list(src_ptr, proc, &remote_mdh);
738
// register local memory -- will use UDREG if ONESIDED_USE_UDREG is active
739
cpMemRegister(dst_ptr, count[0], &local_mdh);
740
// onesided_mem_register(cp_hnd, src_ptr, count[0], NULL, &local_mdh);
742
// initialize onesided communication descriptor
743
onesided_desc_init(cp_hnd, &local_mdh, &remote_mdh, 0, comm_desc);
746
onesided_get_nb(comm_desc);
748
// complete get [locally]
749
onesided_wait(comm_desc);
751
// deregister memory -- if we were using the dreg routines, we would let the
752
// dreg memory do this for us "on demand" = lazy mem deregistration
753
cpMemDeregister(&local_mdh);
754
// onesided_mem_deregister(cp_hnd, &local_mdh);
762
DO_FENCE(proc,ONESIDED_GET);
765
long src_idx; /* index offset of the current block position to src_ptr */
766
long dst_idx; /* index offset of the current block position to dst_ptr */
767
int n1dim; /* number of 1-dimensional blocks to xfer */
768
int bvalue[MAX_STRIDE_LEVEL];
769
int bunit[MAX_STRIDE_LEVEL];
770
cos_desc_t cds[MAX_OUTSTANDING_ONESIDED_GETS];
771
uint64_t src_addr, dst_addr;
774
for(i=1; i<=stride_levels; i++) {
778
bvalue[0] = 0; bvalue[1] = 0; bunit[0] = 1; bunit[1] = 1;
779
for(i=2; i<=stride_levels; i++)
782
bunit[i] = bunit[i-1] * count[i-1];
785
for(i=0,id=0; i<n1dim; i++)
789
for(j=1; j<=stride_levels; j++)
791
src_idx += bvalue[j] * src_stride_arr[j-1];
792
dst_idx += bvalue[j] * dst_stride_arr[j-1];
793
if((i+1) % bunit[j] == 0) bvalue[j]++;
794
if(bvalue[j] > (count[j]-1)) bvalue[j] = 0;
797
src_addr = (uint64_t) ((char *) src_ptr + src_idx);
798
dst_addr = (uint64_t) ((char *) dst_ptr + dst_idx);
800
if(i >= MAX_OUTSTANDING_ONESIDED_GETS)
802
if(id == MAX_OUTSTANDING_ONESIDED_GETS) id=0;
803
onesided_wait(&cds[id]);
804
cpMemDeregister(&cds[id].local_mdesc);
807
armci_onesided_search_remote_mdh_list((void*)src_addr, proc, &remote_mdh);
808
cpMemRegister((void*)dst_addr, count[0], &local_mdh);
809
onesided_desc_init(cp_hnd, &local_mdh, &remote_mdh, 0, &cds[id]);
810
onesided_get_nb(&cds[id]);
814
// finish up any outstanding requests
816
if(MAX_OUTSTANDING_ONESIDED_GETS < n1dim) count = MAX_OUTSTANDING_ONESIDED_GETS;
817
for(i=0; i<count; i++)
821
onesided_wait(&cds[i]);
822
cpMemDeregister(&cds[i].local_mdesc);
830
} // end if(!direct && armci_onesided_direct_get_enabled)
831
# endif // CRAY_REGISTER_ARMCI_MALLOC
834
DO_FENCE(proc,SERVER_GET);
835
rc = armci_pack_strided(GET, NULL, proc, src_ptr, src_stride_arr,
836
dst_ptr,dst_stride_arr,count,stride_levels,
839
if(!SAMECLUSNODE(proc))DO_FENCE(proc,DIRECT_GET);
840
rc = armci_op_strided(GET, NULL, proc, src_ptr, src_stride_arr, dst_ptr,
841
dst_stride_arr,count, stride_levels,0,NULL);
844
POSTPROCESS_STRIDED(tmp_count);
847
armci_profile_stop_strided(ARMCI_PROF_GETS);
849
#endif // ifdef/else ARMCI_ONESIDED_GETS_USES_NBGETS
852
ARMCI_PR_DBG("exit",proc);
858
int PARMCI_AccS( int optype, /* operation */
859
void *scale, /* scale factor x += scale*y */
860
void *src_ptr, /* pointer to 1st segment at source*/
861
int src_stride_arr[], /* array of strides at source */
862
void* dst_ptr, /* 1st segment at destination*/
863
int dst_stride_arr[], /* array of strides at destination */
864
int seg_count[], /* number of segments at each stride
865
levels: count[0]=bytes*/
866
int stride_levels, /* number of stride levels */
867
int proc /* remote process(or) ID */
871
int *count=seg_count, tmp_count=0;
873
ARMCI_PR_DBG("enter",proc);
874
if(src_ptr == NULL || dst_ptr == NULL) return FAIL;
875
if(src_stride_arr == NULL || dst_stride_arr ==NULL) return FAIL2;
876
if(count[0]<0)return FAIL3;
877
if(stride_levels <0 || stride_levels > MAX_STRIDE_LEVEL) return FAIL4;
878
if(proc<0)return FAIL5;
880
ORDER(optype,proc); /* ensure ordering */
881
PREPROCESS_STRIDED(tmp_count);
884
direct=SAMECLUSNODE(proc);
887
# if defined(ACC_COPY) && !defined(ACC_SMP)
888
if(armci_me != proc) direct=0;
891
rc = armci_op_strided(optype,scale, proc, src_ptr, src_stride_arr,dst_ptr,
892
dst_stride_arr, count, stride_levels,1,NULL);
894
DO_FENCE(proc,SERVER_PUT);
895
rc = armci_pack_strided(optype,scale,proc,src_ptr, src_stride_arr,dst_ptr,
896
dst_stride_arr,count,stride_levels,NULL,-1,-1,-1,NULL);
898
POSTPROCESS_STRIDED(tmp_count);
899
ARMCI_PR_DBG("exit",proc);
904
int PARMCI_Acc(int optype, void *scale, void *src, void* dst, int bytes, int proc) {
906
rc = PARMCI_AccS(optype, scale, src, NULL, dst, NULL, &bytes, 0, proc);
911
whatever original put and get functions were here have been
912
replaced with the proper ones from the main armci branch.
913
the old functions were entirely responsible for causing the
914
test_vector_acc test to fail in test.x
917
int PARMCI_Put(void *src, void* dst, int bytes, int proc) {
919
//ARMCI_PROFILE_START_STRIDED(&bytes, 0, proc, ARMCI_PROF_PUT);
920
rc = PARMCI_PutS(src, NULL, dst, NULL, &bytes, 0, proc);
921
//ARMCI_PROFILE_STOP_STRIDED(ARMCI_PROF_PUT);
926
int PARMCI_Get(void *src, void* dst, int bytes, int proc) {
928
//ARMCI_PROFILE_START_STRIDED(&bytes, 0, proc, ARMCI_PROF_GET);
931
memcpy(dst,src,bytes);
933
rc = PARMCI_GetS(src, NULL, dst, NULL, &bytes, 0, proc);
935
//ARMCI_PROFILE_STOP_STRIDED(ARMCI_PROF_GET);
944
# define armci_read_strided1 armci_read_strided
945
# define armci_write_strided1 armci_write_strided
947
# define armci_read_strided2 armci_read_strided
948
# define armci_write_strided2 armci_write_strided
951
void armci_write_strided1(void *ptr, int stride_levels, int stride_arr[],
952
int count[], char *buf)
955
long idx; /* index offset of current block position to ptr */
956
int n1dim; /* number of 1 dim block */
957
int bvalue[MAX_STRIDE_LEVEL], bunit[MAX_STRIDE_LEVEL];
958
int bytes = count[0];
959
ARMCI_PR_DBG("enter",stride_levels);
961
/* number of n-element of the first dimension */
963
for(i=1; i<=stride_levels; i++)
966
/* calculate the destination indices */
967
bvalue[0] = 0; bvalue[1] = 0; bunit[0] = 1; bunit[1] = 1;
968
for(i=2; i<=stride_levels; i++) {
970
bunit[i] = bunit[i-1] * count[i-1];
973
for(i=0; i<n1dim; i++) {
975
for(j=1; j<=stride_levels; j++) {
976
idx += bvalue[j] * stride_arr[j-1];
977
if((i+1) % bunit[j] == 0) bvalue[j]++;
978
if(bvalue[j] > (count[j]-1)) bvalue[j] = 0;
981
armci_copy( ((char*)ptr)+idx, buf, bytes);
984
ARMCI_PR_DBG("exit",stride_levels);
988
void armci_write_strided2(void *ptr, int stride_levels, int stride_arr[],
989
int count[], char *buf)
992
int total; /* number of 2 dim block */
993
int index[MAX_STRIDE_LEVEL], unit[MAX_STRIDE_LEVEL];
994
ARMCI_PR_DBG("enter",stride_levels);
996
if(stride_levels == 0){
997
armci_copy( ptr, buf, count[0]);
998
}else if (count[0]%ALIGN_SIZE || (unsigned long)ptr%ALIGN_SIZE )
999
armci_write_strided1(ptr,stride_levels, stride_arr,count,buf);
1001
int rows, ld, idx, ldd;
1004
ld = stride_arr[0]/8;
1005
switch(stride_levels){
1007
DCOPY21(&rows, count+1, ptr, &ld, buf, &idx);
1010
ldd = stride_arr[1]/stride_arr[0];
1011
DCOPY31(&rows, count+1, count+2, ptr, &ld, &ldd, buf,&idx);
1015
index[2] = 0; unit[2] = 1; total = count[2];
1016
for(j=3; j<=stride_levels; j++) {
1017
index[j] = 0; unit[j] = unit[j-1] * count[j-1];
1020
for(i=0; i<total; i++) {
1022
for(j=2; j<=stride_levels; j++) {
1023
src += index[j] * stride_arr[j-1];
1024
if(((i+1) % unit[j]) == 0) index[j]++;
1025
if(index[j] >= count[j]) index[j] = 0;
1027
DCOPY21(&rows, count+1,src, &ld, buf, &idx);
1028
buf = (char*) ((double*)buf + idx);
1032
ARMCI_PR_DBG("exit",stride_levels);
1036
void armci_read_strided1(void *ptr, int stride_levels, int stride_arr[],
1037
int count[], char *buf)
1040
long idx; /* index offset of current block position to ptr */
1041
int n1dim; /* number of 1 dim block */
1042
int bvalue[MAX_STRIDE_LEVEL], bunit[MAX_STRIDE_LEVEL];
1043
int bytes = count[0];
1045
ARMCI_PR_DBG("enter",stride_levels);
1046
/* number of n-element of the first dimension */
1048
for(i=1; i<=stride_levels; i++)
1051
/* calculate the destination indices */
1052
bvalue[0] = 0; bvalue[1] = 0; bunit[0] = 1; bunit[1] = 1;
1053
for(i=2; i<=stride_levels; i++) {
1055
bunit[i] = bunit[i-1] * count[i-1];
1058
for(i=0; i<n1dim; i++) {
1060
for(j=1; j<=stride_levels; j++) {
1061
idx += bvalue[j] * stride_arr[j-1];
1062
if((i+1) % bunit[j] == 0) bvalue[j]++;
1063
if(bvalue[j] > (count[j]-1)) bvalue[j] = 0;
1066
armci_copy(buf, ((char*)ptr)+idx,bytes);
1069
ARMCI_PR_DBG("exit",stride_levels);
1073
void armci_read_strided2(void *ptr, int stride_levels, int stride_arr[],
1074
int count[], char *buf)
1077
int total; /* number of 2 dim block */
1078
int index[MAX_STRIDE_LEVEL], unit[MAX_STRIDE_LEVEL];
1080
ARMCI_PR_DBG("enter",stride_levels);
1081
if(stride_levels == 0){
1082
armci_copy( buf, ptr, count[0]);
1083
}else if (count[0]%ALIGN_SIZE || (unsigned long)ptr%ALIGN_SIZE)
1084
armci_read_strided1(ptr,stride_levels, stride_arr,count,buf);
1086
int rows, ld, idx, ldd;
1089
ld = stride_arr[0]/8;
1090
switch(stride_levels){
1092
DCOPY12(&rows, count+1, ptr, &ld, buf, &idx);
1095
ldd = stride_arr[1]/stride_arr[0];
1096
DCOPY13(&rows, count+1, count+2, ptr, &ld, &ldd, buf,&idx);
1099
index[2] = 0; unit[2] = 1; total = count[2];
1100
for(j=3; j<=stride_levels; j++) {
1101
index[j] = 0; unit[j] = unit[j-1] * count[j-1];
1104
for(i=0; i<total; i++) {
1106
for(j=2; j<=stride_levels; j++) {
1107
src += index[j] * stride_arr[j-1];
1108
if(((i+1) % unit[j]) == 0) index[j]++;
1109
if(index[j] >= count[j]) index[j] = 0;
1111
DCOPY12(&rows, count+1,src, &ld, buf, &idx);
1112
buf = (char*) ((double*)buf + idx);
1116
ARMCI_PR_DBG("exit",stride_levels);
1121
int PARMCI_NbPutS( void *src_ptr, /* pointer to 1st segment at source*/
1122
int src_stride_arr[], /* array of strides at source */
1123
void* dst_ptr, /* pointer to 1st segment at destination*/
1124
int dst_stride_arr[], /* array of strides at destination */
1125
int seg_count[], /* number of segments at each stride
1126
levels: count[0]=bytes*/
1127
int stride_levels, /* number of stride levels */
1128
int proc, /* remote process(or) ID */
1129
armci_hdl_t* usr_hdl /* armci non-blocking call handle*/
1132
armci_ihdl_t nb_handle = (armci_ihdl_t)usr_hdl;
1133
int *count=seg_count, tmp_count=0;
1135
ARMCI_PR_DBG("enter",proc);
1136
if(src_ptr == NULL || dst_ptr == NULL) return FAIL;
1137
if(count[0]<0)return FAIL3;
1138
if(stride_levels <0 || stride_levels > MAX_STRIDE_LEVEL) return FAIL4;
1139
if(proc<0)return FAIL5;
1141
PREPROCESS_STRIDED(tmp_count);
1144
if(stride_levels)direct=SAMECLUSNODE(proc);
1145
direct=SAMECLUSNODE(proc);
1149
if(nb_handle && nb_handle->agg_flag == SET) {
1151
rc= armci_agg_save_strided_descriptor(src_ptr, src_stride_arr,
1152
dst_ptr, dst_stride_arr,
1153
count, stride_levels, proc,
1155
POSTPROCESS_STRIDED(tmp_count);
1160
UPDATE_FENCE_INFO(proc);
1162
/*set tag and op in the nb handle*/
1164
nb_handle->tag = GET_NEXT_NBTAG();
1165
nb_handle->op = PUT;
1166
nb_handle->proc= proc;
1167
nb_handle->bufid=NB_NONE;
1170
nb_handle = armci_set_implicit_handle(PUT, proc);
1174
DO_FENCE(proc,SERVER_NBPUT);
1175
rc = armci_pack_strided(PUT, NULL, proc, src_ptr, src_stride_arr,dst_ptr,
1176
dst_stride_arr, count, stride_levels,NULL,-1,-1,-1,nb_handle);
1179
if(!SAMECLUSNODE(proc))DO_FENCE(proc,DIRECT_NBPUT);
1180
rc = armci_op_strided( PUT, NULL, proc, src_ptr, src_stride_arr,
1181
dst_ptr,dst_stride_arr,count,stride_levels, 0,nb_handle);
1184
POSTPROCESS_STRIDED(tmp_count);
1185
ARMCI_PR_DBG("exit",proc);
1186
if(rc) return FAIL6;
1190
int PARMCI_NbGetS( void *src_ptr, /* pointer to 1st segment at source*/
1191
int src_stride_arr[], /* array of strides at source */
1192
void* dst_ptr, /* 1st segment at destination*/
1193
int dst_stride_arr[], /* array of strides at destination */
1194
int seg_count[], /* number of segments at each stride
1195
levels: seg_count[0]=bytes*/
1196
int stride_levels, /* number of stride levels */
1197
int proc, /* remote process(or) ID */
1198
armci_hdl_t* usr_hdl /* armci non-blocking call handle*/
1202
int *count=seg_count, tmp_count=0;
1203
armci_ihdl_t nb_handle = (armci_ihdl_t) usr_hdl;
1204
ARMCI_PR_DBG("enter",proc);
1206
if(src_ptr == NULL || dst_ptr == NULL) return FAIL;
1207
if(seg_count[0]<0) return FAIL3;
1208
if(stride_levels<0 || stride_levels > MAX_STRIDE_LEVEL) return FAIL4;
1209
if(proc<0 || proc>=armci_nproc) {
1210
printf("\n%d:%s:proc=%d",armci_me,__FUNCTION__,proc);
1215
// ORDER(GET,proc); /* ensure ordering */
1216
PREPROCESS_STRIDED(tmp_count);
1217
direct = SAMECLUSNODE(proc); // direct ==> local on node operation
1219
if(nb_handle && nb_handle->agg_flag == SET) {
1221
rc= armci_agg_save_strided_descriptor(src_ptr, src_stride_arr,
1222
dst_ptr, dst_stride_arr,
1223
count, stride_levels, proc,
1225
POSTPROCESS_STRIDED(tmp_count);
1229
// ORDER(GET,proc); ensure ordering
1230
// set tag and op in the nb handle
1232
nb_handle->tag = GET_NEXT_NBTAG();
1233
nb_handle->op = GET;
1234
nb_handle->proc= proc;
1235
nb_handle->bufid=NB_NONE;
1238
nb_handle = armci_set_implicit_handle(GET, proc);
1242
nb_handle->onesided_direct = 0;
1245
# ifdef CRAY_REGISTER_ARMCI_MALLOC
1246
if(!direct && armci_onesided_direct_get_enabled)
1248
// set up the non-blocking descriptor
1249
nb_handle->onesided_direct = 1;
1250
bzero(&nb_handle->comm_desc, MAX_OUTSTANDING_ONESIDED_GETS*sizeof(cos_desc_t));
1252
onesided_hnd_t cp_hnd;
1253
cpGetOnesidedHandle(&cp_hnd);
1254
cos_mdesc_t local_mdh, remote_mdh, *mdh = NULL;
1255
int node = armci_clus_id(proc);
1257
if(stride_levels == 0)
1260
// if a strided put/acc is outstanding to proc, then we need to ensure that is completed
1261
// we allow the maximum possible overlap for strided puts/acc. that means they are not fully blocking
1262
// calls. they are however, guaranteed to be complete prior to another request being sent.
1263
DO_FENCE(proc,ONESIDED_GET);
1266
cos_desc_t *comm_desc = &nb_handle->comm_desc[0];
1267
// printf("[cp %d]: direct remote get - src=%p; dst=%p; tgt_rank=%d; tgt_node=%d\n",armci_me,src_ptr,dst_ptr,proc,node);
1270
armci_onesided_search_remote_mdh_list(src_ptr, proc, &remote_mdh);
1272
// register local memory -- will use UDREG if ONESIDED_USE_UDREG is active
1273
cpMemRegister(dst_ptr, count[0], &local_mdh);
1274
// onesided_mem_register(cp_hnd, src_ptr, count[0], NULL, &local_mdh);
1276
// initialize onesided communication descriptor
1277
onesided_desc_init(cp_hnd, &local_mdh, &remote_mdh, 0, comm_desc);
1280
onesided_get_nb(comm_desc);
1288
DO_FENCE(proc,ONESIDED_GET);
1291
long src_idx; /* index offset of the current block position to src_ptr */
1292
long dst_idx; /* index offset of the current block position to dst_ptr */
1293
int n1dim; /* number of 1-dimensional blocks to xfer */
1294
int bunit[MAX_STRIDE_LEVEL];
1295
int bvalue[MAX_STRIDE_LEVEL];
1296
cos_desc_t *cds = nb_handle->comm_desc;
1297
uint64_t src_addr, dst_addr;
1300
for(i=1; i<=stride_levels; i++) {
1304
bvalue[0] = 0; bvalue[1] = 0; bunit[0] = 1; bunit[1] = 1;
1305
for(i=2; i<=stride_levels; i++)
1308
bunit[i] = bunit[i-1] * count[i-1];
1311
for(i=0,id=0; i<n1dim; i++)
1315
for(j=1; j<=stride_levels; j++)
1317
src_idx += bvalue[j] * src_stride_arr[j-1];
1318
dst_idx += bvalue[j] * dst_stride_arr[j-1];
1319
if((i+1) % bunit[j] == 0) bvalue[j]++;
1320
if(bvalue[j] > (count[j]-1)) bvalue[j] = 0;
1323
src_addr = (uint64_t) ((char *) src_ptr + src_idx);
1324
dst_addr = (uint64_t) ((char *) dst_ptr + dst_idx);
1327
printf("1dpass=%d of %d; src_idx=%d; dst_idx=%d; count[0]=%d\n",i,n1dim,src_idx, dst_idx,count[0]);
1330
if(i >= MAX_OUTSTANDING_ONESIDED_GETS)
1332
if(id == MAX_OUTSTANDING_ONESIDED_GETS) id=0;
1333
onesided_wait(&cds[id]);
1334
cpMemDeregister(&cds[id].local_mdesc);
1337
armci_onesided_search_remote_mdh_list((void*)src_addr, proc, &remote_mdh);
1338
cpMemRegister((void*)dst_addr, count[0], &local_mdh);
1339
onesided_desc_init(cp_hnd, &local_mdh, &remote_mdh, 0, &cds[id]);
1340
onesided_get_nb(&cds[id]);
1348
} // end if(!direct && armci_onesided_direct_get_enabled)
1349
# endif // CRAY_REGISTER_ARMCI_MALLOC
1352
DO_FENCE(proc,SERVER_NBGET);
1353
rc = armci_pack_strided(GET, NULL, proc, src_ptr, src_stride_arr,
1354
dst_ptr,dst_stride_arr,count,stride_levels,
1355
NULL,-1,-1,-1,nb_handle);
1357
// DO_FENCE(proc,DIRECT_GET);
1358
rc = armci_op_strided(GET, NULL, proc, src_ptr, src_stride_arr, dst_ptr,
1359
dst_stride_arr,count, stride_levels,0,nb_handle);
1362
POSTPROCESS_STRIDED(tmp_count);
1365
#ifdef ARMCI_PROFILE
1366
armci_profile_stop_strided(ARMCI_PROF_GETS);
1368
ARMCI_PR_DBG("exit",proc);
1369
if(rc) return FAIL6;
1374
/*
 * PARMCI_NbAccS - non-blocking strided accumulate (x += scale*y) onto
 * process `proc`.
 *
 * optype is forwarded unchanged to armci_op_strided / armci_pack_strided and
 * is also the op recorded in the handle.
 *
 * Returns FAIL for a NULL src_ptr/dst_ptr, FAIL2 for a NULL stride array,
 * FAIL3 for a negative count[0], FAIL4 for stride_levels outside
 * [0, MAX_STRIDE_LEVEL], FAIL5 for a negative proc, and FAIL6 when the
 * transfer routine left rc non-zero.
 *
 * PREPROCESS_STRIDED may temporarily modify seg_count[1];
 * POSTPROCESS_STRIDED restores it before returning.
 */
int PARMCI_NbAccS( int optype, /* operation */
1375
void *scale, /* scale factor x += scale*y */
1376
void *src_ptr, /* pointer to 1st segment at source*/
1377
int src_stride_arr[], /* array of strides at source */
1378
void* dst_ptr, /* 1st segment at destination*/
1379
int dst_stride_arr[], /* array of strides at destination */
1380
int seg_count[], /* number of segments at each stride
1381
levels: count[0]=bytes*/
1382
int stride_levels, /* number of stride levels */
1383
int proc, /* remote process(or) ID */
1384
armci_hdl_t* usr_hdl /* armci non-blocking call handle*/
1387
armci_ihdl_t nb_handle = (armci_ihdl_t)usr_hdl;
1388
int *count=seg_count, tmp_count=0;
1391
ARMCI_PR_DBG("enter",proc);
1392
if(src_ptr == NULL || dst_ptr == NULL) return FAIL;
1393
if(src_stride_arr == NULL || dst_stride_arr ==NULL) return FAIL2;
1394
if(count[0]<0)return FAIL3;
1395
if(stride_levels <0 || stride_levels > MAX_STRIDE_LEVEL) return FAIL4;
1396
if(proc<0)return FAIL5;
1398
UPDATE_FENCE_INFO(proc);
1399
PREPROCESS_STRIDED(tmp_count);
1402
/* direct is non-zero when proc is on the caller's cluster node */
direct=SAMECLUSNODE(proc);
1405
# if defined(ACC_COPY) && !defined(ACC_SMP)
1406
/* in this configuration only an accumulate onto the caller itself stays direct */
if(armci_me != proc) direct=0;
1409
/*set tag and op in the nb handle*/
1411
nb_handle->tag = GET_NEXT_NBTAG();
1412
nb_handle->op = optype;
1413
nb_handle->proc= proc;
1414
nb_handle->bufid=NB_NONE;
1417
nb_handle = armci_set_implicit_handle(optype, proc);
1421
/* NOTE(review): this call passes NULL instead of nb_handle as the last argument */
rc = armci_op_strided(optype,scale, proc, src_ptr, src_stride_arr,dst_ptr,
1422
dst_stride_arr, count, stride_levels,1,NULL);
1425
DO_FENCE(proc,SERVER_NBPUT);
1426
rc = armci_pack_strided(optype,scale,proc,src_ptr, src_stride_arr,dst_ptr,
1427
dst_stride_arr,count,stride_levels,NULL,-1,-1,-1,nb_handle);
1430
POSTPROCESS_STRIDED(tmp_count);
1432
ARMCI_PR_DBG("exit",proc);
1433
if(rc) return FAIL6;
1438
#if !defined(ACC_COPY)&&!defined(CRAY_YMP)&&!defined(CYGNUS)&&!defined(CYGWIN) &&!defined(BGML)
1442
/*
 * INIT_NB_HANDLE(nb,o,p) - prepare handle nb for operation o on process p.
 * When nb is non-NULL its op, proc and bufid (NB_NONE) fields are set;
 * otherwise nb is replaced by the handle returned from
 * armci_set_implicit_handle(o, p) and that handle's tag is set to 0.
 * The expansion is a bare if/else, so use it only as a complete statement.
 */
#define INIT_NB_HANDLE(nb,o,p) if(nb){\
1444
(nb)->op = (o); (nb)->proc= (p);\
1445
(nb)->bufid=NB_NONE;}\
1446
else { (nb)=armci_set_implicit_handle(o, p); (nb)->tag=0; }
1448
/*
 * set_nbhandle - store in *nbh the internal handle to use for a request:
 * either the caller's nb_handle cast to armci_ihdl_t, or an implicit handle
 * obtained from armci_set_implicit_handle(op, proc).
 * NOTE(review): presumably the choice depends on whether nb_handle is
 * non-NULL -- confirm.
 */
void set_nbhandle(armci_ihdl_t *nbh, armci_hdl_t *nb_handle, int op,
1453
*nbh=(armci_ihdl_t)nb_handle;
1457
*nbh=armci_set_implicit_handle(op, proc);
1462
/*
 * PARMCI_NbPut - non-blocking contiguous put of `bytes` bytes from src to
 * dst on process `proc`, tracked through the optional handle `uhandle`.
 *
 * Returns FAIL when src or dst is NULL.
 *
 * For a handle with agg_flag == SET the request is either completed at once
 * with armci_copy (target on the caller's cluster node) or recorded with
 * armci_agg_save_descriptor.  The remaining paths visible here use
 * armci_copy, ARMCI_NB_PUT (when that macro is defined) or PARMCI_NbPutS
 * with zero stride levels.
 */
int PARMCI_NbPut(void *src, void* dst, int bytes, int proc,armci_hdl_t* uhandle)
1466
armci_ihdl_t nb_handle = (armci_ihdl_t)uhandle;
1467
ARMCI_PR_DBG("enter",proc);
1469
if(src == NULL || dst == NULL) return FAIL;
1471
/* direct is non-zero when proc is on the caller's cluster node */
direct =SAMECLUSNODE(proc);
1474
if(nb_handle && nb_handle->agg_flag == SET) {
1475
if(direct) { armci_copy(src,dst,bytes); rc=0; }
1477
rc=armci_agg_save_descriptor(src,dst,bytes,proc,PUT,0,nb_handle);
1482
/*armci_wait needs proc to compute direct*/
1483
INIT_NB_HANDLE(nb_handle,PUT,proc);
1484
armci_copy(src,dst,bytes);
1488
/* contiguous transfer expressed as a strided put with 0 stride levels */
rc=PARMCI_NbPutS(src, NULL,dst,NULL, &bytes,0,proc,uhandle);
1490
# ifdef ARMCI_NB_PUT
1491
INIT_NB_HANDLE(nb_handle,PUT,proc);
1492
UPDATE_FENCE_STATE(proc, PUT, 1);
1493
ARMCI_NB_PUT(src, dst, bytes, proc, &nb_handle->cmpl_info);
1495
rc=PARMCI_NbPutS(src, NULL,dst,NULL, &bytes,0,proc,uhandle);
1500
ARMCI_PR_DBG("exit",proc);
1505
/*
 * PARMCI_NbGet - non-blocking contiguous get of `bytes` bytes from src on
 * process `proc` into dst, tracked through the optional handle `uhandle`.
 *
 * Returns FAIL when src or dst is NULL.
 *
 * For a handle with agg_flag == SET the request is either completed at once
 * with armci_copy (source on the caller's cluster node) or recorded with
 * armci_agg_save_descriptor.  The remaining paths visible here use
 * armci_copy, ARMCI_NB_GET (when that macro is defined) or PARMCI_NbGetS
 * with zero stride levels.
 */
int PARMCI_NbGet(void *src, void* dst, int bytes, int proc,armci_hdl_t* uhandle)
1509
armci_ihdl_t nb_handle = (armci_ihdl_t)uhandle;
1510
ARMCI_PR_DBG("enter",proc);
1512
if(src == NULL || dst == NULL) return FAIL;
1514
/* direct is non-zero when proc is on the caller's cluster node */
direct =SAMECLUSNODE(proc);
1516
if(nb_handle && nb_handle->agg_flag == SET) {
1517
if(direct) { armci_copy(src,dst,bytes); rc=0; }
1519
rc=armci_agg_save_descriptor(src,dst,bytes,proc,GET,0,nb_handle);
1524
/*armci_wait needs proc to compute direct*/
1525
/* record the operation as GET, matching the other paths of this routine */
INIT_NB_HANDLE(nb_handle,GET,proc);
1526
armci_copy(src,dst,bytes);
1530
/* contiguous transfer expressed as a strided get with 0 stride levels */
rc=PARMCI_NbGetS(src, NULL,dst,NULL, &bytes,0,proc,uhandle);
1532
# ifdef ARMCI_NB_GET
1533
/*set tag and op in the nb handle*/
1534
INIT_NB_HANDLE(nb_handle,GET,proc);
1536
ARMCI_NB_GET(src, dst, bytes, proc, &nb_handle->cmpl_info);
1538
rc=PARMCI_NbGetS(src, NULL,dst,NULL, &bytes,0,proc,uhandle);
1542
ARMCI_PR_DBG("exit",proc);
1547
/*
 * _armci_rem_value - remote transfer of a single value between src and dst
 * for process `proc`; the PARMCI_PutValue* and PARMCI_GetValue* routines in
 * this file call it with op PUT or GET when the target is not on the
 * caller's cluster node.
 *
 * Depending on the build configuration the transfer goes through
 * armci_rem_strided (REMOTE_OP without QUADRICS; a non-zero return is fatal
 * via armci_die), BG1S_Memput/BG1S_Memget, or armci_put/armci_get.  On LAPI
 * and Elan builds the routine then waits for outstanding gets/puts when
 * proc is not the caller.
 */
static void _armci_rem_value(int op, void *src, void *dst, int proc,
1550
int armci_th_idx = ARMCI_THREAD_IDX;
1552
ORDER(op,proc); /* ensure ordering */
1554
#if defined(REMOTE_OP) && !defined(QUADRICS)
1555
/* single contiguous segment: NULL stride arrays, 0 stride levels, no handle */
rc = armci_rem_strided(op, NULL, proc, src, NULL, dst, NULL,
1556
&bytes, 0, NULL, 0, NULL);
1557
if(rc) armci_die("ARMCI_Value: armci_rem_strided incomplete", FAIL6);
1561
UPDATE_FENCE_STATE(proc, PUT, 1);
1563
SET_COUNTER(ack_cntr[armci_th_idx], 1);
1566
/* fprintf(stderr,"bytes: %d\n",bytes); */
1567
/* this call is blocking, so local count is fine */
1570
BGML_Callback_t cb_wait={wait_callback, &count};
1571
BG1S_Memput(&req, proc, src, 0, dst, bytes, &cb_wait, 1);
1575
armci_put(src, dst, bytes, proc);
1580
SET_COUNTER(get_cntr[armci_th_idx], 1);
1583
/* fprintf(stderr,"before memget\n"); */
1586
BGML_Callback_t cb_wait={wait_callback, &count};
1587
BG1S_Memget(&req, proc, dst, 0, src, bytes, &cb_wait, 1);
1591
armci_get(src, dst, bytes, proc);
1595
/* deal with non-blocking loads and stores */
1596
# if defined(LAPI) || defined(_ELAN_PUTGET_H)
1597
if(proc != armci_me){
1599
WAIT_FOR_GETS; /* wait for data arrival */
1601
WAIT_FOR_PUTS; /* data must be copied out*/
1608
/*
 * _armci_nb_rem_value - non-blocking remote value put/get operation.
 *
 * Called by the PARMCI_NbPutValue* routines and PARMCI_NbGetValue with op
 * PUT or GET when the target is not on the caller's cluster node.
 *
 * A handle with agg_flag == SET only has the descriptor saved through
 * armci_agg_save_descriptor.  Otherwise the handle's tag, proc and bufid are
 * filled in (or an implicit handle is obtained from
 * armci_set_implicit_handle) and the transfer is issued through
 * armci_rem_strided (REMOTE_OP without QUADRICS; a non-zero return is fatal
 * via armci_die) or armci_put/armci_get.  On LAPI and Elan builds the
 * routine then waits for outstanding gets/puts when proc is not the caller.
 */
static void _armci_nb_rem_value(int op, void *src, void *dst, int proc,
1610
int bytes, armci_ihdl_t nb_handle) {
1612
int armci_th_idx = ARMCI_THREAD_IDX;
1614
if(nb_handle && nb_handle->agg_flag == SET) {
1616
(void)armci_agg_save_descriptor(src,dst,bytes,proc,op,pv,nb_handle);
1620
if(op==PUT) UPDATE_FENCE_INFO(proc);
1622
/*set tag and op in the nb handle*/
1624
nb_handle->tag = GET_NEXT_NBTAG();
1626
nb_handle->proc= proc;
1627
nb_handle->bufid=NB_NONE;
1630
nb_handle = armci_set_implicit_handle(op, proc);
1633
#if defined(REMOTE_OP) && !defined(QUADRICS)
1634
/* single contiguous segment: NULL stride arrays, 0 stride levels */
rc = armci_rem_strided(op, NULL, proc, src, NULL, dst, NULL,
1635
&bytes, 0, NULL, 0, nb_handle);
1636
if(rc) armci_die("ARMCI_Value: armci_rem_strided incomplete", FAIL6);
1640
UPDATE_FENCE_STATE(proc, PUT, 1);
1642
SET_COUNTER(ack_cntr[armci_th_idx], 1);
1644
armci_put(src, dst, bytes, proc);
1648
SET_COUNTER(get_cntr[armci_th_idx], 1);
1650
armci_get(src, dst, bytes, proc);
1653
/* deal with non-blocking loads and stores */
1654
# if defined(LAPI) || defined(_ELAN_PUTGET_H)
1659
if(proc != armci_me){
1661
WAIT_FOR_GETS; /* wait for data arrival */
1663
WAIT_FOR_PUTS; /* data must be copied out*/
1672
/*
 * CHK_ERR(dst, proc) - argument check whose messages name PARMCI_PutValue:
 * calls armci_die() when dst is NULL or when proc is negative.
 *
 * The checks are wrapped in do { } while(0) so the macro expands to exactly
 * one statement and stays correct inside an unbraced if/else; arguments are
 * parenthesised so expressions can be passed safely.  Invoke it as
 * `CHK_ERR(dst, proc);`.
 */
#define CHK_ERR(dst, proc) \
    do { \
        if((dst)==NULL) armci_die("PARMCI_PutValue: NULL pointer passed",FAIL); \
        if((proc)<0) armci_die("PARMCI_PutValue: Invalid process rank", (proc)); \
    } while(0)
1676
/*
 * CHK_ERR_GET(src, dst, proc, bytes) - argument check for the get-value
 * routines: calls armci_die() when src or dst is NULL, when proc is
 * negative, or when bytes is negative.
 *
 * The checks are wrapped in do { } while(0) so the macro expands to exactly
 * one statement and stays correct inside an unbraced if/else; arguments are
 * parenthesised so expressions can be passed safely.  Invoke it as
 * `CHK_ERR_GET(src, dst, proc, bytes);`.
 */
#define CHK_ERR_GET(src, dst, proc, bytes) \
    do { \
        if((src)==NULL || (dst)==NULL) armci_die("PARMCI_GetValue: NULL pointer passed",FAIL); \
        if((proc)<0) armci_die("PARMCI_GetValue: Invalid process rank", (proc)); \
        if((bytes)<0) armci_die("PARMCI_GetValue: Invalid size", (bytes)); \
    } while(0)
1682
* Register-Originated Put.
1684
/*
 * PARMCI_PutValueInt - store the int `src` at address `dst` on process
 * `proc`: a direct store when proc is on the caller's cluster node
 * (SAMECLUSNODE), otherwise a PUT of sizeof(int) bytes through
 * _armci_rem_value.
 */
int PARMCI_PutValueInt(int src, void *dst, int proc)
1687
if( SAMECLUSNODE(proc) ) *(int *)dst = src;
1688
else _armci_rem_value(PUT, &src, dst, proc, sizeof(int));
1692
/*
 * PARMCI_PutValueLong - store the long `src` at address `dst` on process
 * `proc`: a direct store when proc is on the caller's cluster node
 * (SAMECLUSNODE), otherwise a PUT of sizeof(long) bytes through
 * _armci_rem_value.
 */
int PARMCI_PutValueLong(long src, void *dst, int proc)
1695
if( SAMECLUSNODE(proc) ) *(long *)dst = src;
1696
else _armci_rem_value(PUT, &src, dst, proc, sizeof(long));
1700
/*
 * PARMCI_PutValueFloat - store the float `src` at address `dst` on process
 * `proc`: a direct store when proc is on the caller's cluster node
 * (SAMECLUSNODE), otherwise a PUT of sizeof(float) bytes through
 * _armci_rem_value.
 */
int PARMCI_PutValueFloat(float src, void *dst, int proc)
1703
if( SAMECLUSNODE(proc) ) *(float *)dst = src;
1704
else _armci_rem_value(PUT, &src, dst, proc, sizeof(float));
1708
/*
 * PARMCI_PutValueDouble - store the double `src` at address `dst` on process
 * `proc`: a direct store when proc is on the caller's cluster node
 * (SAMECLUSNODE), otherwise a PUT of sizeof(double) bytes through
 * _armci_rem_value.
 */
int PARMCI_PutValueDouble(double src, void *dst, int proc)
1711
if( SAMECLUSNODE(proc) ) *(double *)dst = src;
1712
else _armci_rem_value(PUT, &src, dst, proc, sizeof(double));
1717
* Non-Blocking register-originated put.
1719
/*
 * PARMCI_NbPutValueInt - non-blocking store of the int `src` at address
 * `dst` on process `proc`: a direct store when proc is on the caller's
 * cluster node (SAMECLUSNODE), otherwise a PUT of sizeof(int) bytes through
 * _armci_nb_rem_value using the handle `usr_hdl`.
 */
int PARMCI_NbPutValueInt(int src, void *dst, int proc, armci_hdl_t* usr_hdl)
1722
if( SAMECLUSNODE(proc) ) *(int *)dst = src;
1723
else _armci_nb_rem_value(PUT,&src,dst,proc,sizeof(int),(armci_ihdl_t)usr_hdl);
1727
/*
 * PARMCI_NbPutValueLong - non-blocking store of the long `src` at address
 * `dst` on process `proc`: a direct store when proc is on the caller's
 * cluster node (SAMECLUSNODE), otherwise a PUT of sizeof(long) bytes through
 * _armci_nb_rem_value using the handle `usr_hdl`.
 */
int PARMCI_NbPutValueLong(long src, void *dst, int proc, armci_hdl_t* usr_hdl)
1730
if( SAMECLUSNODE(proc) ) *(long *)dst = src;
1731
else _armci_nb_rem_value(PUT,&src,dst,proc,sizeof(long),(armci_ihdl_t)usr_hdl);
1735
/*
 * PARMCI_NbPutValueFloat - non-blocking store of the float `src` at address
 * `dst` on process `proc`: a direct store when proc is on the caller's
 * cluster node (SAMECLUSNODE), otherwise a PUT of sizeof(float) bytes
 * through _armci_nb_rem_value using the handle `usr_hdl`.
 */
int PARMCI_NbPutValueFloat(float src, void *dst, int proc, armci_hdl_t* usr_hdl)
1738
if( SAMECLUSNODE(proc) ) *(float *)dst = src;
1739
else _armci_nb_rem_value(PUT,&src,dst,proc,sizeof(float),(armci_ihdl_t)usr_hdl);
1743
/*
 * PARMCI_NbPutValueDouble - non-blocking store of the double `src` at
 * address `dst` on process `proc`: a direct store when proc is on the
 * caller's cluster node (SAMECLUSNODE), otherwise a PUT of sizeof(double)
 * bytes through _armci_nb_rem_value using the handle `usr_hdl`.
 */
int PARMCI_NbPutValueDouble(double src, void *dst, int proc, armci_hdl_t* usr_hdl)
1746
if( SAMECLUSNODE(proc) ) *(double *)dst = src;
1747
else _armci_nb_rem_value(PUT,&src,dst,proc,sizeof(double),(armci_ihdl_t)usr_hdl);
1753
* Register-Originated Get.
1755
/*
 * PARMCI_GetValueInt - read the int stored at `src` on process `proc`.
 * When proc is on the caller's cluster node (SAMECLUSNODE) the value is
 * read and returned directly; otherwise sizeof(int) bytes are fetched into
 * `dst` with a GET through _armci_rem_value.
 */
int PARMCI_GetValueInt(void *src, int proc)
1758
if( SAMECLUSNODE(proc) ) return *(int *)src;
1759
else _armci_rem_value(GET, src, &dst, proc, sizeof(int));
1763
/*
 * PARMCI_GetValueLong - read the long stored at `src` on process `proc`.
 * When proc is on the caller's cluster node (SAMECLUSNODE) the value is
 * read and returned directly; otherwise sizeof(long) bytes are fetched into
 * `dst` with a GET through _armci_rem_value.
 */
long PARMCI_GetValueLong(void *src, int proc)
1766
if( SAMECLUSNODE(proc) ) return *(long *)src;
1767
else _armci_rem_value(GET, src, &dst, proc, sizeof(long));
1771
/*
 * PARMCI_GetValueFloat - read the float stored at `src` on process `proc`.
 * When proc is on the caller's cluster node (SAMECLUSNODE) the value is
 * read and returned directly; otherwise sizeof(float) bytes are fetched
 * into `dst` with a GET through _armci_rem_value.
 */
float PARMCI_GetValueFloat(void *src, int proc)
1774
if( SAMECLUSNODE(proc) ) return *(float *)src;
1775
else _armci_rem_value(GET, src, &dst, proc, sizeof(float));
1779
/*
 * PARMCI_GetValueDouble - read the double stored at `src` on process `proc`.
 * When proc is on the caller's cluster node (SAMECLUSNODE) the value is
 * read and returned directly; otherwise sizeof(double) bytes are fetched
 * into `dst` with a GET through _armci_rem_value.
 */
double PARMCI_GetValueDouble(void *src, int proc)
1782
if( SAMECLUSNODE(proc) ) return *(double *)src;
1783
else _armci_rem_value(GET, src, &dst, proc, sizeof(double));
1791
* Register-Originated Get.
1793
/*
 * PARMCI_GetValue - copy `bytes` bytes from `src` on process `proc` into
 * the local buffer `dst`.
 * CHK_ERR_GET aborts through armci_die on a NULL src/dst, a negative proc
 * or a negative bytes.  A source on the caller's cluster node
 * (SAMECLUSNODE) is copied with armci_copy; otherwise the data is fetched
 * with a GET through _armci_rem_value.
 */
int PARMCI_GetValue(void *src, void *dst, int proc, int bytes)
1795
CHK_ERR_GET(src, dst, proc, bytes);
1796
if( SAMECLUSNODE(proc) ) { armci_copy(src, dst, bytes); }
1797
else _armci_rem_value(GET, src, dst, proc, bytes);
1802
* Non-Blocking register-originated get.
1804
/*
 * PARMCI_NbGetValue - non-blocking copy of `bytes` bytes from `src` on
 * process `proc` into the local buffer `dst`, tracked through `usr_hdl`.
 * CHK_ERR_GET aborts through armci_die on a NULL src/dst, a negative proc
 * or a negative bytes.  A source on the caller's cluster node
 * (SAMECLUSNODE) is copied with armci_copy; otherwise the data is fetched
 * with a GET through _armci_nb_rem_value.
 */
int PARMCI_NbGetValue(void *src, void *dst, int proc, int bytes, armci_hdl_t* usr_hdl)
1806
CHK_ERR_GET(src, dst, proc, bytes);
1807
if( SAMECLUSNODE(proc) ) { armci_copy(src, dst, bytes); }
1808
else _armci_nb_rem_value(GET, src, dst, proc, bytes, (armci_ihdl_t)usr_hdl);