1
/* $Id: copy.h,v 1.86.2.6 2007-08-29 17:32:32 manoj Exp $ */
15
#if 1 || defined(HITACHI) || defined(CRAY_T3E) || defined(CRAY_XT) || defined(BGML)
18
#if defined(LINUX64) && defined(SGIALTIX) && defined(MPI)
19
/* fastbcopy from Wayne Vieira and Gerardo Cisneros */
21
/*
 * Route armci_copy, memcpy and bcopy through _fastbcopy.  _fastbcopy is
 * called with (src, dst, len), so the memcpy wrapper swaps its first two
 * arguments while armci_copy and bcopy pass theirs through unchanged.
 */
#define armci_copy(src, dst, len) _fastbcopy(src, dst, len)
#define memcpy(dst, src, len)     _fastbcopy(src, dst, len)
#define bcopy(src, dst, len)      _fastbcopy(src, dst, len)
27
# define EXTERN extern
31
/* memcpy1 is a plain alias for _VEC_memcpy. */
# define memcpy1 _VEC_memcpy
32
/* armci_copy1 takes the source first and hands its arguments to
   _VEC_memcpy in (dst, src, n) order. */
# define armci_copy1(src,dst,n) _VEC_memcpy((dst),(src),(n))
33
/* Flag that the mpisx-based MEM_FENCE variant in this header sets to 1 and
   passes by address to mpisx_syncset0_long. */
EXTERN long long _armci_vec_sync_flag;
36
#if defined(SGI) || defined(FUJITSU) || defined(HPUX) || defined(SOLARIS) || defined (DECOSF) || defined(__ia64__) || defined(__crayx1)
40
#if defined(NB_NONCONT) && !defined(CRAY_SHMEM) && !defined(QUADRICS) && !defined(PORTALS)
41
#error NB_NONCONT is only available on CRAY_SHMEM,QUADRICS and PORTALS
44
#if defined(SHMEM_HANDLE_SUPPORTED) && !defined(CRAY_SHMEM)
45
#error SHMEM_HANDLE_SUPPORTED should not be defined on a non CRAY_SHMEM network
48
#if defined(MEMCPY) && !defined(armci_copy)
50
#define armci_copy(src, dst, n) BGLML_memcpy((dst), (src), (n))
52
/* Default contiguous copy: armci_copy takes the source first, so the first
   two arguments are swapped when they are handed to memcpy(dst, src, n).
   The expansion is the memcpy call itself, so its result is available. */
#  define armci_copy(src, dst, n)  memcpy( (dst), (src), (n) )
57
/* Memory fence built from the mpisx primitives: call mpisx_clear_cache(),
   set _armci_vec_sync_flag to 1, then pass the flag's address to
   mpisx_syncset0_long().  Expands to a brace-enclosed block. */
#  define MEM_FENCE                                        \
   {                                                       \
       mpisx_clear_cache();                                \
       _armci_vec_sync_flag = 1;                           \
       mpisx_syncset0_long(&_armci_vec_sync_flag);         \
   }
61
# define MEM_FENCE asm ("mb")
64
#if defined(NEED_MEM_SYNC)
66
# define MEM_FENCE {int _dummy=1; _clear_lock((int *)&_dummy,0); }
67
# elif defined(__ia64)
68
# if defined(__GNUC__) && !defined (__INTEL_COMPILER)
69
# define MEM_FENCE __asm__ __volatile__ ("mf" ::: "memory");
70
# else /* Intel Compiler */
71
/* Barrier routine that MEM_FENCE expands to in the non-GCC ("Intel
   Compiler") ia64 branch.  Declared with (void) so this is a real
   prototype and a call with arguments is rejected at compile time. */
extern void _armci_ia64_mb(void);
72
# define MEM_FENCE _armci_ia64_mb();
74
# elif defined(LINUX) && defined(__GNUC__) && defined(__ppc__)
76
__asm__ __volatile__ ("isync" : : : "memory");
82
# define armci_copy(src,dst,n) \
83
do if( ((n) < THRESH1D) || ((n)%ALIGN_SIZE) || \
84
((unsigned long)(src)%ALIGN_SIZE) ||\
85
((unsigned long)(dst)%ALIGN_SIZE)) memcpy((dst),(src),(n));\
86
else{ int _bytes=(n)/sizeof(double); DCOPY1D((double*)(src),(double*)(dst),&_bytes);}\
89
# define armci_copy(src,dst,n) \
90
do if( ((n) < THRESH1D) || ((n)%ALIGN_SIZE) ) memcpy((dst), (src), (n));\
91
else{ int _bytes=(n)/sizeof(double); DCOPY1D((double*)(src),(double*)(dst),&_bytes);}\
96
/****************************** 2D Copy *******************/
100
# define DCopy2D(rows, cols, src_ptr, src_ld, dst_ptr, dst_ld){\
101
int rrows, ldd, lds, ccols;\
106
DCOPY2D(&rrows, &ccols, src_ptr, &lds,dst_ptr,&ldd);\
110
# define DCopy2D(rows, cols, src_ptr, src_ld, dst_ptr, dst_ld){\
111
int j, nbytes = sizeof(double)* rows;\
112
char *ps=src_ptr, *pd=dst_ptr;\
113
for (j = 0; j < cols; j++){\
114
armci_copy(ps, pd, nbytes);\
115
ps += sizeof(double)* src_ld;\
116
pd += sizeof(double)* dst_ld;\
122
# define ByteCopy2D(bytes, count, src_ptr, src_stride, dst_ptr,dst_stride){\
124
char *ps=src_ptr, *pd=dst_ptr;\
125
for (_j = 0; _j < count; _j++){\
126
armci_copy(ps, pd, bytes);\
134
/*
 * armci_put2D is mapped onto CopyPatchTo.  The arguments are reordered:
 * CopyPatchTo receives the source pointer/stride and destination
 * pointer/stride first, followed by count, bytes and the process p.
 */
#   define armci_put2D(p, bytes, count, src_ptr, src_stride, dst_ptr, dst_stride) \
           CopyPatchTo(src_ptr, src_stride, dst_ptr, dst_stride, count, bytes, p)
137
/*
 * armci_get2D is mapped onto CopyPatchFrom.  The arguments are reordered:
 * CopyPatchFrom receives the source pointer/stride and destination
 * pointer/stride first, followed by count, bytes and the process p.
 */
#   define armci_get2D(p, bytes, count, src_ptr, src_stride, dst_ptr, dst_stride) \
           CopyPatchFrom(src_ptr, src_stride, dst_ptr, dst_stride, count, bytes, p)
140
#elif defined(HITACHI) || defined(_ELAN_PUTGET_H) && !defined(NB_NONCONT)
142
#if defined(QUADRICS)
144
# define WAIT_FOR_PUTS elan_putWaitAll(elan_base->state,200)
145
# define WAIT_FOR_GETS elan_getWaitAll(elan_base->state,200)
147
/*
 * WAIT_FOR_PUTS / WAIT_FOR_GETS expand to argument-less calls of the
 * low-level routines armcill_wait_put / armcill_wait_get.  The routines are
 * declared with (void) so the declarations are real prototypes and a call
 * with stray arguments is diagnosed at compile time.
 */
extern void armcill_wait_put(void);
extern void armcill_wait_get(void);
#      define WAIT_FOR_PUTS armcill_wait_put()
#      define WAIT_FOR_GETS armcill_wait_get()
154
/*
 * Low-level 2D put/get routines; armci_put2D and armci_get2D are bound
 * directly to them.  The argument order matches the macro forms of
 * armci_put2D/armci_get2D elsewhere in this header, which issue `count`
 * transfers of `bytes` bytes each to/from process `proc`.
 * NOTE(review): the strides are presumably byte offsets between
 * consecutive segments -- confirm against the implementation.
 */
extern void armcill_put2D(int proc, int bytes, int count,
                          void* src_ptr, int src_stride,
                          void* dst_ptr, int dst_stride);
extern void armcill_get2D(int proc, int bytes, int count,
                          void* src_ptr, int src_stride,
                          void* dst_ptr, int dst_stride);
#   define armci_put2D armcill_put2D
#   define armci_get2D armcill_get2D
161
#elif defined(NB_NONCONT)
163
/*
 * NB_NONCONT branch: WAIT_FOR_PUTS / WAIT_FOR_GETS expand to argument-less
 * calls of armcill_wait_put / armcill_wait_get.  The routines are declared
 * with (void) so the declarations are real prototypes.
 */
extern void armcill_wait_put(void);
extern void armcill_wait_get(void);
#   define WAIT_FOR_PUTS armcill_wait_put()
#   define WAIT_FOR_GETS armcill_wait_get()
168
/*
 * NB_NONCONT branch: low-level 2D put/get routines, with armci_put2D and
 * armci_get2D bound directly to them.  The argument order matches the
 * macro forms of armci_put2D/armci_get2D elsewhere in this header, which
 * issue `count` transfers of `bytes` bytes each to/from process `proc`.
 * NOTE(review): the strides are presumably byte offsets between
 * consecutive segments -- confirm against the implementation.
 */
extern void armcill_put2D(int proc, int bytes, int count,
                          void* src_ptr, int src_stride,
                          void* dst_ptr, int dst_stride);
extern void armcill_get2D(int proc, int bytes, int count,
                          void* src_ptr, int src_stride,
                          void* dst_ptr, int dst_stride);
#   define armci_put2D armcill_put2D
#   define armci_get2D armcill_get2D
175
# if defined(QUADRICS)
177
# define armcill_nb_put(_dst, _src, _sz, _proc, _hdl)\
178
_hdl = elan_put(elan_base->state,_src,_dst,(size_t)_sz,_proc)
179
# define armcill_nb_get(_dst, _src, _sz, _proc, _hdl)\
180
_hdl = elan_get(elan_base->state,_src,_dst,(size_t)_sz,_proc)
181
# define armcill_nb_wait(_hdl)\
184
# elif defined(CRAY_SHMEM)
186
# define armcill_nb_wait(_hdl)\
188
/* VT: this should be #ifdef'ed depending on whether shmem_handle is defined. */
189
# if defined (CRAY_XT)
190
# define armcill_nb_put(_dst, _src, _sz, _proc, _hdl)\
191
shmem_putmem(_dst, _src, (size_t)_sz, _proc)
192
# define armcill_nb_get(_dst, _src, _sz, _proc, _hdl)\
193
shmem_getmem(_dst, _src, (size_t)_sz, _proc)
195
# define armcill_nb_put(_dst, _src, _sz, _proc, _hdl)\
196
_hdl = shmem_putmem_nb(_dst, _src, (size_t)_sz, _proc, &(_hdl))
197
# define armcill_nb_get(_dst, _src, _sz, _proc, _hdl)\
198
_hdl = shmem_getmem_nb(_dst, _src, (size_t)_sz, _proc, &(_hdl))
203
# define armci_put2D(proc,bytes,count,src_ptr,src_stride,dst_ptr,dst_stride){\
205
char *ps=src_ptr, *pd=dst_ptr;\
206
for (_j = 0; _j < count; _j++){\
207
armci_put(ps, pd, bytes, proc);\
214
# define armci_get2D(proc,bytes,count,src_ptr,src_stride,dst_ptr,dst_stride){\
216
char *ps=src_ptr, *pd=dst_ptr;\
217
for (_j = 0; _j < count; _j++){\
218
armci_get(ps, pd, bytes, proc);\
225
/* macros to ensure ordering of consecutive puts or gets following puts */
228
# include "lapidefs.h"
230
#elif defined(_CRAYMPP) || defined(QUADRICS) || defined(__crayx1)\
231
|| defined(CRAY_SHMEM) || defined(PORTALS)
232
#if defined(CRAY) || defined(CRAY_XT)
233
# include <mpp/shmem.h>
242
/* Fence process p through armci_elan_fence, but only when p lies outside
   the range [armci_clus_first, armci_clus_last]; for ranks inside that
   range the block does nothing.  Expands to a brace-enclosed block. */
#   define FENCE_NODE(p) {\
          if(((p) < armci_clus_first) || ((p) > armci_clus_last)) \
              armci_elan_fence(p); }
244
# define UPDATE_FENCE_STATE(p, op, nissued)
248
# define FENCE_NODE(p) if(cmpl_proc == (p)){\
249
if(((p)<armci_clus_first)||((p)>armci_clus_last))shmem_quiet();\
252
/*
 * Call shmem_quiet() only when p equals cmpl_proc (the value stored by
 * UPDATE_FENCE_STATE) and p lies outside [armci_clus_first,
 * armci_clus_last].
 *
 * The test is written as "if (not equal) {} else {...}" so the expansion
 * carries its own else: an else written by the caller after FENCE_NODE(p)
 * can no longer attach to the macro's internal if.
 */
#   define FENCE_NODE(p) if(cmpl_proc != (p)){} else {\
          if(((p) < armci_clus_first) || ((p) > armci_clus_last)) shmem_quiet(); }
255
/* Remember p in cmpl_proc when op is PUT; any other op leaves cmpl_proc
   untouched and nissued is not used.  The expansion already ends in ';'. */
#   define UPDATE_FENCE_STATE(p, op, nissued)  if (PUT == (op)) cmpl_proc = (p);
258
# if defined(GM) && defined(ACK_FENCE)
259
extern void armci_gm_fence(int p);
260
# define FENCE_NODE(p) armci_gm_fence(p)
262
# include "bgmldefs.h"
263
# define FENCE_NODE(p) BGML_WaitProc(p)
264
# elif defined(ARMCIX)
265
# define FENCE_NODE(p) ARMCIX_Fence(p)
267
# define FENCE_NODE(p)
269
# define UPDATE_FENCE_STATE(p, op, nissued)
279
/* Size threshold for the armci_copy variants in this header that choose
   between memcpy and DCOPY1D: a copy with (n) < THRESH1D uses memcpy. */
# define THRESH1D 512
281
/* Alignment unit used by the armci_copy variants in this header for their
   (n) % ALIGN_SIZE and pointer % ALIGN_SIZE checks: the size of a double. */
#define ALIGN_SIZE (sizeof(double))
283
/********* interface to C 1D and 2D memory copy functions ***********/
284
/* dcopy2d_u_ uses explicit unrolled loops to depth 4 */
285
/**
 * 2D copy of doubles from A into B (companion of c_dcopy2d_u_).
 * Every scalar is passed by address; DCopy2D in this header invokes the
 * routine as DCOPY2D(&rows, &cols, src, &src_ld, dst, &dst_ld).
 *
 * @param rows  row count (NOTE(review): presumably the number of
 *              contiguous doubles per column -- confirm)
 * @param cols  column count
 * @param A     source data, read only
 * @param ald   leading dimension of A
 * @param B     destination data
 * @param bld   leading dimension of B
 */
void c_dcopy2d_n_(const int*    const restrict rows,
                  const int*    const restrict cols,
                  const double* const restrict A,
                  const int*    const restrict ald,
                  double*       const restrict B,
                  const int*    const restrict bld);
291
/**
 * 2D copy of doubles from A into B.  Per the existing note in this header,
 * the "_u_" variant uses explicit loops unrolled to depth 4.
 * Every scalar is passed by address; DCopy2D in this header invokes the
 * routine as DCOPY2D(&rows, &cols, src, &src_ld, dst, &dst_ld).
 *
 * @param rows  row count (NOTE(review): presumably the number of
 *              contiguous doubles per column -- confirm)
 * @param cols  column count
 * @param A     source data, read only
 * @param ald   leading dimension of A
 * @param B     destination data
 * @param bld   leading dimension of B
 */
void c_dcopy2d_u_(const int*    const restrict rows,
                  const int*    const restrict cols,
                  const double* const restrict A,
                  const int*    const restrict ald,
                  double*       const restrict B,
                  const int*    const restrict bld);
297
/**
 * 1D copy of doubles from A into B (companion of c_dcopy1d_u_).
 * The armci_copy macros in this header call it through DCOPY1D with the
 * element count computed as bytes / sizeof(double).
 *
 * @param A  source data, read only
 * @param B  destination data
 * @param n  address of the number of doubles to copy
 */
void c_dcopy1d_n_(const double* const restrict A,
                  double*       const restrict B,
                  const int*    const restrict n);
300
/**
 * 1D copy of doubles from A into B.
 * NOTE(review): presumably the unrolled counterpart of c_dcopy1d_n_, by
 * analogy with c_dcopy2d_u_ -- confirm against the implementation.
 * The armci_copy macros in this header call it through DCOPY1D with the
 * element count computed as bytes / sizeof(double).
 *
 * @param A  source data, read only
 * @param B  destination data
 * @param n  address of the number of doubles to copy
 */
void c_dcopy1d_u_(const double* const restrict A,
                  double*       const restrict B,
                  const int*    const restrict n);
303
/**
 * Copy routine reading from the array A and writing to buf (A is const,
 * buf is writable).  Every scalar is passed by address.
 * NOTE(review): presumably packs a rows x cols patch of A into the
 * contiguous buffer buf and updates *cur as a position in buf -- confirm
 * against the implementation.
 *
 * @param rows  row count of the patch
 * @param cols  column count of the patch
 * @param A     source array, read only
 * @param ald   leading dimension of A
 * @param buf   destination buffer
 * @param cur   in/out integer; writable by the routine
 */
void c_dcopy21_(const int*    const restrict rows,
                const int*    const restrict cols,
                const double* const restrict A,
                const int*    const restrict ald,
                double*       const restrict buf,
                int*          const restrict cur);
309
/**
 * Copy routine reading from buf and writing to the array A (buf is const,
 * A is writable).  Every scalar is passed by address.
 * NOTE(review): presumably unpacks the contiguous buffer buf into a
 * rows x cols patch of A and updates *cur as a position in buf -- confirm
 * against the implementation.
 *
 * @param rows  row count of the patch
 * @param cols  column count of the patch
 * @param A     destination array
 * @param ald   leading dimension of A
 * @param buf   source buffer, read only
 * @param cur   in/out integer; writable by the routine
 */
void c_dcopy12_(const int*    const restrict rows,
                const int*    const restrict cols,
                double*       const restrict A,
                const int*    const restrict ald,
                const double* const restrict buf,
                int*          const restrict cur);
315
/**
 * Copy routine reading from the array A and writing to buf (A is const,
 * buf is writable).  Every scalar is passed by address.
 * NOTE(review): presumably packs a rows x cols x plns patch of the 3D
 * array A into the contiguous buffer buf and updates *cur as a position in
 * buf -- confirm against the implementation.
 *
 * @param rows  extent of the patch in the first dimension
 * @param cols  extent of the patch in the second dimension
 * @param plns  extent of the patch in the third dimension
 * @param A     source array, read only
 * @param aldr  leading dimension of A (NOTE(review): presumably rows)
 * @param aldc  leading dimension of A (NOTE(review): presumably columns)
 * @param buf   destination buffer
 * @param cur   in/out integer; writable by the routine
 */
void c_dcopy31_(const int*    const restrict rows,
                const int*    const restrict cols,
                const int*    const restrict plns,
                const double* const restrict A,
                const int*    const restrict aldr,
                const int*    const restrict aldc,
                double*       const restrict buf,
                int*          const restrict cur);
323
/**
 * Copy routine reading from buf and writing to the array A (buf is const,
 * A is writable).  Every scalar is passed by address.
 * NOTE(review): presumably unpacks the contiguous buffer buf into a
 * rows x cols x plns patch of the 3D array A and updates *cur as a
 * position in buf -- confirm against the implementation.
 *
 * @param rows  extent of the patch in the first dimension
 * @param cols  extent of the patch in the second dimension
 * @param plns  extent of the patch in the third dimension
 * @param A     destination array
 * @param aldr  leading dimension of A (NOTE(review): presumably rows)
 * @param aldc  leading dimension of A (NOTE(review): presumably columns)
 * @param buf   source buffer, read only
 * @param cur   in/out integer; writable by the routine
 */
void c_dcopy13_(const int*    const restrict rows,
                const int*    const restrict cols,
                const int*    const restrict plns,
                double*       const restrict A,
                const int*    const restrict aldr,
                const int*    const restrict aldc,
                const double* const restrict buf,
                int*          const restrict cur);
332
#if defined(AIX) || defined(BGML)
333
# define DCOPY2D c_dcopy2d_u_
334
# define DCOPY1D c_dcopy1d_u_
335
#elif defined(LINUX) || defined(__crayx1) || defined(HPUX64) || defined(DECOSF) || defined(CRAY) || defined(WIN32) || defined(HITACHI)
336
# define DCOPY2D c_dcopy2d_n_
337
# define DCOPY1D c_dcopy1d_n_
339
# define DCOPY2D c_dcopy2d_u_
340
# define DCOPY1D c_dcopy1d_u_
342
/* Upper-case names for the C copy routines declared in this header that
   move data between an array and a buffer; unlike DCOPY1D/DCOPY2D each of
   these has a single implementation. */
#define DCOPY21 c_dcopy21_
#define DCOPY12 c_dcopy12_
#define DCOPY31 c_dcopy31_
#define DCOPY13 c_dcopy13_
348
/***************************** 1-Dimensional copy ************************/
349
#if defined(QUADRICS)
350
# include <elan/elan.h>
352
# if defined(_ELAN_PUTGET_H)
353
# define qsw_put(src,dst,n,proc) \
354
elan_wait(elan_put(elan_base->state,src,dst,n,proc),elan_base->waitType)
355
# define qsw_get(src,dst,n,proc) \
356
elan_wait(elan_get(elan_base->state,src,dst,n,proc),elan_base->waitType)
358
# define ARMCI_NB_PUT(src,dst,n,proc,phandle)\
359
*(phandle)=elan_put(elan_base->state,src,dst,n,proc)
362
extern void armci_elan_put_with_tracknotify(char *src,char *dst,int n,int proc, ELAN_EVENT **phandle);
363
# define ARMCI_NB_PUT(src,dst,n,proc,phandle)\
364
armci_elan_put_with_tracknotify(src,dst,n,proc,phandle)
367
# define ARMCI_NB_GET(src,dst,n,proc,phandle)\
368
*(phandle)=elan_get(elan_base->state,src,dst,n,proc)
369
/* Wait on a non-blocking handle: calls elan_wait(handle,
   elan_base->waitType) when handle is non-null and does nothing otherwise.
   Written as a single void expression rather than a bare `if`, so an
   `else` in the calling code cannot attach to the macro's own test. */
#    define ARMCI_NB_WAIT(handle) \
            ((handle) ? (void)elan_wait(handle, elan_base->waitType) : (void)0)
370
/* Non-blocking completion test: stores 1 in *(_succ) when handle is null,
   otherwise the logical negation of elan_poll(handle, 1L).  The whole
   assignment is parenthesized so the macro stays one expression when it
   appears next to other operators. */
#    define ARMCI_NB_TEST(handle,_succ) \
            ((*(_succ)) = (handle) ? !elan_poll(handle,1L) : 1)
372
# define qsw_put(src,dst,n,proc) shmem_putmem((dst),(src),(int)(n),(proc))
373
# define qsw_get(src,dst,n,proc) shmem_getmem((dst),(src),(int)(n),(proc))
376
/*
 * Put: when proc lies inside [armci_clus_first, armci_clus_last] the data
 * is moved with armci_copy, otherwise with qsw_put.
 * proc is parenthesized in both range comparisons so that an argument
 * such as `a & b` is compared as a whole.
 */
#   define armci_put(src,dst,n,proc)\
           if(((proc) <= armci_clus_last) && ((proc) >= armci_clus_first)){\
              armci_copy(src,dst,n);\
           } else { qsw_put(src,dst,n,proc);}
380
/*
 * Get: when proc lies inside [armci_clus_first, armci_clus_last] the data
 * is moved with armci_copy, otherwise with qsw_get (n is cast to int for
 * that call).
 * proc is parenthesized in both range comparisons so that an argument
 * such as `a & b` is compared as a whole.
 */
#   define armci_get(src,dst,n,proc) \
           if(((proc) <= armci_clus_last) && ((proc) >= armci_clus_first)){\
              armci_copy(src,dst,n);\
           } else { qsw_get((src),(dst),(int)(n),(proc));}
385
#elif defined(CRAY_T3E) || defined(CRAY_SHMEM)
386
# define armci_copy_disabled(src,dst,n)\
387
if((n)<256 || n%sizeof(long) ) memcpy((dst),(src),(n));\
389
shmem_put((long*)(dst),(long*)(src),(int)(n)/sizeof(long),armci_me);\
392
# define armci_put(src,dst,n,proc) \
393
shmem_put32((void *)(dst),(void *)(src),(int)(n)/4,(proc));\
396
# define armci_get(src,dst,n,proc) \
397
shmem_get32((void *)(dst),(void *)(src),(int)(n)/4,(proc));\
400
#elif defined(HITACHI)
402
/* Low-level contiguous put/get routines; the armci_put and armci_get
   macros of this branch call them for processes outside
   [armci_clus_first, armci_clus_last].
   Arguments: source address, destination address, size in bytes, and the
   process rank. */
extern void armcill_put(void *src, void *dst, int bytes, int proc);
extern void armcill_get(void *src, void *dst, int bytes, int proc);
405
/*
 * Put: when proc lies inside [armci_clus_first, armci_clus_last] the data
 * is moved with armci_copy, otherwise with armcill_put.
 * proc is parenthesized in both range comparisons so that an argument
 * such as `a & b` is compared as a whole.
 */
#   define armci_put(src,dst,n,proc) \
           if(((proc) <= armci_clus_last) && ((proc) >= armci_clus_first)){\
              armci_copy(src,dst,n);\
           } else { armcill_put((src), (dst),(n),(proc));}
410
/*
 * Get: when proc lies inside [armci_clus_first, armci_clus_last] the data
 * is moved with armci_copy, otherwise with armcill_get.
 * proc is parenthesized in both range comparisons so that an argument
 * such as `a & b` is compared as a whole.
 */
#   define armci_get(src,dst,n,proc)\
           if(((proc) <= armci_clus_last) && ((proc) >= armci_clus_first)){\
              armci_copy(src,dst,n);\
           } else { armcill_get((src), (dst),(n),(proc));}
415
#elif defined(FUJITSU)
417
# include "fujitsu-vpp.h"
419
# define armci_copy(src,dst,n) _MmCopy((char*)(dst), (char*)(src), (n))
421
# define armci_put CopyTo
422
# define armci_get CopyFrom
427
extern lapi_handle_t lapi_handle;
429
# define armci_put(src,dst,n,proc)\
431
armci_copy(src,dst,n);\
433
if(LAPI_Put(lapi_handle, (uint)proc, (uint)n, (dst), (src),\
434
NULL,&(ack_cntr[ARMCI_THREAD_IDX].cntr),&cmpl_arr[proc].cntr))\
435
ARMCI_Error("LAPI_put failed",0); else;}
437
/**** this copy is nonblocking and requires fence to complete!!! ****/
438
# define armci_get(src,dst,n,proc) \
440
armci_copy(src,dst,n);\
442
if(LAPI_Get(lapi_handle, (uint)proc, (uint)n, (src), (dst), \
443
NULL, &(get_cntr[ARMCI_THREAD_IDX].cntr)))\
444
ARMCI_Error("LAPI_Get failed",0);else;}
446
# define ARMCI_NB_PUT(src,dst,n,proc,cmplt)\
447
{if(LAPI_Setcntr(lapi_handle, &((cmplt)->cntr), 0))\
448
ARMCI_Error("LAPI_Setcntr in NB_PUT failed",0);\
450
if(LAPI_Put(lapi_handle, (uint)proc, (uint)n, (dst), (src),\
451
NULL, &((cmplt)->cntr), &cmpl_arr[proc].cntr))\
452
ARMCI_Error("LAPI_put failed",0); else;}
454
# define ARMCI_NB_GET(src,dst,n,proc,cmplt)\
455
{if(LAPI_Setcntr(lapi_handle, &((cmplt)->cntr), 0))\
456
ARMCI_Error("LAPI_Setcntr in NB_GET failed",0);\
458
if(LAPI_Get(lapi_handle, (uint)proc, (uint)n, (src), (dst), \
459
NULL, &((cmplt)->cntr)))\
460
ARMCI_Error("LAPI_Get NB_GET failed",0);else;}
462
# define ARMCI_NB_WAIT(cmplt) CLEAR_COUNTER((cmplt))
463
# define ARMCI_NB_TEST(cmplt,_succ) TEST_COUNTER((cmplt),(_succ))
465
#elif defined(PORTALS)
466
/*
 * Put: when proc lies inside [armci_clus_first, armci_clus_last] the data
 * is moved with armci_copy, otherwise with PARMCI_Put.
 * proc is parenthesized in both range comparisons so that an argument
 * such as `a & b` is compared as a whole.
 */
#   define armci_put(src,dst,n,proc) \
           if(((proc) <= armci_clus_last) && ((proc) >= armci_clus_first)){\
              armci_copy(src,dst,n);\
           } else { PARMCI_Put((src), (dst),(n),(proc));}
471
/*
 * Get: when proc lies inside [armci_clus_first, armci_clus_last] the data
 * is moved with armci_copy, otherwise with PARMCI_Get.
 * proc is parenthesized in both range comparisons so that an argument
 * such as `a & b` is compared as a whole.
 */
#   define armci_get(src,dst,n,proc)\
           if(((proc) <= armci_clus_last) && ((proc) >= armci_clus_first)){\
              armci_copy(src,dst,n);\
           } else { PARMCI_Get((src), (dst),(n),(proc));}
477
/*
 * Non-blocking put: stores GET_NEXT_NBTAG() in nb_handle->tag, then calls
 * armci_portals_put with that tag.  nb_handle is not a macro parameter; it
 * is taken from the scope in which the macro is used.
 * Both steps are joined with the comma operator inside one parenthesized
 * expression, so an unbraced `if (...) ARMCI_NB_PUT(...);` guards the
 * whole operation instead of only the tag assignment.
 */
#   define ARMCI_NB_PUT(src,dst,n,proc,cmplt)\
           (nb_handle->tag = GET_NEXT_NBTAG(), \
            armci_portals_put((proc),(src),(dst),(n),cmplt,nb_handle->tag))
480
/*
 * Non-blocking get: stores GET_NEXT_NBTAG() in nb_handle->tag, then calls
 * armci_portals_get with that tag.  nb_handle is not a macro parameter; it
 * is taken from the scope in which the macro is used.
 * Both steps are joined with the comma operator inside one parenthesized
 * expression, so an unbraced `if (...) ARMCI_NB_GET(...);` guards the
 * whole operation instead of only the tag assignment.
 */
#   define ARMCI_NB_GET(src,dst,n,proc,cmplt)\
           (nb_handle->tag = GET_NEXT_NBTAG(), \
            armci_portals_get((proc),(src),(dst),(n),cmplt,nb_handle->tag))
486
/* armci_get forwards its four arguments unchanged to PARMCI_Get. */
#define armci_get(src, dst, n, p) PARMCI_Get(src, dst, n, p)
487
/* armci_put forwards its four arguments unchanged to PARMCI_Put. */
#define armci_put(src, dst, n, p) PARMCI_Put(src, dst, n, p)
489
#elif defined(ARMCIX)
490
/* ARMCIX branch: armci_get forwards its four arguments unchanged to
   PARMCI_Get. */
#define armci_get(src, dst, n, p) PARMCI_Get(src, dst, n, p)
491
/* ARMCIX branch: armci_put forwards its four arguments unchanged to
   PARMCI_Put. */
#define armci_put(src, dst, n, p) PARMCI_Put(src, dst, n, p)
492
/* ARMCI_NB_WAIT passes the address of its argument to ARMCIX_Wait, so the
   argument must be an lvalue. */
#define ARMCI_NB_WAIT(cmplt) ARMCIX_Wait(&(cmplt))
495
/* Fallback put/get: both reduce to armci_copy(src, dst, n).  The process
   argument p is dropped from the expansion, so it is never evaluated. */
#   define armci_get(src,dst,n,p)    armci_copy((src),(dst),(n))
#   define armci_put(src,dst,n,p)    armci_copy((src),(dst),(n))
501
# define MEM_FENCE {}
503
#ifndef armci_copy_fence
504
# define armci_copy_fence armci_copy