1
/* $Id: copy.h,v 1.86.2.6 2007-08-29 17:32:32 manoj Exp $ */
16
# define EXTERN extern
20
#if defined(SGI) || defined(FUJITSU) || defined(HPUX) || defined(SOLARIS) || defined (DECOSF) || defined(__ia64__) || defined(__crayx1)
24
/*
 * Build-configuration guard for NB_NONCONT: compilation is rejected unless
 * CRAY_SHMEM or QUADRICS is defined.
 * NOTE(review): the #error text also names PORTALS, but the condition does not
 * test for it -- confirm whether "&& !defined(PORTALS)" is missing here or the
 * message is out of date.
 */
#if defined(NB_NONCONT) && !defined(CRAY_SHMEM) && !defined(QUADRICS)
25
#error NB_NONCONT is only available on CRAY_SHMEM,QUADRICS and PORTALS
28
/*
 * Build-configuration guard: SHMEM_HANDLE_SUPPORTED may only be defined
 * together with CRAY_SHMEM; any other combination stops the build.
 */
#if defined(SHMEM_HANDLE_SUPPORTED) && !defined(CRAY_SHMEM)
29
#error SHMEM_HANDLE_SUPPORTED should not be defined on a non CRAY_SHMEM network
32
/* 08/30/06 moved up here from lines 252-397, MEM_FENCE before FENCE_NODE */
34
/*
 * MEM_FENCE: memory-fence primitive, only configured when NEED_MEM_SYNC is
 * defined. Variants visible here:
 *   - _clear_lock() applied to a dummy local int;
 *   - ia64 with GCC (not the Intel compiler): inline "mf" instruction with a
 *     "memory" clobber;
 *   - ia64 otherwise: call to the external routine _armci_ia64_mb().
 * NOTE(review): the two ia64 definitions already end in ';', so writing
 * "MEM_FENCE;" yields an extra empty statement -- keep that in mind when the
 * macro is used as the body of an unbraced if/else.
 */
#if defined(NEED_MEM_SYNC)
36
# define MEM_FENCE {int _dummy=1; _clear_lock((int *)&_dummy,0); }
37
# elif defined(__ia64)
38
# if defined(__GNUC__) && !defined (__INTEL_COMPILER)
39
# define MEM_FENCE __asm__ __volatile__ ("mf" ::: "memory");
40
# else /* Intel Compiler */
41
extern void _armci_ia64_mb();
42
# define MEM_FENCE _armci_ia64_mb();
44
# elif defined(LINUX) && defined(__GNUC__) && defined(__ppc__)
46
__asm__ __volatile__ ("isync" : : : "memory");
51
/* armci_copy(src,dst,n): local memory copy, forwarded unchanged to bcopy(src,dst,n). */
# define armci_copy(src,dst,n) bcopy(src,dst,n)
54
/****************************** 2D Copy *******************/
57
# define DCopy2D(rows, cols, src_ptr, src_ld, dst_ptr, dst_ld){\
58
int j, nbytes = sizeof(double)* rows;\
59
char *ps=src_ptr, *pd=dst_ptr;\
60
for (j = 0; j < cols; j++){\
61
armci_copy(ps, pd, nbytes);\
62
ps += sizeof(double)* src_ld;\
63
pd += sizeof(double)* dst_ld;\
68
# define ByteCopy2D(bytes, count, src_ptr, src_stride, dst_ptr,dst_stride){\
70
char *ps=src_ptr, *pd=dst_ptr;\
71
for (_j = 0; _j < count; _j++){\
72
armci_copy(ps, pd, bytes);\
80
/*
 * armci_put2D: forwards to CopyPatchTo with the arguments reordered to
 * (src_ptr, src_stride, dst_ptr, dst_stride, count, bytes, p).
 */
# define armci_put2D(p, bytes,count,src_ptr,src_stride,dst_ptr,dst_stride)\
81
CopyPatchTo(src_ptr, src_stride, dst_ptr, dst_stride, count,bytes, p)
83
/*
 * armci_get2D: forwards to CopyPatchFrom with the arguments reordered to
 * (src_ptr, src_stride, dst_ptr, dst_stride, count, bytes, p).
 */
# define armci_get2D(p, bytes, count, src_ptr,src_stride,dst_ptr,dst_stride)\
84
CopyPatchFrom(src_ptr, src_stride, dst_ptr, dst_stride,count,bytes,p)
86
/*
 * HITACHI / Elan put-get branch.
 * NOTE(review): && binds tighter than ||, so this condition reads
 * HITACHI || (_ELAN_PUTGET_H && !NB_NONCONT) -- confirm that HITACHI is meant
 * to take this branch even when NB_NONCONT is defined.
 */
#elif defined(HITACHI) || defined(_ELAN_PUTGET_H) && !defined(NB_NONCONT)
90
/* Elan flavour: calls elan_putWaitAll / elan_getWaitAll on elan_base->state with 200 as second argument. */
# define WAIT_FOR_PUTS elan_putWaitAll(elan_base->state,200)
91
# define WAIT_FOR_GETS elan_getWaitAll(elan_base->state,200)
93
/* Low-level flavour: forwards to the external routines armcill_wait_put() / armcill_wait_get(). */
# define WAIT_FOR_PUTS armcill_wait_put()
94
# define WAIT_FOR_GETS armcill_wait_get()
95
extern void armcill_wait_put();
96
extern void armcill_wait_get();
100
/*
 * External 2-D put/get routines; armci_put2D and armci_get2D are plain aliases
 * for them in this configuration.
 * Parameters: proc, bytes, count, then source pointer/stride and destination
 * pointer/stride (presumably count segments of bytes bytes each, separated by
 * the given strides -- TODO confirm against the implementation).
 */
extern void armcill_put2D(int proc, int bytes, int count,
101
void* src_ptr,int src_stride, void* dst_ptr,int dst_stride);
102
extern void armcill_get2D(int proc, int bytes, int count,
103
void* src_ptr,int src_stride, void* dst_ptr,int dst_stride);
104
# define armci_put2D armcill_put2D
105
# define armci_get2D armcill_get2D
107
/*
 * NB_NONCONT branch: WAIT_FOR_PUTS / WAIT_FOR_GETS forward to the external
 * armcill_wait_put() / armcill_wait_get(), and armci_put2D / armci_get2D are
 * aliases for the external armcill_put2D / armcill_get2D routines.
 */
#elif defined(NB_NONCONT)
109
extern void armcill_wait_put();
110
extern void armcill_wait_get();
111
# define WAIT_FOR_PUTS armcill_wait_put()
112
# define WAIT_FOR_GETS armcill_wait_get()
114
/* Parameters: proc, bytes, count, source pointer/stride, destination pointer/stride. */
extern void armcill_put2D(int proc, int bytes, int count,
115
void* src_ptr,int src_stride, void* dst_ptr,int dst_stride);
116
extern void armcill_get2D(int proc, int bytes, int count,
117
void* src_ptr,int src_stride, void* dst_ptr,int dst_stride);
118
# define armci_put2D armcill_put2D
119
# define armci_get2D armcill_get2D
121
/*
 * QUADRICS: armcill_nb_put / armcill_nb_get start an elan_put / elan_get on
 * elan_base->state and assign the returned value to _hdl. Note the argument
 * order handed to Elan is (_src, _dst), the reverse of the macro's
 * (_dst, _src) parameter order; the size is cast to size_t.
 */
# if defined(QUADRICS)
123
# define armcill_nb_put(_dst, _src, _sz, _proc, _hdl)\
124
_hdl = elan_put(elan_base->state,_src,_dst,(size_t)_sz,_proc)
125
# define armcill_nb_get(_dst, _src, _sz, _proc, _hdl)\
126
_hdl = elan_get(elan_base->state,_src,_dst,(size_t)_sz,_proc)
127
# define armcill_nb_wait(_hdl)\
130
# elif defined(CRAY_SHMEM)
132
# define armcill_nb_wait(_hdl)\
134
/* VT: this should be #ifdef'ed based on whether shmem_handle is defined or not. */
135
/*
 * CRAY_XT: armcill_nb_put / armcill_nb_get call plain shmem_putmem /
 * shmem_getmem; the _hdl argument is ignored.
 */
# if defined (CRAY_XT)
136
# define armcill_nb_put(_dst, _src, _sz, _proc, _hdl)\
137
shmem_putmem(_dst, _src, (size_t)_sz, _proc)
138
# define armcill_nb_get(_dst, _src, _sz, _proc, _hdl)\
139
shmem_getmem(_dst, _src, (size_t)_sz, _proc)
141
/*
 * Handle-based variant: shmem_putmem_nb / shmem_getmem_nb receive &(_hdl) and
 * their return value is also assigned to _hdl.
 */
# define armcill_nb_put(_dst, _src, _sz, _proc, _hdl)\
142
_hdl = shmem_putmem_nb(_dst, _src, (size_t)_sz, _proc, &(_hdl))
143
# define armcill_nb_get(_dst, _src, _sz, _proc, _hdl)\
144
_hdl = shmem_getmem_nb(_dst, _src, (size_t)_sz, _proc, &(_hdl))
149
# define armci_put2D(proc,bytes,count,src_ptr,src_stride,dst_ptr,dst_stride){\
151
char *ps=src_ptr, *pd=dst_ptr;\
152
for (_j = 0; _j < count; _j++){\
153
armci_put(ps, pd, bytes, proc);\
160
# define armci_get2D(proc,bytes,count,src_ptr,src_stride,dst_ptr,dst_stride){\
162
char *ps=src_ptr, *pd=dst_ptr;\
163
for (_j = 0; _j < count; _j++){\
164
armci_get(ps, pd, bytes, proc);\
171
/* macros to ensure ordering of consecutive puts or gets following puts */
174
# include "lapidefs.h"
176
#elif defined(_CRAYMPP) || defined(QUADRICS) || defined(__crayx1)\
177
|| defined(CRAY_SHMEM)
178
#if defined(CRAY) || defined(CRAY_XT)
179
# include <mpp/shmem.h>
188
/*
 * FENCE_NODE(p): calls armci_elan_fence(p) only when p lies outside the range
 * armci_clus_first..armci_clus_last; otherwise it does nothing.
 * UPDATE_FENCE_STATE expands to nothing in this configuration.
 */
# define FENCE_NODE(p) {\
189
if(((p)<armci_clus_first)||((p)>armci_clus_last))armci_elan_fence(p);}
190
# define UPDATE_FENCE_STATE(p, op, nissued)
194
# define FENCE_NODE(p) if(cmpl_proc == (p)){\
195
if(((p)<armci_clus_first)||((p)>armci_clus_last))shmem_quiet();\
198
/*
 * FENCE_NODE(p): when p equals cmpl_proc and lies outside the range
 * armci_clus_first..armci_clus_last, calls shmem_quiet().
 * UPDATE_FENCE_STATE(p, op, nissued): stores p in cmpl_proc when op is PUT;
 * nissued is unused.
 * NOTE(review): both macros expand to a bare `if` statement, so an `else`
 * following the call site would attach to the macro's `if`.
 */
# define FENCE_NODE(p) if(cmpl_proc == (p)){\
199
if(((p)<armci_clus_first)||((p)>armci_clus_last))shmem_quiet(); }
201
# define UPDATE_FENCE_STATE(p, op, nissued) if((op)==PUT) cmpl_proc=(p);
204
/*
 * Remaining FENCE_NODE variants:
 *   - GM with ACK_FENCE: external armci_gm_fence(p);
 *   - with bgmldefs.h included: BGML_WaitProc(p);
 *   - otherwise FENCE_NODE and UPDATE_FENCE_STATE expand to nothing.
 */
# if defined(GM) && defined(ACK_FENCE)
205
extern void armci_gm_fence(int p);
206
# define FENCE_NODE(p) armci_gm_fence(p)
208
# include "bgmldefs.h"
209
# define FENCE_NODE(p) BGML_WaitProc(p)
211
# define FENCE_NODE(p)
213
# define UPDATE_FENCE_STATE(p, op, nissued)
223
/* THRESH1D: numeric threshold of 512 (units and point of use are not visible here -- TODO confirm). */
# define THRESH1D 512
225
/* ALIGN_SIZE: alignment granularity, equal to sizeof(double). */
#define ALIGN_SIZE sizeof(double)
227
/********* interface to C 1D and 2D memory copy functions ***********/
228
/*
 * 2-D double-precision copy kernels (C implementations). Every argument,
 * including the scalar sizes, is passed by pointer and is restrict-qualified.
 * The _u_ variant uses explicit unrolled loops to depth 4; the _n_ variant is
 * presumably the non-unrolled one (TODO confirm against the implementation).
 * Presumed contract, inferred from the parameter names only -- verify: copy a
 * (*rows) x (*cols) block of doubles from A (leading dimension *ald) into B
 * (leading dimension *bld).
 */
229
void c_dcopy2d_n_(const int* const restrict rows,
230
const int* const restrict cols,
231
const double* const restrict A,
232
const int* const restrict ald,
233
double* const restrict B,
234
const int* const restrict bld);
235
void c_dcopy2d_u_(const int* const restrict rows,
236
const int* const restrict cols,
237
const double* const restrict A,
238
const int* const restrict ald,
239
double* const restrict B,
240
const int* const restrict bld);
241
/*
 * 1-D double-precision copy kernels (C implementations), in _n_ and _u_
 * flavours. A is read-only, B is written, and the element count is passed by
 * pointer in n (presumably *n doubles are copied from A to B -- TODO confirm).
 */
void c_dcopy1d_n_(const double* const restrict A,
242
double* const restrict B,
243
const int* const restrict n);
244
void c_dcopy1d_u_(const double* const restrict A,
245
double* const restrict B,
246
const int* const restrict n);
247
/*
 * c_dcopy21_: A is read-only and buf is writable -- presumably packs a
 * (*rows) x (*cols) section of the 2-D array A (leading dimension *ald) into
 * the 1-D buffer buf (TODO confirm). cur is a writable int; presumably a
 * running position updated by the call -- verify against the implementation.
 */
void c_dcopy21_(const int* const restrict rows,
248
const int* const restrict cols,
249
const double* const restrict A,
250
const int* const restrict ald,
251
double* const restrict buf,
252
int* const restrict cur);
253
/*
 * c_dcopy12_: the mirror image -- buf is read-only and A is writable, so this
 * presumably unpacks the 1-D buffer into the 2-D array (TODO confirm).
 */
void c_dcopy12_(const int* const restrict rows,
254
const int* const restrict cols,
255
double* const restrict A,
256
const int* const restrict ald,
257
const double* const restrict buf,
258
int* const restrict cur);
259
/*
 * c_dcopy31_: 3-D counterpart of the 2-D-to-1-D routine. Takes rows, cols and
 * plns plus two leading dimensions (aldr, aldc); A is read-only and buf is
 * writable (presumably packs a 3-D section of A into buf -- TODO confirm).
 * cur is a writable int, presumably a running position -- verify.
 */
void c_dcopy31_(const int* const restrict rows,
260
const int* const restrict cols,
261
const int* const restrict plns,
262
const double* const restrict A,
263
const int* const restrict aldr,
264
const int* const restrict aldc,
265
double* const restrict buf,
266
int* const restrict cur);
267
/*
 * c_dcopy13_: same parameter list with the const-ness of A and buf swapped
 * (buf read-only, A writable); presumably the unpack direction -- TODO confirm.
 */
void c_dcopy13_(const int* const restrict rows,
268
const int* const restrict cols,
269
const int* const restrict plns,
270
double* const restrict A,
271
const int* const restrict aldr,
272
const int* const restrict aldc,
273
const double* const restrict buf,
274
int* const restrict cur);
276
/********* interface to fortran 1D and 2D memory copy functions ***********/
279
/* ATR: calling-convention attribute placed on the Fortran routine prototypes; here __stdcall. */
# define ATR __stdcall
283
/*
 * Map the C-visible names of the Fortran copy routines through the
 * F77_FUNC / F77_FUNC_ name-mangling macros. The routine names that contain an
 * underscore use F77_FUNC_, the others use F77_FUNC (presumably the autoconf
 * Fortran-wrapper convention -- confirm against the build configuration).
 */
# define dcopy2d_n_ F77_FUNC_(dcopy2d_n,DCOPY2D_N)
284
# define dcopy2d_u_ F77_FUNC_(dcopy2d_u,DCOPY2D_U)
285
# define dcopy1d_n_ F77_FUNC_(dcopy1d_n,DCOPY1D_N)
286
# define dcopy1d_u_ F77_FUNC_(dcopy1d_u,DCOPY1D_U)
287
# define dcopy21_ F77_FUNC(dcopy21,DCOPY21)
288
# define dcopy12_ F77_FUNC(dcopy12,DCOPY12)
289
# define dcopy31_ F77_FUNC(dcopy31,DCOPY31)
290
# define dcopy13_ F77_FUNC(dcopy13,DCOPY13)
291
/*
 * Fortran 2-D double-precision copy routines, declared with the ATR calling
 * convention. All arguments are passed by pointer. Presumed contract, inferred
 * from the parameter names only -- verify: copy a (*rows) x (*cols) block from
 * A (leading dimension *ald) into B (leading dimension *bld).
 */
void ATR dcopy2d_n_(const int* const restrict rows,
292
const int* const restrict cols,
293
const double* const restrict A,
294
const int* const restrict ald,
295
double* const restrict B,
296
const int* const restrict bld);
297
void ATR dcopy2d_u_(const int* const restrict rows,
298
const int* const restrict cols,
299
const double* const restrict A,
300
const int* const restrict ald,
301
double* const restrict B,
302
const int* const restrict bld);
303
/*
 * Fortran 1-D double-precision copy routines (ATR calling convention), in _n_
 * and _u_ flavours. A is read-only, B is written, and the element count is
 * passed by pointer in n (presumably *n doubles are copied -- TODO confirm).
 */
void ATR dcopy1d_n_(const double* const restrict A,
304
double* const restrict B,
305
const int* const restrict n);
306
void ATR dcopy1d_u_(const double* const restrict A,
307
double* const restrict B,
308
const int* const restrict n);
309
/*
 * dcopy21_ (Fortran, ATR calling convention): A is read-only and buf is
 * writable -- presumably packs a 2-D section of A (leading dimension *ald)
 * into the 1-D buffer (TODO confirm). cur is a writable int, presumably a
 * running position updated by the call -- verify.
 */
void ATR dcopy21_(const int* const restrict rows,
310
const int* const restrict cols,
311
const double* const restrict A,
312
const int* const restrict ald,
313
double* const restrict buf,
314
int* const restrict cur);
315
/*
 * dcopy12_: same parameter list with buf read-only and A writable; presumably
 * the unpack direction (TODO confirm).
 */
void ATR dcopy12_(const int* const restrict rows,
316
const int* const restrict cols,
317
double* const restrict A,
318
const int* const restrict ald,
319
const double* const restrict buf,
320
int* const restrict cur);
321
/*
 * dcopy31_ (Fortran, ATR calling convention): 3-D counterpart taking rows,
 * cols, plns and two leading dimensions (aldr, aldc). A is read-only and buf
 * is writable (presumably packs a 3-D section of A into buf -- TODO confirm).
 * cur is a writable int, presumably a running position -- verify.
 */
void ATR dcopy31_(const int* const restrict rows,
322
const int* const restrict cols,
323
const int* const restrict plns,
324
const double* const restrict A,
325
const int* const restrict aldr,
326
const int* const restrict aldc,
327
double* const restrict buf,
328
int* const restrict cur);
329
/*
 * dcopy13_: same parameter list with buf read-only and A writable; presumably
 * the unpack direction (TODO confirm).
 */
void ATR dcopy13_(const int* const restrict rows,
330
const int* const restrict cols,
331
const int* const restrict plns,
332
double* const restrict A,
333
const int* const restrict aldr,
334
const int* const restrict aldc,
335
const double* const restrict buf,
336
int* const restrict cur);
340
/*
 * Bind the generic DCOPY* names to the C kernels (c_ prefix).
 * AIX and BGML select the _u_ variants of the 1-D/2-D copies; LINUX,
 * __crayx1, HPUX64, DECOSF, CRAY, WIN32 and HITACHI select the _n_ variants;
 * the remaining definition pair uses the _u_ variants again.
 * DCOPY21/12/31/13 each map to a single C routine.
 */
# if defined(AIX) || defined(BGML)
341
# define DCOPY2D c_dcopy2d_u_
342
# define DCOPY1D c_dcopy1d_u_
343
# elif defined(LINUX) || defined(__crayx1) || defined(HPUX64) || defined(DECOSF) || defined(CRAY) || defined(WIN32) || defined(HITACHI)
344
# define DCOPY2D c_dcopy2d_n_
345
# define DCOPY1D c_dcopy1d_n_
347
# define DCOPY2D c_dcopy2d_u_
348
# define DCOPY1D c_dcopy1d_u_
350
# define DCOPY21 c_dcopy21_
351
# define DCOPY12 c_dcopy12_
352
# define DCOPY31 c_dcopy31_
353
# define DCOPY13 c_dcopy13_
355
/*
 * Bind the generic DCOPY* names to the Fortran routines (no c_ prefix).
 * AIX and BGML select the _u_ variants of the 1-D/2-D copies; LINUX,
 * __crayx1, HPUX64, DECOSF, CRAY, WIN32 and HITACHI select the _n_ variants;
 * the remaining definition pair uses the _u_ variants again.
 * DCOPY21/12/31/13 each map to a single Fortran routine.
 */
# if defined(AIX) || defined(BGML)
356
# define DCOPY2D dcopy2d_u_
357
# define DCOPY1D dcopy1d_u_
358
# elif defined(LINUX) || defined(__crayx1) || defined(HPUX64) || defined(DECOSF) || defined(CRAY) || defined(WIN32) || defined(HITACHI)
359
# define DCOPY2D dcopy2d_n_
360
# define DCOPY1D dcopy1d_n_
362
# define DCOPY2D dcopy2d_u_
363
# define DCOPY1D dcopy1d_u_
365
# define DCOPY21 dcopy21_
366
# define DCOPY12 dcopy12_
367
# define DCOPY31 dcopy31_
368
# define DCOPY13 dcopy13_
372
/***************************** 1-Dimensional copy ************************/
373
/*
 * QUADRICS 1-D transfers. When _ELAN_PUTGET_H is defined, qsw_put / qsw_get
 * start an elan_put / elan_get on elan_base->state and immediately pass the
 * returned event to elan_wait with elan_base->waitType.
 */
#if defined(QUADRICS)
374
# include <elan/elan.h>
376
# if defined(_ELAN_PUTGET_H)
377
# define qsw_put(src,dst,n,proc) \
378
elan_wait(elan_put(elan_base->state,src,dst,n,proc),elan_base->waitType)
379
# define qsw_get(src,dst,n,proc) \
380
elan_wait(elan_get(elan_base->state,src,dst,n,proc),elan_base->waitType)
382
/* ARMCI_NB_PUT, direct form: starts elan_put and stores the returned event in *phandle. */
# define ARMCI_NB_PUT(src,dst,n,proc,phandle)\
383
*(phandle)=elan_put(elan_base->state,src,dst,n,proc)
386
/* ARMCI_NB_PUT, alternative form: delegates to the external armci_elan_put_with_tracknotify(). */
extern void armci_elan_put_with_tracknotify(char *src,char *dst,int n,int proc, ELAN_EVENT **phandle);
387
# define ARMCI_NB_PUT(src,dst,n,proc,phandle)\
388
armci_elan_put_with_tracknotify(src,dst,n,proc,phandle)
391
/* ARMCI_NB_GET: starts elan_get and stores the returned event in *phandle. */
# define ARMCI_NB_GET(src,dst,n,proc,phandle)\
392
*(phandle)=elan_get(elan_base->state,src,dst,n,proc)
393
/* ARMCI_NB_WAIT: calls elan_wait only for a non-null handle; expands to a bare `if`. */
# define ARMCI_NB_WAIT(handle) if(handle)elan_wait(handle,elan_base->waitType)
394
/* ARMCI_NB_TEST: *_succ = 1 for a null handle, otherwise the negated result of elan_poll(handle,1L). */
# define ARMCI_NB_TEST(handle,_succ) (*(_succ))= (handle)? !elan_poll(handle,1L): 1
396
/*
 * SHMEM flavour of qsw_put / qsw_get: forwards to shmem_putmem / shmem_getmem.
 * Note the argument swap to (dst, src) and the cast of n to int.
 */
# define qsw_put(src,dst,n,proc) shmem_putmem((dst),(src),(int)(n),(proc))
397
# define qsw_get(src,dst,n,proc) shmem_getmem((dst),(src),(int)(n),(proc))
400
/*
 * armci_put(src,dst,n,proc): dispatch a put of n bytes.
 * When proc lies inside armci_clus_first..armci_clus_last the transfer is done
 * with armci_copy(); otherwise it is handed to qsw_put().
 * Every macro argument is parenthesized so that expression arguments such as
 * `p & 7` are evaluated as a unit (the range test previously used a bare
 * `proc>=`, which mis-parsed such arguments).
 * NOTE: expands to a bare if/else statement; brace the call site when it is
 * the body of another if.
 */
# define armci_put(src,dst,n,proc)\
    if(((proc)<=armci_clus_last) && ((proc)>=armci_clus_first)){\
        armci_copy((src),(dst),(n));\
    } else { qsw_put((src),(dst),(n),(proc));}
404
/*
 * armci_get(src,dst,n,proc): dispatch a get of n bytes.
 * When proc lies inside armci_clus_first..armci_clus_last the transfer is done
 * with armci_copy(); otherwise it is handed to qsw_get() with n cast to int.
 * Every macro argument is parenthesized so that expression arguments such as
 * `p & 7` are evaluated as a unit (the range test previously used a bare
 * `proc>=`, which mis-parsed such arguments).
 * NOTE: expands to a bare if/else statement; brace the call site when it is
 * the body of another if.
 */
# define armci_get(src,dst,n,proc) \
    if(((proc)<=armci_clus_last) && ((proc)>=armci_clus_first)){\
        armci_copy((src),(dst),(n));\
    } else { qsw_get((src),(dst),(int)(n),(proc));}
409
#elif defined(CRAY_T3E) || defined(CRAY_SHMEM)
410
# define armci_copy_disabled(src,dst,n)\
411
if((n)<256 || n%sizeof(long) ) memcpy((dst),(src),(n));\
413
shmem_put((long*)(dst),(long*)(src),(int)(n)/sizeof(long),armci_me);\
416
# define armci_put(src,dst,n,proc) \
417
shmem_put32((void *)(dst),(void *)(src),(int)(n)/4,(proc));\
420
# define armci_get(src,dst,n,proc) \
421
shmem_get32((void *)(dst),(void *)(src),(int)(n)/4,(proc));\
424
#elif defined(HITACHI)
426
/* External HITACHI low-level 1-D put/get routines: (src, dst, bytes, proc). */
extern void armcill_put(void *src, void *dst, int bytes, int proc);
427
extern void armcill_get(void *src, void *dst, int bytes, int proc);
429
/*
 * armci_put(src,dst,n,proc), HITACHI: when proc lies inside
 * armci_clus_first..armci_clus_last the transfer is done with armci_copy();
 * otherwise it is handed to armcill_put().
 * Every macro argument is parenthesized so that expression arguments such as
 * `p & 7` are evaluated as a unit (the range test previously used a bare
 * `proc>=`, which mis-parsed such arguments).
 * NOTE: expands to a bare if/else statement; brace the call site when it is
 * the body of another if.
 */
# define armci_put(src,dst,n,proc) \
    if(((proc)<=armci_clus_last) && ((proc)>=armci_clus_first)){\
        armci_copy((src),(dst),(n));\
    } else { armcill_put((src), (dst),(n),(proc));}
434
/*
 * armci_get(src,dst,n,proc), HITACHI: when proc lies inside
 * armci_clus_first..armci_clus_last the transfer is done with armci_copy();
 * otherwise it is handed to armcill_get().
 * Every macro argument is parenthesized so that expression arguments such as
 * `p & 7` are evaluated as a unit (the range test previously used a bare
 * `proc>=`, which mis-parsed such arguments).
 * NOTE: expands to a bare if/else statement; brace the call site when it is
 * the body of another if.
 */
# define armci_get(src,dst,n,proc)\
    if(((proc)<=armci_clus_last) && ((proc)>=armci_clus_first)){\
        armci_copy((src),(dst),(n));\
    } else { armcill_get((src), (dst),(n),(proc));}
439
#elif defined(FUJITSU)
441
/*
 * FUJITSU mappings: armci_copy forwards to _MmCopy with the arguments swapped
 * to (dst, src, n) and both pointers cast to char*; armci_put / armci_get are
 * plain aliases for CopyTo / CopyFrom.
 */
# include "fujitsu-vpp.h"
443
# define armci_copy(src,dst,n) _MmCopy((char*)(dst), (char*)(src), (n))
445
# define armci_put CopyTo
446
# define armci_get CopyFrom
451
extern lapi_handle_t lapi_handle;
453
# define armci_put(src,dst,n,proc)\
455
armci_copy(src,dst,n);\
457
if(LAPI_Put(lapi_handle, (uint)proc, (uint)n, (dst), (src),\
458
NULL,&(ack_cntr[ARMCI_THREAD_IDX].cntr),&cmpl_arr[proc].cntr))\
459
ARMCI_Error("LAPI_put failed",0); else;}
461
/**** this copy is nonblocking and requires fence to complete!!! ****/
462
# define armci_get(src,dst,n,proc) \
464
armci_copy(src,dst,n);\
466
if(LAPI_Get(lapi_handle, (uint)proc, (uint)n, (src), (dst), \
467
NULL, &(get_cntr[ARMCI_THREAD_IDX].cntr)))\
468
ARMCI_Error("LAPI_Get failed",0);else;}
470
/*
 * ARMCI_NB_PUT (LAPI): resets the counter (cmplt)->cntr to 0 with
 * LAPI_Setcntr, then issues LAPI_Put with that counter and
 * cmpl_arr[proc].cntr. A non-zero return from either call is reported through
 * ARMCI_Error.
 * NOTE(review): proc and n are cast as `(uint)proc` / `(uint)n` without
 * parentheses around the argument -- pass simple expressions only.
 */
# define ARMCI_NB_PUT(src,dst,n,proc,cmplt)\
471
{if(LAPI_Setcntr(lapi_handle, &((cmplt)->cntr), 0))\
472
ARMCI_Error("LAPI_Setcntr in NB_PUT failed",0);\
474
if(LAPI_Put(lapi_handle, (uint)proc, (uint)n, (dst), (src),\
475
NULL, &((cmplt)->cntr), &cmpl_arr[proc].cntr))\
476
ARMCI_Error("LAPI_put failed",0); else;}
478
/*
 * ARMCI_NB_GET (LAPI): resets (cmplt)->cntr to 0 with LAPI_Setcntr, then
 * issues LAPI_Get with that counter. A non-zero return from either call is
 * reported through ARMCI_Error.
 */
# define ARMCI_NB_GET(src,dst,n,proc,cmplt)\
479
{if(LAPI_Setcntr(lapi_handle, &((cmplt)->cntr), 0))\
480
ARMCI_Error("LAPI_Setcntr in NB_GET failed",0);\
482
if(LAPI_Get(lapi_handle, (uint)proc, (uint)n, (src), (dst), \
483
NULL, &((cmplt)->cntr)))\
484
ARMCI_Error("LAPI_Get NB_GET failed",0);else;}
486
/* ARMCI_NB_WAIT / ARMCI_NB_TEST forward to CLEAR_COUNTER / TEST_COUNTER on the completion counter. */
# define ARMCI_NB_WAIT(cmplt) CLEAR_COUNTER((cmplt))
487
# define ARMCI_NB_TEST(cmplt,_succ) TEST_COUNTER((cmplt),(_succ))
489
#elif defined(PORTALS)
490
/*
 * armci_put(src,dst,n,proc), PORTALS: when proc lies inside
 * armci_clus_first..armci_clus_last the transfer is done with armci_copy();
 * otherwise it is handed to armci_portals_put() with a NULL completion
 * argument and tag 0.
 * Every macro argument is parenthesized so that expression arguments such as
 * `p & 7` are evaluated as a unit (the range test previously used a bare
 * `proc>=`, which mis-parsed such arguments).
 * NOTE: expands to a bare if/else statement; brace the call site when it is
 * the body of another if.
 */
# define armci_put(src,dst,n,proc) \
    if(((proc)<=armci_clus_last) && ((proc)>=armci_clus_first)){\
        armci_copy((src),(dst),(n));\
    } else { armci_portals_put((proc),(src), (dst),(n),NULL,0);}
495
/*
 * armci_get(src,dst,n,proc), PORTALS: when proc lies inside
 * armci_clus_first..armci_clus_last the transfer is done with armci_copy();
 * otherwise it is handed to armci_portals_get() with a NULL completion
 * argument and tag 0.
 * Every macro argument is parenthesized so that expression arguments such as
 * `p & 7` are evaluated as a unit (the range test previously used a bare
 * `proc>=`, which mis-parsed such arguments).
 * NOTE: expands to a bare if/else statement; brace the call site when it is
 * the body of another if.
 */
# define armci_get(src,dst,n,proc)\
    if(((proc)<=armci_clus_last) && ((proc)>=armci_clus_first)){\
        armci_copy((src),(dst),(n));\
    } else { armci_portals_get((proc),(src), (dst),(n),NULL,0);}
500
/*
 * ARMCI_NB_PUT / ARMCI_NB_GET (PORTALS): store a fresh tag from
 * GET_NEXT_NBTAG() in nb_handle->tag, then call armci_portals_put /
 * armci_portals_get with cmplt and that tag.
 * NOTE(review): both macros rely on a variable named nb_handle being in scope
 * at the call site, and both expand to two separate statements -- under an
 * unbraced `if` only the tag assignment would be conditional.
 */
# define ARMCI_NB_PUT(src,dst,n,proc,cmplt)\
501
nb_handle->tag=GET_NEXT_NBTAG();armci_portals_put((proc),(src),\
502
(dst),(n),cmplt,nb_handle->tag)
503
# define ARMCI_NB_GET(src,dst,n,proc,cmplt)\
504
nb_handle->tag=GET_NEXT_NBTAG();armci_portals_get((proc),(src),\
505
(dst),(n),cmplt,nb_handle->tag)
508
/* armci_get / armci_put forward all four arguments unchanged to PARMCI_Get / PARMCI_Put. */
#define armci_get(src, dst, n, p) PARMCI_Get(src, dst, n, p)
509
#define armci_put(src, dst, n, p) PARMCI_Put(src, dst, n, p)
513
/* Local-only fallback: armci_get / armci_put reduce to armci_copy; the process argument p is ignored. */
# define armci_get(src,dst,n,p) armci_copy((src),(dst),(n))
514
# define armci_put(src,dst,n,p) armci_copy((src),(dst),(n))
521
/* Default: when no platform section defined armci_copy_fence, make it an alias for armci_copy. */
#ifndef armci_copy_fence
522
# define armci_copy_fence armci_copy