5
/* $Id: ghosts.c,v 1.47.4.2 2007-05-02 16:23:39 d3g293 Exp $ */
9
* description: implements GA collective communication operations to
10
* update ghost cell regions.
14
* This material was prepared as an account of work sponsored by an
15
* agency of the United States Government. Neither the United States
16
* Government nor the United States Department of Energy, nor Battelle,
17
* nor any of their employees, MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
18
* ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE ACCURACY,
19
* COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, APPARATUS, PRODUCT,
20
* SOFTWARE, OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT
21
* INFRINGE PRIVATELY OWNED RIGHTS.
26
* This software and its documentation were produced with United States
27
* Government support under Contract Number DE-AC06-76RLO-1830 awarded by
28
* the United States Department of Energy. The United States Government
29
* retains a paid-up non-exclusive, irrevocable worldwide license to
30
* reproduce, prepare derivative works, perform publicly and display
31
* publicly by or for the US Government, including the right to
32
* distribute to other US Government contractors.
36
/*#define PERMUTE_PIDS */
62
/* from armcip.h, but armcip.h is private so we should not include it */
63
extern void armci_write_strided(void *ptr, int stride_levels, int stride_arr[], int count[], char *buf);
64
extern void armci_read_strided(void *ptr, int stride_levels, int stride_arr[], int count[], char *buf);
67
#define INVALID_MA_HANDLE -1
68
#define NEAR_INT(x) (x)< 0.0 ? ceil( (x) - 0.5) : floor((x) + 0.5)
70
#if !defined(CRAY_YMP)
71
#define BYTE_ADDRESSABLE_MEMORY
74
/*uncomment line below to verify consistency of MA in every sync */
75
/*#define CHECK_MA yes */
77
/***************************************************************************/
79
/*\ Return a pointer to the location indicated by subscript and an array
80
* of leading dimensions (ld). Assume that subscript refers to a set of local
81
* coordinates relative to the origin of the array and account for the
82
* presence of ghost cells.
84
#define gam_LocationWithGhosts(proc, handle, subscript, ptr_loc, ld) \
86
Integer _d, _factor = 1, _last=GA[handle].ndim - 1, _offset=0; \
87
Integer _lo[MAXDIM], _hi[MAXDIM]; \
88
ga_ownsM(handle, proc, _lo, _hi); \
89
if (_last == 0) ld[0] = _hi[0] - _lo[0] + 1 + 2*(Integer)GA[handle].width[0];\
90
for (_d = 0; _d < _last; _d++) { \
91
_offset += subscript[_d] * _factor; \
92
ld[_d] = _hi[_d] - _lo[_d] + 1 + 2*(Integer)GA[handle].width[_d]; \
95
_offset += subscript[_last] * _factor; \
96
*(ptr_loc) = GA[handle].ptr[proc] + _offset*GA[handle].elemsize; \
99
#if HAVE_SYS_WEAK_ALIAS_PRAGMA
100
# pragma weak wnga_access_ghost_ptr = pnga_access_ghost_ptr
102
void pnga_access_ghost_ptr(Integer g_a, Integer dims[],
103
void* ptr, Integer ld[])
107
Integer handle = GA_OFFSET + g_a;
108
Integer i, lo[MAXDIM], hi[MAXDIM];
109
Integer ndim = GA[handle].ndim;
110
Integer me = pnga_nodeid();
112
GA_PUSH_NAME("pnga_access_ghost_ptr");
114
pnga_distribution(g_a, me, lo, hi);
116
for (i=0; i < ndim; i++) {
120
gam_LocationWithGhosts(me, handle, dims, &lptr, ld);
122
for (i=0; i < ndim; i++)
123
dims[i] = hi[i] - lo[i] + 1 + 2*(Integer)GA[handle].width[i];
127
/*\ PROVIDE INDEX TO LOCALLY HELD DATA, ACCOUNTING FOR
128
* PRESENCE OF GHOST CELLS
130
#if HAVE_SYS_WEAK_ALIAS_PRAGMA
131
# pragma weak wnga_access_ghost_element = pnga_access_ghost_element
133
void pnga_access_ghost_element(Integer g_a, AccessIndex* index,
134
Integer subscript[], Integer ld[])
137
Integer handle = GA_OFFSET + g_a;
139
Integer tmp_sub[MAXDIM];
140
unsigned long elemsize=0;
141
unsigned long lref=0, lptr=0;
142
Integer me = pnga_nodeid();
143
GA_PUSH_NAME("nga_access_ghost_element");
144
/* Indices conform to Fortran convention. Shift them down 1 so that
145
gam_LocationWithGhosts works. */
146
for (i=0; i<GA[handle].ndim; i++) tmp_sub[i] = subscript[i] - 1;
147
gam_LocationWithGhosts(me, handle, tmp_sub, &ptr, ld);
149
* return patch address as the distance elements from the reference address
151
* .in Fortran we need only the index to the type array: dbl_mb or int_mb
152
 * that are elements of COMMON in the mafdecls.h include file
153
* .in C we need both the index and the pointer
156
elemsize = (unsigned long)GA[handle].elemsize;
158
/* compute index and check if it is correct */
159
switch (pnga_type_c2f(GA[handle].type)){
161
*index = (AccessIndex) ((DoublePrecision*)ptr - DBL_MB);
162
lref = (unsigned long)DBL_MB;
166
*index = (AccessIndex) ((DoubleComplex*)ptr - DCPL_MB);
167
lref = (unsigned long)DCPL_MB;
171
*index = (AccessIndex) ((SingleComplex*)ptr - SCPL_MB);
172
lref = (unsigned long)SCPL_MB;
176
*index = (AccessIndex) ((Integer*)ptr - INT_MB);
177
lref = (unsigned long)INT_MB;
181
*index = (AccessIndex) ((float*)ptr - FLT_MB);
182
lref = (unsigned long)FLT_MB;
186
#ifdef BYTE_ADDRESSABLE_MEMORY
187
  /* check the alignment */
188
lptr = (unsigned long)ptr;
189
if( lptr%elemsize != lref%elemsize ){
190
printf("%d: lptr=%lu(%lu) lref=%lu(%lu)\n",(int)GAme,lptr,lptr%elemsize,
192
pnga_error("nga_access: MA addressing problem: base address misallignment",
197
/* adjust index for Fortran addressing */
204
/*\ PROVIDE POINTER TO LOCALLY HELD DATA, ACCOUNTING FOR
205
* PRESENCE OF GHOST CELLS
207
#if HAVE_SYS_WEAK_ALIAS_PRAGMA
208
# pragma weak wnga_access_ghost_element_ptr = pnga_access_ghost_element_ptr
210
void pnga_access_ghost_element_ptr(Integer g_a, void *ptr,
211
Integer subscript[], Integer ld[])
214
Integer handle = GA_OFFSET + g_a;
216
Integer tmp_sub[MAXDIM];
217
Integer me = pnga_nodeid();
218
GA_PUSH_NAME("nga_access_ghost_element_ptr");
219
/* Indices conform to Fortran convention. Shift them down 1 so that
220
gam_LocationWithGhosts works. */
221
for (i=0; i<GA[handle].ndim; i++) tmp_sub[i] = subscript[i] - 1;
222
gam_LocationWithGhosts(me, handle, tmp_sub, &lptr, ld);
228
/*\ PROVIDE ACCESS TO LOCAL PATCH OF A GLOBAL ARRAY WITH GHOST CELLS
230
#if HAVE_SYS_WEAK_ALIAS_PRAGMA
231
# pragma weak wnga_access_ghosts = pnga_access_ghosts
233
void pnga_access_ghosts(Integer g_a, Integer dims[],
234
AccessIndex* index, Integer ld[])
237
Integer handle = GA_OFFSET + g_a;
238
unsigned long elemsize=0;
239
unsigned long lref=0, lptr=0;
241
GA_PUSH_NAME("nga_access_ghosts");
242
pnga_access_ghost_ptr(g_a, dims, &ptr, ld);
245
* return patch address as the distance elements from the reference address
247
* .in Fortran we need only the index to the type array: dbl_mb or int_mb
248
 * that are elements of COMMON in the mafdecls.h include file
249
* .in C we need both the index and the pointer
252
elemsize = (unsigned long)GA[handle].elemsize;
254
/* compute index and check if it is correct */
255
switch (pnga_type_c2f(GA[handle].type)){
257
*index = (AccessIndex) ((DoublePrecision*)ptr - DBL_MB);
258
lref = (unsigned long)DBL_MB;
262
*index = (AccessIndex) ((DoubleComplex*)ptr - DCPL_MB);
263
lref = (unsigned long)DCPL_MB;
267
*index = (AccessIndex) ((SingleComplex*)ptr - SCPL_MB);
268
lref = (unsigned long)SCPL_MB;
272
*index = (AccessIndex) ((Integer*)ptr - INT_MB);
273
lref = (unsigned long)INT_MB;
277
*index = (AccessIndex) ((float*)ptr - FLT_MB);
278
lref = (unsigned long)FLT_MB;
283
#ifdef BYTE_ADDRESSABLE_MEMORY
284
  /* check the alignment */
285
lptr = (unsigned long)ptr;
286
if( lptr%elemsize != lref%elemsize ){
287
printf("%d: lptr=%lu(%lu) lref=%lu(%lu)\n",(int)GAme,lptr,lptr%elemsize,
289
pnga_error("nga_access: MA addressing problem: base address misallignment",
294
/* adjust index for Fortran addressing */
301
/*\ RELEASE ACCESS TO A GHOST ELEMENT
303
#if HAVE_SYS_WEAK_ALIAS_PRAGMA
304
# pragma weak wnga_release_ghost_element = pnga_release_ghost_element
306
void pnga_release_ghost_element(Integer g_a, Integer subscript[])
310
/*\ RELEASE ACCESS & UPDATE A GHOST ELEMENT
312
#if HAVE_SYS_WEAK_ALIAS_PRAGMA
313
# pragma weak wnga_release_update_ghost_element = pnga_release_update_ghost_element
315
void pnga_release_update_ghost_element(Integer g_a, Integer subscript[])
319
/*\ RELEASE ACCESS TO A GHOST BLOCK
321
#if HAVE_SYS_WEAK_ALIAS_PRAGMA
322
# pragma weak wnga_release_ghosts = pnga_release_ghosts
324
void pnga_release_ghosts(Integer g_a)
328
/*\ RELEASE ACCESS & UPDATE A GHOST BLOCK
330
#if HAVE_SYS_WEAK_ALIAS_PRAGMA
331
# pragma weak wnga_release_update_ghosts = pnga_release_update_ghosts
333
void pnga_release_update_ghosts(Integer g_a)
337
/*\ GET DATA FROM LOCAL BLOCK
339
#if HAVE_SYS_WEAK_ALIAS_PRAGMA
340
# pragma weak wnga_get_ghost_block = pnga_get_ghost_block
342
void pnga_get_ghost_block(Integer g_a,
348
/* g_a: Global array handle
349
lo[]: Array of lower indices of patch of global array
350
hi[]: Array of upper indices of patch of global array
351
buf[]: Local buffer that array patch will be copied into
352
ld[]: Array of physical ndim-1 dimensions of local buffer */
353
Integer handle=GA_OFFSET + g_a, ndim;
354
Integer i, glo[MAXDIM], ghi[MAXDIM], ichk, me, grp_id;
356
int stride_rem[MAXDIM], stride_loc[MAXDIM], count[MAXDIM];
357
Integer ldrem[MAXDIM];
358
Integer offset, factor, size;
362
grp_id = (Integer)GA[handle].p_handle;
363
if (grp_id>0) me = PGRP_LIST[grp_id].map_proc_list[me];
364
ndim = GA[handle].ndim;
366
/* Figure out whether or not lo and hi can be accessed completely
368
pnga_distribution(g_a, me, glo, ghi);
370
for (i=0; i<ndim; i++) {
371
if (lo[i] < glo[i]-(Integer)GA[handle].width[i]) ichk = 0;
372
if (hi[i] > ghi[i]+(Integer)GA[handle].width[i]) ichk = 0;
373
llo[i] = glo[i] - (Integer)GA[handle].width[i];
374
if (i<ndim-1) ldrem[i] = ghi[i] - glo[i] + 1
375
+ 2*(Integer)GA[handle].width[i];
378
/* Get data. Use local copy if possible, otherwise use a periodic get */
382
size = GA[handle].elemsize;
383
for (i=0; i<ndim-1; i++) {
384
offset += (lo[i]-llo[i])*factor;
385
factor *= ghi[i] - glo[i] + 1 + 2*(Integer)GA[handle].width[i];
387
offset += (lo[ndim-1]-llo[ndim-1])*factor;
388
ptr = GA[handle].ptr[me] + size*offset;
389
/* compute number of elements in each dimension and store result in count */
390
gam_ComputeCount(ndim, lo, hi, count);
392
/* scale first element in count by element size. The ARMCI_GetS
393
routine uses this convention to figure out memory sizes.*/
396
/* Return strides for memory containing global array on remote
397
processor indexed by proc (stride_rem) and for local buffer
399
gam_setstride(ndim, size, ld, ldrem, stride_rem, stride_loc);
400
ARMCI_GetS(ptr,stride_rem,buf,stride_loc,count,ndim-1,me);
402
pnga_periodic(g_a,lo,hi,buf,ld,NULL,PERIODIC_GET);
406
/*\ UPDATE GHOST CELLS OF GLOBAL ARRAY USING SHIFT ALGORITHM
408
#if HAVE_SYS_WEAK_ALIAS_PRAGMA
409
# pragma weak wnga_update1_ghosts = pnga_update1_ghosts
411
void pnga_update1_ghosts(Integer g_a)
413
Integer idx, ipx, inx, i, np, handle=GA_OFFSET + g_a, proc_rem;
414
Integer size, ndim, nwidth, offset, slice, increment[MAXDIM];
415
Integer width[MAXDIM];
416
Integer dims[MAXDIM], imax=0;
417
Integer lo_loc[MAXDIM], hi_loc[MAXDIM];
418
Integer plo_loc[MAXDIM]/*, phi_loc[MAXDIM]*/;
419
Integer lo_rem[MAXDIM], hi_rem[MAXDIM];
420
Integer tlo_rem[MAXDIM], thi_rem[MAXDIM];
421
Integer slo_rem[MAXDIM], shi_rem[MAXDIM];
422
Integer plo_rem[MAXDIM], phi_rem[MAXDIM];
423
Integer ld_loc[MAXDIM], ld_rem[MAXDIM];
425
int stride_loc[MAXDIM], stride_rem[MAXDIM], count[MAXDIM];
426
char *ptr_loc, *ptr_rem;
427
logical hasData = TRUE;
428
Integer me = pnga_nodeid();
431
/* This routine makes use of the shift algorithm to update data in the
432
* ghost cells bounding the local block of visible data. The shift
433
* algorithm starts by updating the blocks of data along the first
434
* dimension by grabbing a block of data that is width[0] deep but
435
* otherwise matches the dimensions of the data residing on the
436
* calling processor. The update of the second dimension, however,
437
* grabs a block that is width[1] deep in the second dimension but is
438
* ldim0 + 2*width[0] in the first dimensions where ldim0 is the
439
* size of the visible data along the first dimension. The remaining
440
* dimensions are left the same. For the next update, the width of the
441
* second dimension is also increased by 2*width[1] and so on. This
442
   * algorithm makes use of the fact that data for the dimensions that
443
* have already been updated is available on each processor and can be
444
* used in the updates of subsequent dimensions. The total number of
445
* separate updates is 2*ndim, an update in the negative and positive
446
* directions for each dimension.
448
* To perform the update, this routine makes use of several copies of
449
* indices marking the upper and lower limits of data. Indices
450
* beginning with the character "p" are relative indices marking the
451
* location of the data set relative to the origin the local patch of
452
* the global array, all other indices are in absolute coordinates and
453
* mark locations in the total global array. The indices used by this
454
* routine are described below.
456
* lo_loc[], hi_loc[]: The lower and upper indices of the visible
457
* block of data held by the calling processor.
459
* lo_rem[], hi_rem[]: The lower and upper indices of the block
460
* of data on a remote processor or processors that is needed to
461
* fill in the calling processors ghost cells. These indices are
462
* NOT corrected for wrap-around (periodic) boundary conditions
463
* so they can be negative or greater than the array dimension
464
* values held in dims[].
466
* slo_rem[], shi_rem[]: Similar to lo_rem[] and hi_rem[], except
467
* that these indices have been corrected for wrap-around
468
* boundary conditions. If lo_rem[] and hi_rem[] cross a global
469
* array boundary, as opposed to being entirely located on one
470
* side or the other of the array, then two sets of slo_rem[] and
471
* shi_rem[] will be created. One set will correspond to the
472
* block of data on one side of the global array boundary and the
473
* other set will correspond to the remaining block. This
474
* situation will only occur if the value of the ghost cell width
475
* is greater than the dimension of the visible global array
476
* data on a single processor.
478
   *       tlo_rem[], thi_rem[]: The lower and upper indices of the visible
479
* data on a remote processor.
481
* plo_loc[], phi_loc[]: The indices of the local data patch that
482
* is going to be updated.
484
* plo_rem[], phi_rem[]: The indices of the data patch on the
485
* remote processor that will be used to update the data on the
486
* calling processor. Note that the dimensions of the patches
487
   *       represented by plo_rem[], phi_rem[] and plo_loc[], phi_loc[]
490
* For the case where the width of the ghost cells is more than the
491
* width of the visible data held on a processor, special problems
492
* arise. It now takes several updates to fill in one block of boundary
493
* data and it is now necessary to keep track of where each of these
494
* blocks of data go in the ghost cell region. To do this two extra
495
* variables are needed. These are offset and slice. Slice is equal to
496
* the width of the visible data along the dimension being updated
497
* minus one coming from the remote processor. Offset is the amount
498
* that this data must be moved inward from the lower boundary of the
499
* ghost cell region. Another variable that is also used to handle
500
* this case is imax. If this variable is set to 2, then this means
501
* that the block of data that is needed to update the ghost cells
502
* crosses a global array boundary and the block needs to be broken
503
* up into two pieces. */
505
/* if global array has no ghost cells, just return */
506
if (!pnga_has_ghosts(g_a)) return;
508
GA_PUSH_NAME("ga_update1_ghosts");
510
size = GA[handle].elemsize;
511
ndim = GA[handle].ndim;
512
corner_flag = GA[handle].corner_flag;
513
p_handle = GA[handle].p_handle;
515
/* Get pointer to local memory */
516
ptr_loc = GA[handle].ptr[me];
517
/* obtain range of data that is held by local processor */
518
pnga_distribution(g_a,me,lo_loc,hi_loc);
519
/* initialize range increments and get array dimensions */
520
for (idx=0; idx < ndim; idx++) {
522
width[idx] = (Integer)GA[handle].width[idx];
523
dims[idx] = (Integer)GA[handle].dims[idx];
524
if (lo_loc[idx] == 0 && hi_loc[idx] == -1) hasData = FALSE;
527
/* loop over dimensions for sequential update using shift algorithm */
528
for (idx=0; idx < ndim; idx++) {
529
nwidth = (Integer)width[idx];
531
/* Do not bother with update if nwidth is zero or processor has
533
if (nwidth != 0 && hasData) {
535
/* Perform update in negative direction. Start by getting rough
536
estimate of block of needed data*/
537
for (i = 0; i < ndim; i++) {
539
lo_rem[i] = lo_loc[i] - nwidth;
540
hi_rem[i] = lo_loc[i] - 1;
541
/* Check to see if we will need to update ghost cells using
542
one or two major patches of the global array. */
553
lo_rem[i] = lo_loc[i];
554
hi_rem[i] = hi_loc[i];
558
for (inx = 0; inx < imax; inx++) {
559
/* Check to see if boundary is being updated in one patch or two,
560
adjust lower boundary accordingly. */
561
for (i=0; i<ndim; i++) {
562
if (imax == 2 && i == idx) {
565
shi_rem[i] = hi_rem[i];
567
slo_rem[i] = lo_rem[i] + dims[i];
568
shi_rem[i] = dims[i];
570
} else if (i == idx) {
572
slo_rem[i] = dims[i] - nwidth + 1;
573
shi_rem[i] = dims[i];
575
slo_rem[i] = lo_rem[i];
576
shi_rem[i] = hi_rem[i];
579
slo_rem[i] = lo_rem[i];
580
shi_rem[i] = hi_rem[i];
583
/* locate processor with this data */
584
if (!pnga_locate_region(g_a, slo_rem, shi_rem, _ga_map,
585
GA_proclist, &np)) ga_RegionError(pnga_ndim(g_a),
586
slo_rem, shi_rem, g_a);
588
for (ipx = 0; ipx < np; ipx++) {
589
/* Get actual coordinates of desired chunk of remote
590
data as well as the actual coordinates of the local chunk
591
of data that will receive the remote data (these
592
coordinates take into account the presence of ghost
593
cells). Start by finding out what data is actually held by
595
proc_rem = GA_proclist[ipx];
596
pnga_distribution(g_a, proc_rem, tlo_rem, thi_rem);
597
for (i = 0; i < ndim; i++) {
598
if (increment[i] == 0) {
600
if (np == 1 && imax == 1) {
601
plo_rem[i] = thi_rem[i] - tlo_rem[i] + 1;
602
phi_rem[i] = thi_rem[i] - tlo_rem[i] + width[i];
604
/*phi_loc[i] = width[i] - 1;*/
606
if (tlo_rem[i] >= slo_rem[i]) {
607
offset = tlo_rem[i] - lo_rem[i];
608
slice = thi_rem[i] - tlo_rem[i];
611
slice = thi_rem[i] - slo_rem[i];
613
if (offset < 0) offset = offset + dims[i];
614
if (offset >= dims[i]) offset = offset - dims[i];
615
plo_rem[i] = thi_rem[i] - tlo_rem[i] + width[i] - slice;
616
phi_rem[i] = thi_rem[i] - tlo_rem[i] + width[i];
618
/*phi_loc[i] = offset + slice;*/
621
plo_rem[i] = width[i];
622
phi_rem[i] = thi_rem[i] - tlo_rem[i] + width[i];
623
plo_loc[i] = width[i];
624
/*phi_loc[i] = hi_loc[i] - lo_loc[i] + width[i];*/
628
phi_rem[i] = thi_rem[i] - tlo_rem[i] + increment[i];
630
/*phi_loc[i] = hi_loc[i] - lo_loc[i] + increment[i];*/
634
/* Get pointer to local data buffer and remote data
635
             buffer as well as lists of leading dimensions */
636
gam_LocationWithGhosts(me, handle, plo_loc, &ptr_loc, ld_loc);
637
gam_LocationWithGhosts(proc_rem, handle, plo_rem, &ptr_rem, ld_rem);
639
/* Evaluate strides on local and remote processors */
640
gam_setstride(ndim, size, ld_loc, ld_rem, stride_rem,
643
/* Compute the number of elements in each dimension and store
644
result in count. Scale the first element in count by the
646
gam_ComputeCount(ndim, plo_rem, phi_rem, count);
649
/* get remote data */
651
proc_rem = PGRP_LIST[p_handle].inv_map_proc_list[proc_rem];
653
ARMCI_GetS(ptr_rem, stride_rem, ptr_loc, stride_loc, count,
654
(int)(ndim - 1), (int)proc_rem);
658
/* Perform update in positive direction. Start by getting rough
659
estimate of block of needed data*/
660
for (i = 0; i < ndim; i++) {
662
lo_rem[i] = hi_loc[i] + 1;
663
hi_rem[i] = hi_loc[i] + nwidth;
664
/* Check to see if we will need to update ghost cells using
665
one or two major patches of the global array. */
666
if (hi_rem[i] > dims[i]) {
667
if (lo_rem[i] <= dims[i]) {
676
lo_rem[i] = lo_loc[i];
677
hi_rem[i] = hi_loc[i];
681
for (inx = 0; inx < imax; inx++) {
682
/* Check to see if boundary is being updated in one patch or two,
683
adjust lower boundary accordingly. */
684
for (i=0; i<ndim; i++) {
685
if (imax == 2 && i == idx) {
687
slo_rem[i] = lo_rem[i];
688
shi_rem[i] = dims[i];
691
shi_rem[i] = hi_rem[i] - dims[i];
693
} else if (i == idx) {
694
if (hi_rem[i] > dims[i]) {
698
slo_rem[i] = lo_rem[i];
699
shi_rem[i] = hi_rem[i];
702
slo_rem[i] = lo_rem[i];
703
shi_rem[i] = hi_rem[i];
706
/* locate processor with this data */
707
if (!pnga_locate_region(g_a, slo_rem, shi_rem, _ga_map,
708
GA_proclist, &np)) ga_RegionError(pnga_ndim(g_a),
709
slo_rem, shi_rem, g_a);
711
for (ipx = 0; ipx < np; ipx++) {
712
/* Get actual coordinates of desired chunk of remote
713
data as well as the actual coordinates of the local chunk
714
of data that will receive the remote data (these
715
coordinates take into account the presence of ghost
716
cells). Start by finding out what data is actually held by
718
proc_rem = GA_proclist[ipx];
719
pnga_distribution(g_a, proc_rem, tlo_rem, thi_rem);
720
for (i = 0; i < ndim; i++) {
721
if (increment[i] == 0) {
723
if (np == 1 && imax == 1) {
724
plo_rem[i] = width[i];
725
phi_rem[i] = 2*width[i] - 1;
726
plo_loc[i] = hi_loc[i] - lo_loc[i] + 1 + width[i];
727
/*phi_loc[i] = hi_loc[i] - lo_loc[i] + 2*width[i];*/
729
offset = tlo_rem[i] - hi_loc[i] - 1;
730
if (thi_rem[i] <= shi_rem[i]) {
731
slice = thi_rem[i] - tlo_rem[i];
733
slice = shi_rem[i] - tlo_rem[i];
735
if (offset < 0) offset = offset + dims[i];
736
if (offset >= dims[i]) offset = offset - dims[i];
737
plo_rem[i] = width[i];
738
phi_rem[i] = width[i] + slice;
739
plo_loc[i] = hi_loc[i] - lo_loc[i] + width[i] + 1 + offset;
740
/*phi_loc[i] = hi_loc[i] - lo_loc[i] + width[i] + 1
744
plo_rem[i] = width[i];
745
phi_rem[i] = thi_rem[i] - tlo_rem[i] + width[i];
746
plo_loc[i] = width[i];
747
/*phi_loc[i] = hi_loc[i] - lo_loc[i] + width[i];*/
751
phi_rem[i] = thi_rem[i] - tlo_rem[i] + increment[i];
753
/*phi_loc[i] = hi_loc[i] - lo_loc[i] + increment[i];*/
757
/* Get pointer to local data buffer and remote data
758
             buffer as well as lists of leading dimensions */
759
gam_LocationWithGhosts(me, handle, plo_loc, &ptr_loc, ld_loc);
760
gam_LocationWithGhosts(proc_rem, handle, plo_rem, &ptr_rem, ld_rem);
762
/* Evaluate strides on local and remote processors */
763
gam_setstride(ndim, size, ld_loc, ld_rem, stride_rem,
766
/* Compute the number of elements in each dimension and store
767
result in count. Scale the first element in count by the
769
gam_ComputeCount(ndim, plo_rem, phi_rem, count);
772
/* get remote data */
774
proc_rem = PGRP_LIST[p_handle].inv_map_proc_list[proc_rem];
776
ARMCI_GetS(ptr_rem, stride_rem, ptr_loc, stride_loc, count,
777
(int)(ndim - 1), (int)proc_rem);
781
/* synchronize all processors and update increment array */
782
if (idx < ndim-1) pnga_sync();
784
increment[idx] = 2*nwidth;
790
/*\ UTILITY FUNCTION TO MAKE SURE GHOST CELLS WIDTHS ARE
791
* LESS THAN VISIBLE DATA WIDTHS
793
static logical gai_check_ghost_distr(Integer g_a)
795
Integer handle=GA_OFFSET + g_a;
796
Integer idx, ndim, np, ipx;
797
ndim = GA[handle].ndim;
799
for (idx = 0; idx < ndim; idx++) {
800
for (np = 0; np < GA[handle].nblock[idx]; np++) {
801
if (np < GA[handle].nblock[idx] - 1) {
802
if (GA[handle].mapc[ipx+1]-GA[handle].mapc[ipx]+1
803
<GA[handle].width[idx]) {
807
if (GA[handle].dims[idx]-GA[handle].mapc[ipx]+1
808
<GA[handle].width[idx]) {
818
/*\ UPDATE GHOST CELLS OF GLOBAL ARRAY USING PUT CALLS
820
#if HAVE_SYS_WEAK_ALIAS_PRAGMA
821
# pragma weak wnga_update2_ghosts = pnga_update2_ghosts
823
logical pnga_update2_ghosts(Integer g_a)
825
Integer idx, ipx, np, handle=GA_OFFSET + g_a, proc_rem;
826
Integer ntot, mask[MAXDIM];
827
Integer size, ndim, i, itmp;
828
Integer width[MAXDIM], dims[MAXDIM];
829
Integer lo_loc[MAXDIM], hi_loc[MAXDIM];
830
/*Integer tlo_loc[MAXDIM], thi_loc[MAXDIM];*/
831
Integer plo_loc[MAXDIM], phi_loc[MAXDIM];
832
Integer tlo_rem[MAXDIM], thi_rem[MAXDIM];
833
Integer plo_rem[MAXDIM];
834
Integer ld_loc[MAXDIM], ld_rem[MAXDIM];
836
int stride_loc[MAXDIM], stride_rem[MAXDIM],count[MAXDIM];
837
char *ptr_loc, *ptr_rem;
838
Integer me = pnga_nodeid();
841
/* if global array has no ghost cells, just return */
842
if (!pnga_has_ghosts(g_a)) {
846
size = GA[handle].elemsize;
847
ndim = GA[handle].ndim;
848
p_handle = GA[handle].p_handle;
849
/* initialize ghost cell widths and get array dimensions */
850
for (idx=0; idx < ndim; idx++) {
851
width[idx] = (Integer)GA[handle].width[idx];
852
dims[idx] = (Integer)GA[handle].dims[idx];
855
/* Check to make sure that global array is well-behaved (all processors
856
have data and the width of the data in each dimension is greater than
857
the corresponding value in width[]). */
858
if (!gai_check_ghost_distr(g_a)) return FALSE;
860
GA_PUSH_NAME("ga_update2_ghosts");
861
/* Get pointer to local memory */
862
ptr_loc = GA[handle].ptr[me];
863
/* obtain range of data that is held by local processor */
864
pnga_distribution(g_a,me,lo_loc,hi_loc);
866
/* evaluate total number of PUT operations that will be required */
868
for (idx=0; idx < ndim; idx++) ntot *= 3;
870
/* Loop over all PUT operations. The operation corresponding to the
871
mask of all zeros is left out. */
872
for (ipx=0; ipx < ntot; ipx++) {
873
/* Convert ipx to corresponding mask values */
876
for (idx = 0; idx < ndim; idx++) {
879
if (mask[idx] != 0) mask0 = FALSE;
884
/* check to see if ghost cell block has zero elements*/
887
for (idx = 0; idx < ndim; idx++) {
888
if (mask[idx] != 0 && width[idx] == 0) mask0 = TRUE;
889
if (mask[idx] != 0) itmp++;
891
/*if (itmp>1) mask0 = TRUE; */
893
/* Now that mask has been determined, find data that is to be moved
894
* and identify processor to which it is going. Wrap boundaries
895
* around, if necessary */
896
for (idx = 0; idx < ndim; idx++) {
897
if (mask[idx] == 0) {
898
/*tlo_loc[idx] = lo_loc[idx];*/
899
/*thi_loc[idx] = hi_loc[idx];*/
900
tlo_rem[idx] = lo_loc[idx];
901
thi_rem[idx] = hi_loc[idx];
902
} else if (mask[idx] == -1) {
903
/*tlo_loc[idx] = lo_loc[idx];*/
904
/*thi_loc[idx] = lo_loc[idx]+width[idx]-1;*/
905
if (lo_loc[idx] > 1) {
906
tlo_rem[idx] = lo_loc[idx]-width[idx];
907
thi_rem[idx] = lo_loc[idx]-1;
909
tlo_rem[idx] = dims[idx]-width[idx]+1;
910
thi_rem[idx] = dims[idx];
912
} else if (mask[idx] == 1) {
913
/*tlo_loc[idx] = hi_loc[idx]-width[idx]+1;*/
914
/*thi_loc[idx] = hi_loc[idx];*/
915
if (hi_loc[idx] < dims[idx]) {
916
tlo_rem[idx] = hi_loc[idx] + 1;
917
thi_rem[idx] = hi_loc[idx] + width[idx];
920
thi_rem[idx] = width[idx];
923
fprintf(stderr,"Illegal mask value found\n");
926
/* Locate remote processor to which data must be sent */
927
if (!pnga_locate_region(g_a, tlo_rem, thi_rem, _ga_map,
928
GA_proclist, &np)) ga_RegionError(pnga_ndim(g_a),
929
tlo_rem, thi_rem, g_a);
931
fprintf(stderr,"More than one remote processor found\n");
933
/* Remote processor has been identified, now get ready to send
934
data to it. Start by getting distribution on remote
936
proc_rem = GA_proclist[0];
937
pnga_distribution(g_a, proc_rem, tlo_rem, thi_rem);
938
for (idx = 0; idx < ndim; idx++) {
939
if (mask[idx] == 0) {
940
plo_loc[idx] = width[idx];
941
phi_loc[idx] = hi_loc[idx]-lo_loc[idx]+width[idx];
942
plo_rem[idx] = plo_loc[idx];
943
} else if (mask[idx] == -1) {
944
plo_loc[idx] = width[idx];
945
phi_loc[idx] = 2*width[idx]-1;
946
plo_rem[idx] = thi_rem[idx]-tlo_rem[idx]+width[idx]+1;
947
} else if (mask[idx] == 1) {
948
plo_loc[idx] = hi_loc[idx]-lo_loc[idx]+1;
949
phi_loc[idx] = hi_loc[idx]-lo_loc[idx]+width[idx];
953
/* Get pointer to local data buffer and remote data
954
         buffer as well as lists of leading dimensions */
955
gam_LocationWithGhosts(me, handle, plo_loc, &ptr_loc, ld_loc);
956
gam_LocationWithGhosts(proc_rem, handle, plo_rem, &ptr_rem, ld_rem);
958
/* Evaluate strides on local and remote processors */
959
gam_setstride(ndim, size, ld_loc, ld_rem, stride_rem,
962
/* Compute the number of elements in each dimension and store
963
result in count. Scale the first element in count by the
965
gam_ComputeCount(ndim, plo_loc, phi_loc, count);
968
/* put data on remote processor */
969
/*ARMCI_PutS(ptr_loc, stride_loc, ptr_rem, stride_rem, count,
970
(int)(ndim - 1), (int)proc_rem);*/
972
proc_rem = PGRP_LIST[p_handle].inv_map_proc_list[proc_rem];
974
ARMCI_NbPutS(ptr_loc, stride_loc, ptr_rem, stride_rem, count,
975
(int)(ndim - 1), (int)proc_rem, NULL);
983
/*\ GET INDICES ON REMOTE BLOCK IN NEGATIVE DIRECTION FOR UPDATE
985
static void get_remote_block_neg(Integer idx, Integer ndim, Integer *lo_loc,
986
Integer *hi_loc, Integer *slo_rem, Integer *shi_rem,
987
Integer *dims, Integer *width)
989
Integer i, lo_rem[MAXDIM], hi_rem[MAXDIM];
990
/*Start by getting rough idea of where data needs to go. */
991
for (i = 0; i < ndim; i++) {
993
lo_rem[i] = lo_loc[i] - width[i];
994
hi_rem[i] = lo_loc[i] - 1;
996
lo_rem[i] = lo_loc[i];
997
hi_rem[i] = hi_loc[i];
1001
/* Account for boundaries, if necessary. */
1002
for (i=0; i<ndim; i++) {
1004
if (lo_rem[i] < 1) {
1005
slo_rem[i] = dims[i] - width[i] + 1;
1006
shi_rem[i] = dims[i];
1008
slo_rem[i] = lo_rem[i];
1009
shi_rem[i] = hi_rem[i];
1012
slo_rem[i] = lo_rem[i];
1013
shi_rem[i] = hi_rem[i];
1018
/*\ GET INDICES ON REMOTE BLOCK IN POSITIVE DIRECTION FOR UPDATE
1020
static void get_remote_block_pos(Integer idx, Integer ndim, Integer *lo_loc,
1021
Integer *hi_loc, Integer *slo_rem, Integer *shi_rem,
1022
Integer *dims, Integer *width)
1024
Integer i, lo_rem[MAXDIM], hi_rem[MAXDIM];
1025
/* Start by getting rough idea of where data needs to go. */
1026
for (i = 0; i < ndim; i++) {
1028
lo_rem[i] = hi_loc[i] + 1;
1029
hi_rem[i] = hi_loc[i] + width[i];
1031
lo_rem[i] = lo_loc[i];
1032
hi_rem[i] = hi_loc[i];
1036
/* Account for boundaries, if necessary. */
1037
for (i=0; i<ndim; i++) {
1039
if (hi_rem[i] > dims[i]) {
1041
shi_rem[i] = width[i];
1043
slo_rem[i] = lo_rem[i];
1044
shi_rem[i] = hi_rem[i];
1047
slo_rem[i] = lo_rem[i];
1048
shi_rem[i] = hi_rem[i];
1053
/*\ UPDATE GHOST CELLS OF GLOBAL ARRAY USING SHIFT ALGORITHM AND PUT CALLS
1055
#if HAVE_SYS_WEAK_ALIAS_PRAGMA
1056
# pragma weak wnga_update3_ghosts = pnga_update3_ghosts
1058
/* NOTE(review): this text is a damaged extraction -- the interleaved
 * bare-number lines are original line numbers and the jumps between them
 * show that braces, if/else headers and some statements are missing.
 * Recover the pristine file before editing logic.
 *
 * Returns TRUE on success, FALSE if this process holds no data or the
 * array is not well-behaved (per the checks visible below). */
logical pnga_update3_ghosts(Integer g_a)
1060
Integer idx, i, np, handle=GA_OFFSET + g_a, proc_rem;
1061
Integer size, ndim, nwidth, increment[MAXDIM];
1062
Integer width[MAXDIM];
1063
Integer dims[MAXDIM];
1064
Integer lo_loc[MAXDIM], hi_loc[MAXDIM];
1065
Integer plo_loc[MAXDIM]/*, phi_loc[MAXDIM]*/;
1066
Integer tlo_rem[MAXDIM], thi_rem[MAXDIM];
1067
Integer slo_rem[MAXDIM], shi_rem[MAXDIM];
1068
Integer plo_rem[MAXDIM], phi_rem[MAXDIM];
1069
Integer ld_loc[MAXDIM], ld_rem[MAXDIM];
1070
int stride_loc[MAXDIM], stride_rem[MAXDIM],count[MAXDIM];
1071
char *ptr_loc, *ptr_rem;
1072
Integer me = pnga_nodeid();
1075
/* This routine makes use of the shift algorithm to update data in the
1076
* ghost cells bounding the local block of visible data. The shift
1077
* algorithm starts by updating the blocks of data along the first
1078
* dimension by grabbing a block of data that is width[0] deep but
1079
* otherwise matches the dimensions of the data residing on the
1080
* calling processor. The update of the second dimension, however,
1081
* grabs a block that is width[1] deep in the second dimension but is
1082
* ldim0 + 2*width[0] in the first dimensions where ldim0 is the
1083
* size of the visible data along the first dimension. The remaining
1084
* dimensions are left the same. For the next update, the width of the
1085
* second dimension is also increased by 2*width[1] and so on. This
1086
* algorith makes use of the fact that data for the dimensions that
1087
* have already been updated is available on each processor and can be
1088
* used in the updates of subsequent dimensions. The total number of
1089
* separate updates is 2*ndim, an update in the negative and positive
1090
* directions for each dimension. This implementation uses simple put
1091
* operations to perform the updates along each dimension with an
1092
* intervening synchronization call being used to make sure that the
1093
* necessary data is available on each processor before starting the
1094
* update along the next dimension.
1096
* To perform the update, this routine makes use of several copies of
1097
* indices marking the upper and lower limits of data. Indices
1098
* beginning with the character "p" are relative indices marking the
1099
* location of the data set relative to the origin the local patch of
1100
* the global array, all other indices are in absolute coordinates and
1101
* mark locations in the total global array. The indices used by this
1102
* routine are described below.
1104
* lo_loc[], hi_loc[]: The lower and upper indices of the visible
1105
* block of data held by the calling processor.
1107
* lo_rem[], hi_rem[]: The lower and upper indices of the block
1108
* of data on a remote processor or processors that is needed to
1109
* fill in the calling processors ghost cells. These indices are
1110
* NOT corrected for wrap-around (periodic) boundary conditions
1111
* so they can be negative or greater than the array dimension
1112
* values held in dims[].
1114
* slo_rem[], shi_rem[]: Similar to lo_rem[] and hi_rem[], except
1115
* that these indices have been corrected for wrap-around
1116
* boundary conditions.
1118
* thi_rem[], thi_rem[]: The lower and upper indices of the visible
1119
* data on a remote processor.
1121
* plo_loc[], phi_loc[]: The indices of the local data patch that
1122
* is going to be updated.
1124
* plo_rem[], phi_rem[]: The indices of the data patch on the
1125
* remote processor that will be used to update the data on the
1126
* calling processor. Note that the dimensions of the patches
1127
* represented by plo_loc[], plo_rem[] and plo_loc[], phi_loc[]
1131
/* if global array has no ghost cells, just return */
1132
if (!pnga_has_ghosts(g_a)) return TRUE;
1134
size = GA[handle].elemsize;
1135
ndim = GA[handle].ndim;
1136
p_handle = GA[handle].p_handle;
1138
/* obtain range of data that is held by local processor */
1139
pnga_distribution(g_a,me,lo_loc,hi_loc);
1141
/* initialize range increments and get array dimensions */
1142
for (idx=0; idx < ndim; idx++) {
1144
width[idx] = (Integer)GA[handle].width[idx];
1145
dims[idx] = (Integer)GA[handle].dims[idx];
1146
/* lo==0 && hi==-1 marks "no data on this process" -- bail out */
if (lo_loc[idx] == 0 && hi_loc[idx] == -1) return FALSE;
1149
/* Check to make sure that global array is well-behaved (all processors
1150
have data and the width of the data in each dimension is greater
1151
than the corresponding value in width[]. */
1152
if (!gai_check_ghost_distr(g_a)) return FALSE;
1154
GA_PUSH_NAME("ga_update3_ghosts");
1156
/* Get pointer to local memory */
1157
ptr_loc = GA[handle].ptr[me];
1159
/* loop over dimensions for sequential update using shift algorithm */
1160
for (idx=0; idx < ndim; idx++) {
1161
nwidth = width[idx];
1163
/* Do not bother with update if nwidth is zero */
1166
/* Perform update in negative direction. */
1167
get_remote_block_neg(idx, ndim, lo_loc, hi_loc, slo_rem, shi_rem,
1169
/* locate processor with this data */
1170
if (!pnga_locate_region(g_a, slo_rem, shi_rem, _ga_map,
1171
GA_proclist, &np)) ga_RegionError(pnga_ndim(g_a),
1172
slo_rem, shi_rem, g_a);
1174
/* Get actual coordinates of desired location of remote
1175
data as well as the actual coordinates of the local chunk
1176
of data that will be sent to remote processor (these
1177
coordinates take into account the presence of ghost
1178
cells). Start by finding out what data is actually held by
1179
remote processor. */
1180
proc_rem = GA_proclist[0];
1181
pnga_distribution(g_a, proc_rem, tlo_rem, thi_rem);
1182
for (i = 0; i < ndim; i++) {
1183
if (increment[i] == 0) {
1185
plo_rem[i] = thi_rem[i] - tlo_rem[i] + width[i] + 1;
1186
phi_rem[i] = thi_rem[i] - tlo_rem[i] + 2*width[i];
1187
plo_loc[i] = width[i];
1188
/*phi_loc[i] = 2*width[i] - 1;*/
1190
plo_rem[i] = width[i];
1191
phi_rem[i] = thi_rem[i] - tlo_rem[i] + width[i];
1192
plo_loc[i] = width[i];
1193
/*phi_loc[i] = hi_loc[i] - lo_loc[i] + width[i];*/
1197
phi_rem[i] = thi_rem[i] - tlo_rem[i] + increment[i];
1199
/*phi_loc[i] = hi_loc[i] - lo_loc[i] + increment[i];*/
1203
/* Get pointer to local data buffer and remote data
1204
buffer as well as lists of leading dimenstions */
1205
gam_LocationWithGhosts(me, handle, plo_loc, &ptr_loc, ld_loc);
1206
gam_LocationWithGhosts(proc_rem, handle, plo_rem, &ptr_rem, ld_rem);
1208
/* Evaluate strides on local and remote processors */
1209
gam_setstride(ndim, size, ld_loc, ld_rem, stride_rem,
1212
/* Compute the number of elements in each dimension and store
1213
result in count. Scale the first element in count by the
1215
gam_ComputeCount(ndim, plo_rem, phi_rem, count);
1218
/* Put local data on remote processor */
1219
if (p_handle >= 0) {
1220
/* translate group rank to world rank before the ARMCI call */
proc_rem = PGRP_LIST[p_handle].inv_map_proc_list[proc_rem];
1222
ARMCI_PutS(ptr_loc, stride_loc, ptr_rem, stride_rem, count,
1223
(int)(ndim - 1), (int)proc_rem);
1225
/* Perform update in positive direction */
1226
get_remote_block_pos(idx, ndim, lo_loc, hi_loc, slo_rem, shi_rem,
1228
/* locate processor with this data */
1229
if (!pnga_locate_region(g_a, slo_rem, shi_rem, _ga_map,
1230
GA_proclist, &np)) ga_RegionError(pnga_ndim(g_a),
1231
slo_rem, shi_rem, g_a);
1233
/* Get actual coordinates of desired chunk of remote
1234
data as well as the actual coordinates of the local chunk
1235
of data that will receive the remote data (these
1236
coordinates take into account the presence of ghost
1237
cells). Start by finding out what data is actually held by
1238
remote processor. */
1239
proc_rem = GA_proclist[0];
1240
pnga_distribution(g_a, proc_rem, tlo_rem, thi_rem);
1241
for (i = 0; i < ndim; i++) {
1242
if (increment[i] == 0) {
1245
phi_rem[i] = width[i] - 1;
1246
plo_loc[i] = hi_loc[i] - lo_loc[i] + width[i] - 1;
1247
/*phi_loc[i] = hi_loc[i] - lo_loc[i] + 2*width[i] - 1;*/
1249
plo_rem[i] = width[i];
1250
phi_rem[i] = thi_rem[i] - tlo_rem[i] + width[i];
1251
plo_loc[i] = width[i];
1252
/*phi_loc[i] = hi_loc[i] - lo_loc[i] + width[i];*/
1256
phi_rem[i] = thi_rem[i] - tlo_rem[i] + increment[i];
1258
/*phi_loc[i] = hi_loc[i] - lo_loc[i] + increment[i];*/
1262
/* Get pointer to local data buffer and remote data
1263
buffer as well as lists of leading dimenstions */
1264
gam_LocationWithGhosts(me, handle, plo_loc, &ptr_loc, ld_loc);
1265
gam_LocationWithGhosts(proc_rem, handle, plo_rem, &ptr_rem, ld_rem);
1267
/* Evaluate strides on local and remote processors */
1268
gam_setstride(ndim, size, ld_loc, ld_rem, stride_rem,
1271
/* Compute the number of elements in each dimension and store
1272
result in count. Scale the first element in count by the
1274
gam_ComputeCount(ndim, plo_rem, phi_rem, count);
1277
/* get remote data */
1278
if (p_handle >= 0) {
1279
proc_rem = PGRP_LIST[p_handle].inv_map_proc_list[proc_rem];
1281
ARMCI_PutS(ptr_loc, stride_loc, ptr_rem, stride_rem, count,
1282
(int)(ndim - 1), (int)proc_rem);
1284
/* synchronize all processors and update increment array */
1285
if (idx < ndim-1) pnga_sync();
1286
increment[idx] = 2*nwidth;
1293
#define GHOST_PRINT 0
1294
/*\ UPDATE GHOST CELLS OF GLOBAL ARRAY USING SHIFT ALGORITHM AND
1297
#if HAVE_SYS_WEAK_ALIAS_PRAGMA
1298
# pragma weak wnga_set_update4_info = pnga_set_update4_info
1300
/* NOTE(review): damaged extraction -- interleaved bare-number lines are
 * original line numbers and gaps between them mean braces/if-else headers
 * and some statements are missing.  Do not edit logic from this copy.
 *
 * Purpose (from surviving text): precompute and cache, in GA[handle].cache,
 * the send/receive pointers, remote process ranks, strides, counts and
 * message lengths used by pnga_update4_ghosts, for both the negative and
 * positive direction along every dimension. */
logical pnga_set_update4_info(Integer g_a)
1302
Integer idx, idir, i, np, handle=GA_OFFSET + g_a;
1303
Integer size, buflen, buftot, *bufsize, ndim, increment[MAXDIM];
1304
Integer *proc_rem_snd, *proc_rem_rcv;
1306
Integer width[MAXDIM], dims[MAXDIM], index[MAXDIM];
1307
Integer lo_loc[MAXDIM], hi_loc[MAXDIM];
1308
Integer plo_snd[MAXDIM], phi_snd[MAXDIM];
1309
Integer lo_rcv[MAXDIM], hi_rcv[MAXDIM];
1310
Integer slo_rcv[MAXDIM], shi_rcv[MAXDIM];
1311
Integer plo_rcv[MAXDIM], phi_rcv[MAXDIM];
1312
Integer ld_loc[MAXDIM];
1313
int *stride_snd, *stride_rcv, *count, cache_size;
1314
/*int corner_flag;*/
1315
char **ptr_snd, **ptr_rcv, *cache;
1317
Integer me = pnga_nodeid();
1320
/* This routine sets the arrays that are used to transfer data using
1321
* the update4. To perform the update, this routine makes use of several
1322
* copies of indices marking the upper and lower limits of data. Indices
1323
* beginning with the character "p" are relative indices marking the
1324
* location of the data set relative to the origin the local patch of
1325
* the global array, all other indices are in absolute coordinates and
1326
* mark locations in the total global array. The indices used by this
1327
* routine are described below.
1329
* lo_loc[], hi_loc[]: The lower and upper indices of the visible
1330
* block of data held by the calling processor.
1332
* lo_rcv[], hi_rcv[]: The lower and upper indices of the blocks
1333
* of data that will be either sent to or received from a remote
1334
* processor. These indices are NOT corrected for wrap-around
1335
* (periodic) boundary conditions so they can be negative or greater
1336
* than the array dimension values held in dims[].
1338
* slo_rcv[], shi_rcv[]: Similar to lo_rcv[] and hi_rcv[], except
1339
* that these indices have been corrected for wrap-around
1340
* boundary conditions.
1342
* plo_rcv[], phi_rcv[]: The local indices of the local data patch
1343
* that receive that message from the remote processor.
1345
* plo_snd[], phi_snd[]: The local indices of the data patch
1346
* that will be sent to the remote processor. Note that the
1347
* dimensions of the patches represented by plo_rec[], plo_rec[] and
1348
* plo_snd[], phi_snd[] must be the same.
1351
/* if global array has no ghost cells, just return */
1352
if (!pnga_has_ghosts(g_a)) return TRUE;
1354
/* Check to make sure that global array is well-behaved (all processors
1355
have data and the width of the data in each dimension is greater
1356
than the corresponding value in width[]. */
1357
if (!gai_check_ghost_distr(g_a)) return FALSE;
1359
size = GA[handle].elemsize;
1360
ndim = GA[handle].ndim;
1361
p_handle = GA[handle].p_handle;
1362
/* rough upper bound on the cache footprint; /8 presumably converts
 * bytes to doubles -- TODO confirm against pristine source */
cache_size = 2*sizeof(char *)+3*ndim*sizeof(int)+3*sizeof(Integer);
1363
cache_size = 2* ndim *((cache_size/8) + 1) + 1;
1364
GA[handle].cache = (double *)malloc(sizeof(double)*cache_size);
1365
cache = (char *)GA[handle].cache;
1366
/*corner_flag = GA[handle].corner_flag;*/
1368
printf("p[%d]a cache_size: %d\n",GAme,cache_size);
1371
/* initialize range increments and get array dimensions */
1373
pnga_distribution(g_a,me,lo_loc,hi_loc);
1374
for (idx=0; idx < ndim; idx++) {
1376
width[idx] = (Integer)GA[handle].width[idx];
1377
dims[idx] = (Integer)GA[handle].dims[idx];
1378
if (lo_loc[idx] == 0 && hi_loc[idx] == -1) {
1379
/* no data on this process: mark cache empty */
*(char**)cache = NULL;
1384
/* Get indices of processor in virtual grid */
1385
pnga_proc_topology(g_a, me, index);
1387
/* Try to find maximum size of message that will be sent during
1388
* update operations and use this to allocate memory for message
1389
* passing buffers. */
1391
for (i=0; i<ndim; i++) {
1392
buftot *= (hi_loc[i]-lo_loc[i] + 1 + 2*width[i]);
1395
for (i = 0; i < ndim; i++) {
1396
idir = hi_loc[i] - lo_loc[i] + 1;
1397
if (buflen < (buftot/(idir + 2*width[i]))*width[i]) {
1398
buflen = (buftot/(idir + 2*width[i]))*width[i];
1401
bufsize = (Integer*)cache;
1403
printf("p[%d]a initial pointer: %d\n",GAme,(Integer)bufsize);
1406
current = (char*)(bufsize+1);
1408
*bufsize = size*buflen;
1410
printf("p[%d]a buflen: %d size: %d bufsize: %d\n",GAme,buflen,size,*bufsize);
1414
/* loop over dimensions for sequential update using shift algorithm */
1415
for (idx=0; idx < ndim; idx++) {
1417
/* Do not bother with update if nwidth is zero */
1418
if (width[idx] != 0) {
1420
/* carve one per-dimension parameter record out of the cache:
 * two data pointers, two ranks, two stride arrays, a length,
 * and a count array, laid out back-to-back */
ptr_snd = (char**)current;
1421
ptr_rcv = (char**)(ptr_snd+1);
1422
proc_rem_snd = (Integer*)(ptr_rcv+1);
1423
proc_rem_rcv = (Integer*)(proc_rem_snd+1);
1424
stride_snd = (int*)(proc_rem_rcv+1);
1425
stride_rcv = (int*)(stride_snd+ndim);
1426
length = (Integer*)(stride_rcv+ndim);
1427
count = (int*)(length+1);
1428
current = (char*)(count+ndim);
1430
/* Perform update in negative direction. */
1431
get_remote_block_neg(idx, ndim, lo_loc, hi_loc, slo_rcv, shi_rcv,
1433
/* locate processor with this data */
1434
if (!pnga_locate_region(g_a, slo_rcv, shi_rcv, _ga_map,
1435
GA_proclist, &np)) ga_RegionError(pnga_ndim(g_a),
1436
slo_rcv, shi_rcv, g_a);
1437
*proc_rem_snd = GA_proclist[0];
1438
if (p_handle >= 0) {
1439
*proc_rem_snd = PGRP_LIST[p_handle].inv_map_proc_list[*proc_rem_snd];
1442
/* Find processor from which data will be recieved */
1443
for (i = 0; i < ndim; i++) {
1445
lo_rcv[i] = hi_loc[i] + 1;
1446
hi_rcv[i] = hi_loc[i] + width[i];
1448
lo_rcv[i] = lo_loc[i];
1449
hi_rcv[i] = hi_loc[i];
1453
/* Account for boundaries, if necessary. */
1454
for (i=0; i<ndim; i++) {
1456
if (hi_rcv[i] > dims[i]) {
1458
shi_rcv[i] = width[i];
1460
slo_rcv[i] = lo_rcv[i];
1461
shi_rcv[i] = hi_rcv[i];
1464
slo_rcv[i] = lo_rcv[i];
1465
shi_rcv[i] = hi_rcv[i];
1468
/* locate processor with this data */
1469
if (!pnga_locate_region(g_a, slo_rcv, shi_rcv, _ga_map,
1470
GA_proclist, &np)) ga_RegionError(pnga_ndim(g_a),
1471
slo_rcv, shi_rcv, g_a);
1472
*proc_rem_rcv = GA_proclist[0];
1473
if (p_handle >= 0) {
1474
*proc_rem_rcv = PGRP_LIST[p_handle].inv_map_proc_list[*proc_rem_rcv];
1477
/* Get actual coordinates of chunk of data that will be sent to
1478
* remote processor as well as coordinates of the array space that
1479
* will receive data from remote processor. */
1480
for (i = 0; i < ndim; i++) {
1481
if (increment[i] == 0) {
1483
plo_snd[i] = width[i];
1484
phi_snd[i] = 2*width[i] - 1;
1485
plo_rcv[i] = hi_loc[i] - lo_loc[i] + width[i] + 1;
1486
phi_rcv[i] = hi_loc[i] - lo_loc[i] + 2*width[i];
1488
plo_snd[i] = width[i];
1489
phi_snd[i] = hi_loc[i] - lo_loc[i] + width[i];
1490
plo_rcv[i] = width[i];
1491
phi_rcv[i] = hi_loc[i] - lo_loc[i] + width[i];
1495
phi_rcv[i] = hi_loc[i] - lo_loc[i] + increment[i];
1497
phi_snd[i] = hi_loc[i] - lo_loc[i] + increment[i];
1501
/* Get pointer to local data buffer and remote data
1502
buffer as well as lists of leading dimenstions */
1503
gam_LocationWithGhosts(me, handle, plo_snd, ptr_snd, ld_loc);
1505
printf("p[%d]a 1: plo_snd[0]: %d plo_snd[1]: %d ptr_snd: %d\n",
1506
GAme, plo_snd[0], plo_snd[1], (Integer)*ptr_snd);
1509
gam_LocationWithGhosts(me, handle, plo_rcv, ptr_rcv, ld_loc);
1511
printf("p[%d]a 1: plo_rcv[0]: %d plo_rcv[1]: %d ptr_rcv: %d\n",
1512
GAme, plo_rcv[0], plo_rcv[1], (Integer)*ptr_rcv);
1516
/* Evaluate strides for send and recieve */
1517
gam_setstride(ndim, size, ld_loc, ld_loc, stride_rcv,
1520
/* Compute the number of elements in each dimension and store
1521
result in count. Scale the first element in count by the
1523
gam_ComputeCount(ndim, plo_rcv, phi_rcv, count);
1524
gam_CountElems(ndim, plo_snd, phi_snd, length);
1529
printf("p[%d]a 1: length: %d bufsize: %d proc_rem_snd: %d proc_rem_rcv: %d\n",
1530
GAme, *length, *bufsize, (int)*proc_rem_snd, (int)*proc_rem_rcv);
1531
printf("p[%d]a 1: count[0]: %d stride_rcv[0]: %d stride_rcv[1]: %d\n",
1532
GAme, count[0], stride_rcv[0],stride_rcv[1]);
1533
printf("p[%d]a 1: count[1]: %d stride_rcv[2]: %d stride_rcv[3]: %d\n",
1534
GAme, count[1], stride_rcv[2],stride_rcv[3]);
1535
printf("p[%d]a 1: count[2]: %d stride_snd[0]: %d stride_snd[1]: %d\n",
1536
GAme, count[2], stride_snd[0],stride_snd[1]);
1537
printf("p[%d]a 1: count[3]: %d stride_snd[2]: %d stride_snd[3]: %d\n",
1538
GAme, count[3], stride_snd[2],stride_snd[3]);
1542
/* second record for this dimension: positive-direction parameters */
ptr_snd = (char**)current;
1543
ptr_rcv = (char**)(ptr_snd+1);
1544
proc_rem_snd = (Integer*)(ptr_rcv+1);
1545
proc_rem_rcv = (Integer*)(proc_rem_snd+1);
1546
stride_snd = (int*)(proc_rem_rcv+1);
1547
stride_rcv = (int*)(stride_snd+ndim);
1548
length = (Integer*)(stride_rcv+ndim);
1549
count = (int*)(length+1);
1550
current = (char*)(count+ndim);
1552
/* Find parameters for message in positive direction. */
1553
get_remote_block_pos(idx, ndim, lo_loc, hi_loc, slo_rcv, shi_rcv,
1555
/* locate processor with this data */
1556
if (!pnga_locate_region(g_a, slo_rcv, shi_rcv, _ga_map,
1557
GA_proclist, &np)) ga_RegionError(pnga_ndim(g_a),
1558
slo_rcv, shi_rcv, g_a);
1559
*proc_rem_snd = GA_proclist[0];
1560
if (p_handle >= 0) {
1561
*proc_rem_snd = PGRP_LIST[p_handle].inv_map_proc_list[*proc_rem_snd];
1564
/* Find processor from which data will be recieved */
1565
for (i = 0; i < ndim; i++) {
1567
lo_rcv[i] = lo_loc[i] - width[i];
1568
hi_rcv[i] = lo_loc[i] - 1;
1570
lo_rcv[i] = lo_loc[i];
1571
hi_rcv[i] = hi_loc[i];
1575
/* Account for boundaries, if necessary. */
1576
for (i=0; i<ndim; i++) {
1578
if (hi_rcv[i] < 1) {
1579
slo_rcv[i] = dims[i] - width[i] + 1;
1580
shi_rcv[i] = dims[i];
1582
slo_rcv[i] = lo_rcv[i];
1583
shi_rcv[i] = hi_rcv[i];
1586
slo_rcv[i] = lo_rcv[i];
1587
shi_rcv[i] = hi_rcv[i];
1590
/* locate processor with this data */
1591
if (!pnga_locate_region(g_a, slo_rcv, shi_rcv, _ga_map,
1592
GA_proclist, &np)) ga_RegionError(pnga_ndim(g_a),
1593
slo_rcv, shi_rcv, g_a);
1594
*proc_rem_rcv = GA_proclist[0];
1595
if (p_handle >= 0) {
1596
*proc_rem_rcv = PGRP_LIST[p_handle].inv_map_proc_list[*proc_rem_rcv];
1598
/* Get actual coordinates of chunk of data that will be sent to
1599
* remote processor as well as coordinates of the array space that
1600
* will receive data from remote processor. */
1601
for (i = 0; i < ndim; i++) {
1602
if (increment[i] == 0) {
1604
plo_snd[i] = hi_loc[i] - lo_loc[i] + 1;
1605
phi_snd[i] = hi_loc[i] - lo_loc[i] + width[i];
1607
phi_rcv[i] = width[i] - 1;
1609
plo_snd[i] = width[i];
1610
phi_snd[i] = hi_loc[i] - lo_loc[i] + width[i];
1611
plo_rcv[i] = width[i];
1612
phi_rcv[i] = hi_loc[i] - lo_loc[i] + width[i];
1616
phi_rcv[i] = hi_loc[i] - lo_loc[i] + increment[i];
1618
phi_snd[i] = hi_loc[i] - lo_loc[i] + increment[i];
1622
/* Get pointer to local data buffer and remote data
1623
buffer as well as lists of leading dimenstions */
1624
gam_LocationWithGhosts(me, handle, plo_snd, ptr_snd, ld_loc);
1626
printf("p[%d]a 2: plo_snd[0]: %d plo_snd[1]: %d ptr_snd: %d\n",
1627
GAme, plo_snd[0], plo_snd[1], (Integer)*ptr_snd);
1630
gam_LocationWithGhosts(me, handle, plo_rcv, ptr_rcv, ld_loc);
1632
printf("p[%d]a 2: plo_rcv[0]: %d plo_rcv[1]: %d ptr_rcv: %d\n",
1633
GAme, plo_rcv[0], plo_rcv[1], (Integer)*ptr_rcv);
1637
/* Evaluate strides for send and recieve */
1638
gam_setstride(ndim, size, ld_loc, ld_loc, stride_rcv,
1641
/* Compute the number of elements in each dimension and store
1642
result in count. Scale the first element in count by the
1644
gam_ComputeCount(ndim, plo_rcv, phi_rcv, count);
1645
gam_CountElems(ndim, plo_snd, phi_snd, length);
1649
printf("p[%d]a 2: length: %d bufsize: %d proc_rem_snd: %d proc_rem_rcv: %d\n",
1650
GAme, *length, *bufsize, (int)*proc_rem_snd, (int)*proc_rem_rcv);
1651
printf("p[%d]a 2: count[0]: %d stride_rcv[0]: %d stride_rcv[1]: %d\n",
1652
GAme, count[0], stride_rcv[0],stride_rcv[1]);
1653
printf("p[%d]a 2: count[1]: %d stride_rcv[2]: %d stride_rcv[3]: %d\n",
1654
GAme, count[1], stride_rcv[2],stride_rcv[3]);
1655
printf("p[%d]a 2: count[2]: %d stride_snd[0]: %d stride_snd[1]: %d\n",
1656
GAme, count[2], stride_snd[0],stride_snd[1]);
1657
printf("p[%d]a 2: count[3]: %d stride_snd[2]: %d stride_snd[3]: %d\n",
1658
GAme, count[3], stride_snd[2],stride_snd[3]);
1663
if (GA[handle].corner_flag)
1664
increment[idx] = 2*width[idx];
1667
printf("p[%d]a final pointer: %d\n",GAme,(Integer)(Integer*)current);
1673
/*\ UPDATE GHOST CELLS OF GLOBAL ARRAY USING SHIFT ALGORITHM AND
1676
#if HAVE_SYS_WEAK_ALIAS_PRAGMA
1677
# pragma weak wnga_update4_ghosts = pnga_update4_ghosts
1679
/* NOTE(review): damaged extraction -- interleaved bare-number lines are
 * original line numbers and gaps between them mean braces/else lines and
 * some statements are missing.  Do not edit logic from this copy.
 *
 * Purpose (from surviving text): perform the ghost-cell update with the
 * parameters precomputed by pnga_set_update4_info (read back from
 * GA[handle].cache), using explicit armci_msg_snd/armci_msg_rcv message
 * passing with a parity-ordered send/receive schedule per dimension. */
logical pnga_update4_ghosts(Integer g_a)
1681
Integer idx, i, handle=GA_OFFSET + g_a;
1682
Integer *size, bufsize, buflen, ndim, elemsize;
1683
Integer *proc_rem_snd, *proc_rem_rcv, pmax;
1684
Integer msgcnt, *length;
1685
Integer index[MAXDIM], width[MAXDIM];
1686
int *stride_snd, *stride_rcv, *count, msglen;
1687
char **ptr_snd, **ptr_rcv, *cache, *current;
1688
char send_name[32], rcv_name[32];
1689
void *snd_ptr, *rcv_ptr, *snd_ptr_orig, *rcv_ptr_orig;
1690
Integer me = pnga_nodeid();
1691
/*Integer p_handle;*/
1693
/* This routine makes use of the shift algorithm to update data in the
1694
* ghost cells bounding the local block of visible data. The shift
1695
* algorithm starts by updating the blocks of data along the first
1696
* dimension by grabbing a block of data that is width[0] deep but
1697
* otherwise matches the dimensions of the data residing on the
1698
* calling processor. The update of the second dimension, however,
1699
* grabs a block that is width[1] deep in the second dimension but is
1700
* ldim0 + 2*width[0] in the first dimensions where ldim0 is the
1701
* size of the visible data along the first dimension. The remaining
1702
* dimensions are left the same. For the next update, the width of the
1703
* second dimension is also increased by 2*width[1] and so on. This
1704
* algorith makes use of the fact that data for the dimensions that
1705
* have already been updated is available on each processor and can be
1706
* used in the updates of subsequent dimensions. The total number of
1707
* separate updates is 2*ndim, an update in the negative and positive
1708
* directions for each dimension.
1710
* This implementation make use of explicit message passing to perform
1711
* the update. Separate message types for the updates in each coordinate
1712
* direction are used to maintain synchronization locally and to
1713
* guarantee that the data is present before the updates in a new
1714
* coordinate direction take place.
1717
/* if global array has no ghost cells, just return */
1718
if (!pnga_has_ghosts(g_a)) return TRUE;
1720
ndim = GA[handle].ndim;
1721
/*p_handle = GA[handle].p_handle;*/
1722
cache = (char *)GA[handle].cache;
1723
elemsize = GA[handle].elemsize;
1724
for (i=0; i<ndim; i++) {
1725
width[i] = (Integer)GA[handle].width[i];
1728
GA_PUSH_NAME("ga_update4_ghosts");
1731
/* Get indices of processor in virtual grid */
1732
pnga_proc_topology(g_a, me, index);
1734
/* cache layout must match pnga_set_update4_info: buffer size first,
 * then one parameter record per direction per dimension */
size = (Integer*)cache;
1735
current = (char*)(size+1);
1737
buflen = bufsize/elemsize;
1739
printf("p[%d] bufsize: %d buflen: %d\n", GAme, bufsize, buflen);
1743
strcpy(send_name,"send_buffer");
1744
strcpy(rcv_name,"receive_buffer");
1745
snd_ptr_orig = snd_ptr = ga_malloc(buflen, GA[handle].type, send_name);
1746
rcv_ptr_orig = rcv_ptr = ga_malloc(buflen, GA[handle].type, rcv_name);
1748
/* loop over dimensions for sequential update using shift algorithm */
1749
for (idx=0; idx < ndim; idx++) {
1751
/* Do not bother with update if nwidth is zero */
1752
if (width[idx] != 0) {
1754
/* send messages in negative direction */
1755
snd_ptr = snd_ptr_orig;
1756
rcv_ptr = rcv_ptr_orig;
1758
/* unpack the negative-direction parameter record from the cache */
ptr_snd = (char**)current;
1759
ptr_rcv = (char**)(ptr_snd+1);
1760
proc_rem_snd = (Integer*)(ptr_rcv+1);
1761
proc_rem_rcv = (Integer*)(proc_rem_snd+1);
1762
stride_snd = (int*)(proc_rem_rcv+1);
1763
stride_rcv = (int*)(stride_snd+ndim);
1764
length = (Integer*)(stride_rcv+ndim);
1765
count = (int*)(length+1);
1766
current = (char*)(count+ndim);
1769
printf("p[%d] 1: ptr_snd: %d ptr_rcv: %d\n", GAme, (Integer)*ptr_snd,
1771
printf("p[%d] 1: length: %d proc_rem_snd: %d proc_rem_rcv: %d\n",
1772
GAme, (int)*length, (int)*proc_rem_snd, (int)*proc_rem_rcv);
1773
printf("p[%d] 1: count[0]: %d stride_rcv[0]: %d stride_rcv[1]: %d\n",
1774
GAme, count[0], stride_rcv[0],stride_rcv[1]);
1775
printf("p[%d] 1: count[1]: %d stride_rcv[2]: %d stride_rcv[3]: %d\n",
1776
GAme, count[1], stride_rcv[2],stride_rcv[3]);
1777
printf("p[%d] 1: count[2]: %d stride_snd[0]: %d stride_snd[1]: %d\n",
1778
GAme, count[2], stride_snd[0],stride_snd[1]);
1779
printf("p[%d] 1: count[3]: %d stride_snd[2]: %d stride_snd[3]: %d\n",
1780
GAme, count[3], stride_snd[2],stride_snd[3]);
1781
printf("p[%d] 1: snd_ptr: %d rcv_ptr: %d\n", GAme, (Integer)snd_ptr,
1785
/* Fill send buffer with data. */
1786
armci_write_strided(*ptr_snd, (int)ndim-1, stride_snd, count, snd_ptr);
1788
printf("p[%d] completed armci_write_strided\n",GAme);
1792
/* Send Messages. If processor has odd index in direction idx, it
1793
* sends message first, if processor has even index it receives
1794
* message first. Then process is reversed. Also need to account
1795
* for whether or not there are an odd number of processors along
1796
* update direction. */
1798
if (GAme != *proc_rem_snd) {
1799
if (GA[handle].nblock[idx]%2 == 0) {
1800
if (index[idx]%2 != 0) {
1801
armci_msg_snd(msgcnt, snd_ptr, *length, *proc_rem_snd);
1803
armci_msg_rcv(msgcnt, rcv_ptr, bufsize, &msglen, *proc_rem_rcv);
1805
if (index[idx]%2 != 0) {
1806
armci_msg_rcv(msgcnt, rcv_ptr, bufsize, &msglen, *proc_rem_rcv);
1808
armci_msg_snd(msgcnt, snd_ptr, *length, *proc_rem_snd);
1811
pmax = GA[handle].nblock[idx] - 1;
1812
if (index[idx]%2 != 0) {
1813
armci_msg_snd(msgcnt, snd_ptr, *length, *proc_rem_snd);
1814
} else if (index[idx] != pmax) {
1815
armci_msg_rcv(msgcnt, rcv_ptr, bufsize, &msglen, *proc_rem_rcv);
1817
if (index[idx]%2 != 0) {
1818
armci_msg_rcv(msgcnt, rcv_ptr, bufsize, &msglen, *proc_rem_rcv);
1819
} else if (index[idx] != 0) {
1820
armci_msg_snd(msgcnt, snd_ptr, *length, *proc_rem_snd);
1822
/* make up for odd processor at end of string */
1823
if (index[idx] == 0) {
1824
armci_msg_snd(msgcnt, snd_ptr, *length, *proc_rem_snd);
1826
if (index[idx] == pmax) {
1827
armci_msg_rcv(msgcnt, rcv_ptr, bufsize, &msglen, *proc_rem_rcv);
1834
/* copy data back into global array */
1835
armci_read_strided(*ptr_rcv, (int)ndim-1, stride_rcv, count, rcv_ptr);
1837
printf("p[%d] completed armci_read_strided\n",GAme);
1841
/* send messages in positive direction */
1842
snd_ptr = snd_ptr_orig;
1843
rcv_ptr = rcv_ptr_orig;
1845
/* unpack the positive-direction parameter record from the cache */
ptr_snd = (char**)current;
1846
ptr_rcv = (char**)(ptr_snd+1);
1847
proc_rem_snd = (Integer*)(ptr_rcv+1);
1848
proc_rem_rcv = (Integer*)(proc_rem_snd+1);
1849
stride_snd = (int*)(proc_rem_rcv+1);
1850
stride_rcv = (int*)(stride_snd+ndim);
1851
length = (Integer*)(stride_rcv+ndim);
1852
count = (int*)(length+1);
1853
current = (char*)(count+ndim);
1856
printf("p[%d] 2: ptr_snd: %d ptr_rcv: %d\n", GAme, (Integer)*ptr_snd,
1858
printf("p[%d] 2: length: %d proc_rem_snd: %d proc_rem_rcv: %d\n",
1859
GAme, (int)*length, (int)*proc_rem_snd, (int)*proc_rem_rcv);
1860
printf("p[%d] 2: count[0]: %d stride_rcv[0]: %d stride_rcv[1]: %d\n",
1861
GAme, count[0], stride_rcv[0],stride_rcv[1]);
1862
printf("p[%d] 2: count[1]: %d stride_rcv[2]: %d stride_rcv[3]: %d\n",
1863
GAme, count[1], stride_rcv[2],stride_rcv[3]);
1864
printf("p[%d] 2: count[2]: %d stride_snd[0]: %d stride_snd[1]: %d\n",
1865
GAme, count[2], stride_snd[0],stride_snd[1]);
1866
printf("p[%d] 2: count[3]: %d stride_snd[2]: %d stride_snd[3]: %d\n",
1867
GAme, count[3], stride_snd[2],stride_snd[3]);
1868
printf("p[%d] 2: snd_ptr: %d rcv_ptr: %d\n", GAme, (Integer)snd_ptr,
1872
/* Fill send buffer with data. */
1873
armci_write_strided(*ptr_snd, (int)ndim-1, stride_snd, count, snd_ptr);
1875
/* Send Messages. If processor has odd index in direction idx, it
1876
* sends message first, if processor has even index it receives
1877
* message first. Then process is reversed. Also need to account
1878
* for whether or not there are an odd number of processors along
1879
* update direction. */
1881
if (GAme != *proc_rem_rcv) {
1882
if (GA[handle].nblock[idx]%2 == 0) {
1883
if (index[idx]%2 != 0) {
1884
armci_msg_snd(msgcnt, snd_ptr, *length, *proc_rem_snd);
1886
armci_msg_rcv(msgcnt, rcv_ptr, bufsize, &msglen, *proc_rem_rcv);
1888
if (index[idx]%2 != 0) {
1889
armci_msg_rcv(msgcnt, rcv_ptr, bufsize, &msglen, *proc_rem_rcv);
1891
armci_msg_snd(msgcnt, snd_ptr, *length, *proc_rem_snd);
1894
pmax = GA[handle].nblock[idx] - 1;
1895
if (index[idx]%2 != 0) {
1896
armci_msg_snd(msgcnt, snd_ptr, *length, *proc_rem_snd);
1897
} else if (index[idx] != 0) {
1898
armci_msg_rcv(msgcnt, rcv_ptr, bufsize, &msglen, *proc_rem_rcv);
1900
if (index[idx]%2 != 0) {
1901
armci_msg_rcv(msgcnt, rcv_ptr, bufsize, &msglen, *proc_rem_rcv);
1902
} else if (index[idx] != pmax) {
1903
armci_msg_snd(msgcnt, snd_ptr, *length, *proc_rem_snd);
1905
/* make up for odd processor at end of string */
1906
if (index[idx] == pmax) {
1907
armci_msg_snd(msgcnt, snd_ptr, *length, *proc_rem_snd);
1909
if (index[idx] == 0) {
1910
armci_msg_rcv(msgcnt, rcv_ptr, bufsize, &msglen, *proc_rem_rcv);
1916
/* copy data back into global array */
1917
armci_read_strided(*ptr_rcv, (int)ndim-1, stride_rcv, count, rcv_ptr);
1922
/* release the temporary message buffers (reverse order of allocation) */
ga_free(rcv_ptr_orig);
1923
ga_free(snd_ptr_orig);
1928
/*\ UPDATE GHOST CELLS OF GLOBAL ARRAY USING SHIFT ALGORITHM AND
1931
#if HAVE_SYS_WEAK_ALIAS_PRAGMA
1932
# pragma weak wnga_update44_ghosts = pnga_update44_ghosts
1934
logical pnga_update44_ghosts(Integer g_a)
1936
Integer idx, idir, i, np, handle=GA_OFFSET + g_a;
1937
Integer size, buflen, buftot, bufsize, ndim, increment[MAXDIM];
1938
Integer proc_rem_snd, proc_rem_rcv, pmax;
1939
Integer msgcnt, length;
1940
Integer width[MAXDIM], dims[MAXDIM], index[MAXDIM];
1941
Integer lo_loc[MAXDIM], hi_loc[MAXDIM];
1942
Integer plo_snd[MAXDIM], phi_snd[MAXDIM];
1943
Integer lo_rcv[MAXDIM], hi_rcv[MAXDIM];
1944
Integer slo_rcv[MAXDIM], shi_rcv[MAXDIM];
1945
Integer plo_rcv[MAXDIM], phi_rcv[MAXDIM];
1946
Integer ld_loc[MAXDIM];
1948
int stride_snd[MAXDIM], stride_rcv[MAXDIM],count[MAXDIM];
1949
char *ptr_snd, *ptr_rcv;
1950
char send_name[32], rcv_name[32];
1951
void *snd_ptr, *rcv_ptr, *snd_ptr_orig, *rcv_ptr_orig;
1952
Integer me = pnga_nodeid();
1955
/* This routine makes use of the shift algorithm to update data in the
1956
* ghost cells bounding the local block of visible data. The shift
1957
* algorithm starts by updating the blocks of data along the first
1958
* dimension by grabbing a block of data that is width[0] deep but
1959
* otherwise matches the dimensions of the data residing on the
1960
* calling processor. The update of the second dimension, however,
1961
* grabs a block that is width[1] deep in the second dimension but is
1962
* ldim0 + 2*width[0] in the first dimensions where ldim0 is the
1963
* size of the visible data along the first dimension. The remaining
1964
* dimensions are left the same. For the next update, the width of the
1965
* second dimension is also increased by 2*width[1] and so on. This
1966
* algorith makes use of the fact that data for the dimensions that
1967
* have already been updated is available on each processor and can be
1968
* used in the updates of subsequent dimensions. The total number of
1969
* separate updates is 2*ndim, an update in the negative and positive
1970
* directions for each dimension.
1972
* This implementation make use of explicit message passing to perform
1973
* the update. Separate message types for the updates in each coordinate
1974
* direction are used to maintain synchronization locally and to
1975
* guarantee that the data is present before the updates in a new
1976
* coordinate direction take place.
1978
* To perform the update, this routine makes use of several copies of
1979
* indices marking the upper and lower limits of data. Indices
1980
* beginning with the character "p" are relative indices marking the
1981
* location of the data set relative to the origin the local patch of
1982
* the global array, all other indices are in absolute coordinates and
1983
* mark locations in the total global array. The indices used by this
1984
* routine are described below.
1986
* lo_loc[], hi_loc[]: The lower and upper indices of the visible
1987
* block of data held by the calling processor.
1989
* lo_rcv[], hi_rcv[]: The lower and upper indices of the blocks
1990
* of data that will be either sent to or received from a remote
1991
* processor. These indices are NOT corrected for wrap-around
1992
* (periodic) boundary conditions so they can be negative or greater
1993
* than the array dimension values held in dims[].
1995
* slo_rcv[], shi_rcv[]: Similar to lo_rcv[] and hi_rcv[], except
1996
* that these indices have been corrected for wrap-around
1997
* boundary conditions.
1999
* plo_rcv[], phi_rcv[]: The local indices of the local data patch
2000
* that receive that message from the remote processor.
2002
* plo_snd[], phi_snd[]: The local indices of the data patch
2003
* that will be sent to the remote processor. Note that the
2004
* dimensions of the patches represented by plo_rec[], plo_rec[] and
2005
* plo_snd[], phi_snd[] must be the same.
2008
/* if global array has no ghost cells, just return */
2009
if (!pnga_has_ghosts(g_a)) return TRUE;
2011
size = GA[handle].elemsize;
2012
ndim = GA[handle].ndim;
2013
p_handle = GA[handle].p_handle;
2015
/* initialize range increments and get array dimensions */
2016
for (idx=0; idx < ndim; idx++) {
2018
width[idx] = (Integer)GA[handle].width[idx];
2019
dims[idx] = (Integer)GA[handle].dims[idx];
2022
/* Check to make sure that global array is well-behaved (all processors
2023
have data and the width of the data in each dimension is greater
2024
than the corresponding value in width[]. */
2025
if (!gai_check_ghost_distr(g_a)) return FALSE;
2027
GA_PUSH_NAME("ga_update4_ghosts");
2030
/* obtain range of data that is held by local processor */
2031
pnga_distribution(g_a,me,lo_loc,hi_loc);
2032
/* Get indices of processor in virtual grid */
2033
pnga_proc_topology(g_a, me, index);
2035
/* Try to find maximum size of message that will be sent during
2036
* update operations and use this to allocate memory for message
2037
* passing buffers. */
2039
for (i=0; i<ndim; i++) {
2040
buftot *= (hi_loc[i]-lo_loc[i] + 1 + 2*width[i]);
2043
for (i = 0; i < ndim; i++) {
2044
idir = hi_loc[i] - lo_loc[i] + 1;
2045
if (buflen < (buftot/(idir + 2*width[i]))*width[i]) {
2046
buflen = (buftot/(idir + 2*width[i]))*width[i];
2049
bufsize = size*buflen;
2050
strcpy(send_name,"send_buffer");
2051
strcpy(rcv_name,"receive_buffer");
2052
snd_ptr_orig = snd_ptr = ga_malloc(buflen, GA[handle].type, send_name);
2053
rcv_ptr_orig = rcv_ptr = ga_malloc(buflen, GA[handle].type, rcv_name);
2055
/* loop over dimensions for sequential update using shift algorithm */
2056
for (idx=0; idx < ndim; idx++) {
2058
/* Do not bother with update if nwidth is zero */
2059
if (width[idx] != 0) {
2061
/* Perform update in negative direction. */
2062
get_remote_block_neg(idx, ndim, lo_loc, hi_loc, slo_rcv, shi_rcv,
2064
/* locate processor with this data */
2065
if (!pnga_locate_region(g_a, slo_rcv, shi_rcv, _ga_map,
2066
GA_proclist, &np)) ga_RegionError(pnga_ndim(g_a),
2067
slo_rcv, shi_rcv, g_a);
2068
proc_rem_snd = GA_proclist[0];
2069
if (p_handle >= 0) {
2070
proc_rem_snd = PGRP_LIST[p_handle].inv_map_proc_list[proc_rem_snd];
2073
/* Find processor from which data will be recieved */
2074
for (i = 0; i < ndim; i++) {
2076
lo_rcv[i] = hi_loc[i] + 1;
2077
hi_rcv[i] = hi_loc[i] + width[i];
2079
lo_rcv[i] = lo_loc[i];
2080
hi_rcv[i] = hi_loc[i];
2084
/* Account for boundaries, if necessary. */
2085
for (i=0; i<ndim; i++) {
2087
if (hi_rcv[i] > dims[i]) {
2089
shi_rcv[i] = width[i];
2091
slo_rcv[i] = lo_rcv[i];
2092
shi_rcv[i] = hi_rcv[i];
2095
slo_rcv[i] = lo_rcv[i];
2096
shi_rcv[i] = hi_rcv[i];
2099
/* locate processor with this data */
2100
if (!pnga_locate_region(g_a, slo_rcv, shi_rcv, _ga_map,
2101
GA_proclist, &np)) ga_RegionError(pnga_ndim(g_a),
2102
slo_rcv, shi_rcv, g_a);
2103
proc_rem_rcv = GA_proclist[0];
2104
if (p_handle >= 0) {
2105
proc_rem_rcv = PGRP_LIST[p_handle].inv_map_proc_list[proc_rem_rcv];
2108
/* Get actual coordinates of chunk of data that will be sent to
2109
* remote processor as well as coordinates of the array space that
2110
* will receive data from remote processor. */
2111
for (i = 0; i < ndim; i++) {
2112
if (increment[i] == 0) {
2114
plo_snd[i] = width[i];
2115
phi_snd[i] = 2*width[i] - 1;
2116
plo_rcv[i] = hi_loc[i] - lo_loc[i] + width[i] + 1;
2117
phi_rcv[i] = hi_loc[i] - lo_loc[i] + 2*width[i];
2119
plo_snd[i] = width[i];
2120
phi_snd[i] = hi_loc[i] - lo_loc[i] + width[i];
2121
plo_rcv[i] = width[i];
2122
phi_rcv[i] = hi_loc[i] - lo_loc[i] + width[i];
2126
phi_rcv[i] = hi_loc[i] - lo_loc[i] + increment[i];
2128
phi_snd[i] = hi_loc[i] - lo_loc[i] + increment[i];
2132
/* Get pointer to local data buffer and remote data
2133
buffer as well as lists of leading dimenstions */
2134
gam_LocationWithGhosts(me, handle, plo_snd, &ptr_snd, ld_loc);
2136
printf("p[%d] 1: plo_snd[0]: %d plo_snd[1]: %d ptr_snd: %d\n",
2137
GAme, plo_snd[0], plo_snd[1], (Integer)ptr_snd);
2140
gam_LocationWithGhosts(me, handle, plo_rcv, &ptr_rcv, ld_loc);
2142
printf("p[%d] 1: plo_rcv[0]: %d plo_rcv[1]: %d ptr_rcv: %d\n",
2143
GAme, plo_rcv[0], plo_rcv[1], (Integer)ptr_rcv);
2147
/* Evaluate strides for send and recieve */
2148
gam_setstride(ndim, size, ld_loc, ld_loc, stride_rcv,
2151
/* Compute the number of elements in each dimension and store
2152
result in count. Scale the first element in count by the
2154
gam_ComputeCount(ndim, plo_rcv, phi_rcv, count);
2155
gam_CountElems(ndim, plo_snd, phi_snd, &length);
2159
/* Fill send buffer with data. */
2161
printf("p[%d]b 1: ptr_snd: %d ptr_rcv: %d\n", GAme, (Integer)ptr_snd,
2163
printf("p[%d]b 1: length: %d proc_rem_snd: %d proc_rem_rcv: %d\n",
2164
GAme, (int)length, (int)proc_rem_snd, (int)proc_rem_rcv);
2165
printf("p[%d]b 1: count[0]: %d stride_rcv[0]: %d stride_rcv[1]: %d\n",
2166
GAme, count[0], stride_rcv[0],stride_rcv[1]);
2167
printf("p[%d]b 1: count[1]: %d stride_rcv[2]: %d stride_rcv[3]: %d\n",
2168
GAme, count[1], stride_rcv[2],stride_rcv[3]);
2169
printf("p[%d]b 1: count[2]: %d stride_snd[0]: %d stride_snd[1]: %d\n",
2170
GAme, count[2], stride_snd[0],stride_snd[1]);
2171
printf("p[%d]b 1: count[3]: %d stride_snd[2]: %d stride_snd[3]: %d\n",
2172
GAme, count[3], stride_snd[2],stride_snd[3]);
2173
printf("p[%d]b 1: snd_ptr: %d rcv_ptr: %d\n", GAme, (Integer)snd_ptr,
2177
armci_write_strided(ptr_snd, (int)ndim-1, stride_snd, count, snd_ptr);
2179
/* Send Messages. If processor has odd index in direction idx, it
2180
* sends message first, if processor has even index it receives
2181
* message first. Then process is reversed. Also need to account
2182
* for whether or not there are an odd number of processors along
2183
* update direction. */
2186
printf("p[%d] 1: msgcnt: %d length: %d bufsize: %d proc_rem_snd: %d proc_rem_rcv: %d\n",
2187
GAme, msgcnt, length, bufsize, (int)proc_rem_snd, (int)proc_rem_rcv);
2190
snd_ptr = snd_ptr_orig;
2191
rcv_ptr = rcv_ptr_orig;
2192
if (GAme != proc_rem_snd) {
2193
if (GA[handle].nblock[idx]%2 == 0) {
2194
if (index[idx]%2 != 0) {
2195
armci_msg_snd(msgcnt, snd_ptr, length, proc_rem_snd);
2197
armci_msg_rcv(msgcnt, rcv_ptr, bufsize, &msglen, proc_rem_rcv);
2199
if (index[idx]%2 != 0) {
2200
armci_msg_rcv(msgcnt, rcv_ptr, bufsize, &msglen, proc_rem_rcv);
2202
armci_msg_snd(msgcnt, snd_ptr, length, proc_rem_snd);
2205
pmax = GA[handle].nblock[idx] - 1;
2206
if (index[idx]%2 != 0) {
2207
armci_msg_snd(msgcnt, snd_ptr, length, proc_rem_snd);
2208
} else if (index[idx] != pmax) {
2209
armci_msg_rcv(msgcnt, rcv_ptr, bufsize, &msglen, proc_rem_rcv);
2211
if (index[idx]%2 != 0) {
2212
armci_msg_rcv(msgcnt, rcv_ptr, bufsize, &msglen, proc_rem_rcv);
2213
} else if (index[idx] != 0) {
2214
armci_msg_snd(msgcnt, snd_ptr, length, proc_rem_snd);
2216
/* make up for odd processor at end of string */
2217
if (index[idx] == 0) {
2218
armci_msg_snd(msgcnt, snd_ptr, length, proc_rem_snd);
2220
if (index[idx] == pmax) {
2221
armci_msg_rcv(msgcnt, rcv_ptr, bufsize, &msglen, proc_rem_rcv);
2228
/* copy data back into global array */
2229
armci_read_strided(ptr_rcv, (int)ndim-1, stride_rcv, count, rcv_ptr);
2231
/* Find parameters for message in positive direction. */
2232
get_remote_block_pos(idx, ndim, lo_loc, hi_loc, slo_rcv, shi_rcv,
2234
/* locate processor with this data */
2235
if (!pnga_locate_region(g_a, slo_rcv, shi_rcv, _ga_map,
2236
GA_proclist, &np)) ga_RegionError(pnga_ndim(g_a),
2237
slo_rcv, shi_rcv, g_a);
2238
proc_rem_snd = GA_proclist[0];
2239
if (p_handle >= 0) {
2240
proc_rem_snd = PGRP_LIST[p_handle].inv_map_proc_list[proc_rem_snd];
2243
/* Find processor from which data will be recieved */
2244
for (i = 0; i < ndim; i++) {
2246
lo_rcv[i] = lo_loc[i] - width[i];
2247
hi_rcv[i] = lo_loc[i] - 1;
2249
lo_rcv[i] = lo_loc[i];
2250
hi_rcv[i] = hi_loc[i];
2254
/* Account for boundaries, if necessary. */
2255
for (i=0; i<ndim; i++) {
2257
if (hi_rcv[i] < 1) {
2258
slo_rcv[i] = dims[i] - width[i] + 1;
2259
shi_rcv[i] = dims[i];
2261
slo_rcv[i] = lo_rcv[i];
2262
shi_rcv[i] = hi_rcv[i];
2265
slo_rcv[i] = lo_rcv[i];
2266
shi_rcv[i] = hi_rcv[i];
2269
/* locate processor with this data */
2270
if (!pnga_locate_region(g_a, slo_rcv, shi_rcv, _ga_map,
2271
GA_proclist, &np)) ga_RegionError(pnga_ndim(g_a),
2272
slo_rcv, shi_rcv, g_a);
2273
proc_rem_rcv = GA_proclist[0];
2274
if (p_handle >= 0) {
2275
proc_rem_rcv = PGRP_LIST[p_handle].inv_map_proc_list[proc_rem_rcv];
2277
/* Get actual coordinates of chunk of data that will be sent to
2278
* remote processor as well as coordinates of the array space that
2279
* will receive data from remote processor. */
2280
for (i = 0; i < ndim; i++) {
2281
if (increment[i] == 0) {
2283
plo_snd[i] = hi_loc[i] - lo_loc[i] + 1;
2284
phi_snd[i] = hi_loc[i] - lo_loc[i] + width[i];
2286
phi_rcv[i] = width[i] - 1;
2288
plo_snd[i] = width[i];
2289
phi_snd[i] = hi_loc[i] - lo_loc[i] + width[i];
2290
plo_rcv[i] = width[i];
2291
phi_rcv[i] = hi_loc[i] - lo_loc[i] + width[i];
2295
phi_rcv[i] = hi_loc[i] - lo_loc[i] + increment[i];
2297
phi_snd[i] = hi_loc[i] - lo_loc[i] + increment[i];
2301
/* Get pointer to local data buffer and remote data
2302
buffer as well as lists of leading dimenstions */
2303
gam_LocationWithGhosts(me, handle, plo_snd, &ptr_snd, ld_loc);
2305
printf("p[%d] 2: plo_snd[0]: %d plo_snd[1]: %d ptr_snd: %d\n",
2306
GAme, plo_snd[0], plo_snd[1], (Integer)ptr_snd);
2309
gam_LocationWithGhosts(me, handle, plo_rcv, &ptr_rcv, ld_loc);
2311
printf("p[%d] 2: plo_rcv[0]: %d plo_rcv[1]: %d ptr_rcv: %d\n",
2312
GAme, plo_rcv[0], plo_rcv[1], (Integer)ptr_rcv);
2316
/* Evaluate strides for send and recieve */
2317
gam_setstride(ndim, size, ld_loc, ld_loc, stride_rcv,
2320
/* Compute the number of elements in each dimension and store
2321
result in count. Scale the first element in count by the
2323
gam_ComputeCount(ndim, plo_rcv, phi_rcv, count);
2324
gam_CountElems(ndim, plo_snd, phi_snd, &length);
2328
/* Need to reallocate memory if length > buflen */
2331
/* Fill send buffer with data. */
2333
printf("p[%d]b 2: ptr_snd: %d ptr_rcv: %d\n", GAme, (Integer)ptr_snd,
2335
printf("p[%d]b 2: length: %d proc_rem_snd: %d proc_rem_rcv: %d\n",
2336
GAme, (int)length, (int)proc_rem_snd, (int)proc_rem_rcv);
2337
printf("p[%d]b 2: count[0]: %d stride_rcv[0]: %d stride_rcv[1]: %d\n",
2338
GAme, count[0], stride_rcv[0],stride_rcv[1]);
2339
printf("p[%d]b 2: count[1]: %d stride_rcv[2]: %d stride_rcv[3]: %d\n",
2340
GAme, count[1], stride_rcv[2],stride_rcv[3]);
2341
printf("p[%d]b 2: count[2]: %d stride_snd[0]: %d stride_snd[1]: %d\n",
2342
GAme, count[2], stride_snd[0],stride_snd[1]);
2343
printf("p[%d]b 2: count[3]: %d stride_snd[2]: %d stride_snd[3]: %d\n",
2344
GAme, count[3], stride_snd[2],stride_snd[3]);
2345
printf("p[%d]b 2: snd_ptr: %d rcv_ptr: %d\n", GAme, (Integer)snd_ptr,
2349
armci_write_strided(ptr_snd, (int)ndim-1, stride_snd, count, snd_ptr);
2351
/* Send Messages. If processor has odd index in direction idx, it
2352
* sends message first, if processor has even index it receives
2353
* message first. Then process is reversed. Also need to account
2354
* for whether or not there are an odd number of processors along
2355
* update direction. */
2358
printf("p[%d] 2: msgcnt: %d length: %d bufsize: %d proc_rem_snd: %d proc_rem_rcv: %d\n",
2359
GAme, msgcnt, length, bufsize, (int)proc_rem_snd, (int)proc_rem_rcv);
2362
snd_ptr = snd_ptr_orig;
2363
rcv_ptr = rcv_ptr_orig;
2364
if (GAme != proc_rem_rcv) {
2365
if (GA[handle].nblock[idx]%2 == 0) {
2366
if (index[idx]%2 != 0) {
2367
armci_msg_snd(msgcnt, snd_ptr, length, proc_rem_snd);
2369
armci_msg_rcv(msgcnt, rcv_ptr, bufsize, &msglen, proc_rem_rcv);
2371
if (index[idx]%2 != 0) {
2372
armci_msg_rcv(msgcnt, rcv_ptr, bufsize, &msglen, proc_rem_rcv);
2374
armci_msg_snd(msgcnt, snd_ptr, length, proc_rem_snd);
2377
pmax = GA[handle].nblock[idx] - 1;
2378
if (index[idx]%2 != 0) {
2379
armci_msg_snd(msgcnt, snd_ptr, length, proc_rem_snd);
2380
} else if (index[idx] != 0) {
2381
armci_msg_rcv(msgcnt, rcv_ptr, bufsize, &msglen, proc_rem_rcv);
2383
if (index[idx]%2 != 0) {
2384
armci_msg_rcv(msgcnt, rcv_ptr, bufsize, &msglen, proc_rem_rcv);
2385
} else if (index[idx] != pmax) {
2386
armci_msg_snd(msgcnt, snd_ptr, length, proc_rem_snd);
2388
/* make up for odd processor at end of string */
2389
if (index[idx] == pmax) {
2390
armci_msg_snd(msgcnt, snd_ptr, length, proc_rem_snd);
2392
if (index[idx] == 0) {
2393
armci_msg_rcv(msgcnt, rcv_ptr, bufsize, &msglen, proc_rem_rcv);
2399
/* copy data back into global array */
2400
armci_read_strided(ptr_rcv, (int)ndim-1, stride_rcv, count, rcv_ptr);
2403
if (GA[handle].corner_flag)
2404
increment[idx] = 2*width[idx];
2407
ga_free(rcv_ptr_orig);
2408
ga_free(snd_ptr_orig);
2413
/* Utility function for ga_update5_ghosts routine */
2414
static inline double waitforflags (int *ptr1, int *ptr2)
2418
while (*ptr1 == 0 || *ptr2 == 0) {
2419
val = exp(-(double)i++);
2422
printf("%d: flags set at %p and %p\n",GAme,ptr1,ptr2); fflush(stdout);
2428
/* Stub in new ARMCI_PutS_flag call until actual implementation is
2430
int ARMCI_PutS_flag__(
2431
void* src_ptr, /* pointer to 1st segment at source */
2432
int src_stride_arr[], /* array of strides at source */
2433
void* dst_ptr, /* pointer to 1st segment at destination */
2434
int dst_stride_arr[], /* array of strides at destination */
2435
int count[], /* number of units at each stride level,
2436
count[0] = #bytes */
2437
int stride_levels, /* number of stride levels */
2438
int *flag, /* pointer to remote flag */
2439
int *val, /* pointer to value to set flag upon completion of
2441
int proc /* remote process(or) ID */
2445
/* Put local data on remote processor */
2446
ARMCI_PutS(src_ptr, src_stride_arr, dst_ptr, dst_stride_arr,
2447
count, stride_levels, proc);
2449
/* Send signal to remote processor that data transfer has
2450
* been completed. */
2451
bytes = sizeof(int);
2452
ARMCI_Put(val, flag, bytes, proc);
2457
/*\ UPDATE GHOST CELLS OF GLOBAL ARRAY USING SHIFT ALGORITHM AND PUT CALLS
2458
* WITHOUT ANY BARRIERS
2460
#if HAVE_SYS_WEAK_ALIAS_PRAGMA
2461
# pragma weak wnga_update55_ghosts = pnga_update55_ghosts
2463
logical pnga_update55_ghosts(Integer g_a)
2465
Integer idx, i, np, handle=GA_OFFSET + g_a, proc_rem;
2466
Integer size, ndim, nwidth, increment[MAXDIM];
2467
Integer width[MAXDIM];
2468
Integer dims[MAXDIM];
2469
Integer lo_loc[MAXDIM], hi_loc[MAXDIM];
2470
Integer plo_loc[MAXDIM]/*, phi_loc[MAXDIM]*/;
2471
Integer tlo_rem[MAXDIM], thi_rem[MAXDIM];
2472
Integer slo_rem[MAXDIM], shi_rem[MAXDIM];
2473
Integer plo_rem[MAXDIM], phi_rem[MAXDIM];
2474
Integer ld_loc[MAXDIM], ld_rem[MAXDIM];
2475
int stride_loc[MAXDIM], stride_rem[MAXDIM],count[MAXDIM];
2477
char *ptr_loc, *ptr_rem;
2478
Integer me = pnga_nodeid();
2481
/* This routine makes use of the shift algorithm to update data in the
2482
* ghost cells bounding the local block of visible data. The shift
2483
* algorithm starts by updating the blocks of data along the first
2484
* dimension by grabbing a block of data that is width[0] deep but
2485
* otherwise matches the dimensions of the data residing on the
2486
* calling processor. The update of the second dimension, however,
2487
* grabs a block that is width[1] deep in the second dimension but is
2488
* ldim0 + 2*width[0] in the first dimensions where ldim0 is the
2489
* size of the visible data along the first dimension. The remaining
2490
* dimensions are left the same. For the next update, the width of the
2491
* second dimension is also increased by 2*width[1] and so on. This
2492
* algorith makes use of the fact that data for the dimensions that
2493
* have already been updated is available on each processor and can be
2494
* used in the updates of subsequent dimensions. The total number of
2495
* separate updates is 2*ndim, an update in the negative and positive
2496
* directions for each dimension.
2498
* This operation is implemented using put calls to place the
2499
* appropriate data on remote processors. To signal the remote
2500
* processor that it has received the data, a second put call
2501
* consisting of a single integer is sent after the first put call and
2502
* used to update a signal buffer on the remote processor. Each
2503
* processor can determine how much data it has received by checking
2504
* its signal buffer.
2506
* To perform the update, this routine makes use of several copies of
2507
* indices marking the upper and lower limits of data. Indices
2508
* beginning with the character "p" are relative indices marking the
2509
* location of the data set relative to the origin the local patch of
2510
* the global array, all other indices are in absolute coordinates and
2511
* mark locations in the total global array. The indices used by this
2512
* routine are described below.
2514
* lo_loc[], hi_loc[]: The lower and upper indices of the visible
2515
* block of data held by the calling processor.
2517
* lo_rem[], hi_rem[]: The lower and upper indices of the block
2518
* of data on a remote processor or processors that is needed to
2519
* fill in the calling processors ghost cells. These indices are
2520
* NOT corrected for wrap-around (periodic) boundary conditions
2521
* so they can be negative or greater than the array dimension
2522
* values held in dims[].
2524
* slo_rem[], shi_rem[]: Similar to lo_rem[] and hi_rem[], except
2525
* that these indices have been corrected for wrap-around
2526
* boundary conditions.
2528
* thi_rem[], thi_rem[]: The lower and upper indices of the visible
2529
* data on a remote processor.
2531
* plo_loc[], phi_loc[]: The indices of the local data patch that
2532
* is going to be updated.
2534
* plo_rem[], phi_rem[]: The indices of the data patch on the
2535
* remote processor that will be used to update the data on the
2536
* calling processor. Note that the dimensions of the patches
2537
* represented by plo_loc[], plo_rem[] and plo_loc[], phi_loc[]
2541
/* if global array has no ghost cells, just return */
2542
if (!pnga_has_ghosts(g_a)) return TRUE;
2544
size = GA[handle].elemsize;
2545
ndim = GA[handle].ndim;
2546
p_handle = GA[handle].p_handle;
2548
/* initialize range increments and get array dimensions */
2549
for (idx=0; idx < ndim; idx++) {
2551
width[idx] = (Integer)GA[handle].width[idx];
2552
dims[idx] = (Integer)GA[handle].dims[idx];
2553
if (lo_loc[idx] == 0 && hi_loc[idx] == -1) return FALSE;
2556
/* Check to make sure that global array is well-behaved (all processors
2557
have data and the width of the data in each dimension is greater
2558
than the corresponding value in width[]. */
2559
if (!gai_check_ghost_distr(g_a)) return FALSE;
2561
GA_PUSH_NAME("ga_update55_ghosts");
2563
/* Get pointer to local memory */
2564
ptr_loc = GA[handle].ptr[GAme];
2565
/* obtain range of data that is held by local processor */
2566
pnga_distribution(g_a,me,lo_loc,hi_loc);
2568
/* loop over dimensions for sequential update using shift algorithm */
2570
(*GA_Update_Signal) = 1;
2571
for (idx=0; idx < ndim; idx++) {
2572
nwidth = width[idx];
2574
/* Do not bother with update if nwidth is zero */
2577
/* Perform update in negative direction. */
2578
get_remote_block_neg(idx, ndim, lo_loc, hi_loc, slo_rem, shi_rem,
2580
/* locate processor with this data */
2581
if (!pnga_locate_region(g_a, slo_rem, shi_rem, _ga_map,
2582
GA_proclist, &np)) ga_RegionError(pnga_ndim(g_a),
2583
slo_rem, shi_rem, g_a);
2585
/* Get actual coordinates of desired location of remote
2586
data as well as the actual coordinates of the local chunk
2587
of data that will be sent to remote processor (these
2588
coordinates take into account the presence of ghost
2589
cells). Start by finding out what data is actually held by
2590
remote processor. */
2591
proc_rem = GA_proclist[0];
2592
pnga_distribution(g_a, proc_rem, tlo_rem, thi_rem);
2593
for (i = 0; i < ndim; i++) {
2594
if (increment[i] == 0) {
2596
plo_rem[i] = thi_rem[i] - tlo_rem[i] + width[i] + 1;
2597
phi_rem[i] = thi_rem[i] - tlo_rem[i] + 2*width[i];
2598
plo_loc[i] = width[i];
2599
/*phi_loc[i] = 2*width[i] - 1;*/
2601
plo_rem[i] = width[i];
2602
phi_rem[i] = thi_rem[i] - tlo_rem[i] + width[i];
2603
plo_loc[i] = width[i];
2604
/*phi_loc[i] = hi_loc[i] - lo_loc[i] + width[i];*/
2608
phi_rem[i] = thi_rem[i] - tlo_rem[i] + increment[i];
2610
/*phi_loc[i] = hi_loc[i] - lo_loc[i] + increment[i];*/
2614
/* Get pointer to local data buffer and remote data
2615
buffer as well as lists of leading dimenstions */
2616
gam_LocationWithGhosts(me, handle, plo_loc, &ptr_loc, ld_loc);
2617
gam_LocationWithGhosts(proc_rem, handle, plo_rem, &ptr_rem, ld_rem);
2619
/* Evaluate strides on local and remote processors */
2620
gam_setstride(ndim, size, ld_loc, ld_rem, stride_rem,
2623
/* Compute the number of elements in each dimension and store
2624
result in count. Scale the first element in count by the
2626
gam_ComputeCount(ndim, plo_rem, phi_rem, count);
2629
/* Put local data on remote processor */
2630
if (p_handle >= 0) {
2631
proc_rem = PGRP_LIST[p_handle].inv_map_proc_list[proc_rem];
2634
ARMCI_PutS(ptr_loc, stride_loc, ptr_rem, stride_rem, count, ndim- 1, proc_rem);
2635
/* Send signal to remote processor that data transfer has been completed. */
2636
bytes = sizeof(int);
2637
ARMCI_Put(GA_Update_Signal, GA_Update_Flags[proc_rem]+msgcnt, bytes, proc_rem);
2639
ARMCI_PutS_flag(ptr_loc, stride_loc, ptr_rem, stride_rem, count,
2640
(int)(ndim - 1), GA_Update_Flags[proc_rem]+msgcnt,
2641
*GA_Update_Signal, (int)proc_rem);
2645
/* Perform update in positive direction. */
2646
get_remote_block_pos(idx, ndim, lo_loc, hi_loc, slo_rem, shi_rem,
2648
/* locate processor with this data */
2649
if (!pnga_locate_region(g_a, slo_rem, shi_rem, _ga_map,
2650
GA_proclist, &np)) ga_RegionError(pnga_ndim(g_a),
2651
slo_rem, shi_rem, g_a);
2653
/* Get actual coordinates of desired chunk of remote
2654
data as well as the actual coordinates of the local chunk
2655
of data that will receive the remote data (these
2656
coordinates take into account the presence of ghost
2657
cells). Start by finding out what data is actually held by
2658
remote processor. */
2659
proc_rem = GA_proclist[0];
2660
pnga_distribution(g_a, proc_rem, tlo_rem, thi_rem);
2661
for (i = 0; i < ndim; i++) {
2662
if (increment[i] == 0) {
2665
phi_rem[i] = width[i] - 1;
2666
plo_loc[i] = hi_loc[i] - lo_loc[i] + width[i] - 1;
2667
/*phi_loc[i] = hi_loc[i] - lo_loc[i] + 2*width[i] - 1;*/
2669
plo_rem[i] = width[i];
2670
phi_rem[i] = thi_rem[i] - tlo_rem[i] + width[i];
2671
plo_loc[i] = width[i];
2672
/*phi_loc[i] = hi_loc[i] - lo_loc[i] + width[i];*/
2676
phi_rem[i] = thi_rem[i] - tlo_rem[i] + increment[i];
2678
/*phi_loc[i] = hi_loc[i] - lo_loc[i] + increment[i];*/
2682
/* Get pointer to local data buffer and remote data
2683
buffer as well as lists of leading dimenstions */
2684
gam_LocationWithGhosts(GAme, handle, plo_loc, &ptr_loc, ld_loc);
2685
gam_LocationWithGhosts(proc_rem, handle, plo_rem, &ptr_rem, ld_rem);
2687
/* Evaluate strides on local and remote processors */
2688
gam_setstride(ndim, size, ld_loc, ld_rem, stride_rem,
2691
/* Compute the number of elements in each dimension and store
2692
result in count. Scale the first element in count by the
2694
gam_ComputeCount(ndim, plo_rem, phi_rem, count);
2697
/* Put local data on remote processor */
2698
if (p_handle >= 0) {
2699
proc_rem = PGRP_LIST[p_handle].inv_map_proc_list[proc_rem];
2702
ARMCI_PutS(ptr_loc, stride_loc, ptr_rem, stride_rem, count, ndim- 1, proc_rem);
2703
/* Send signal to remote processor that data transfer has been completed. */
2704
bytes = sizeof(int);
2705
ARMCI_Put(GA_Update_Signal, GA_Update_Flags[proc_rem]+msgcnt, bytes, proc_rem);
2708
ARMCI_PutS_flag(ptr_loc, stride_loc, ptr_rem, stride_rem, count,
2709
(int)(ndim - 1), GA_Update_Flags[proc_rem]+msgcnt,
2710
*GA_Update_Signal, (int)proc_rem);
2714
/* check to make sure that all messages have been recieved before
2715
starting update along new dimension */
2716
waitforflags((GA_Update_Flags[GAme]+msgcnt-2),
2717
(GA_Update_Flags[GAme]+msgcnt-1));
2718
/* update increment array */
2719
increment[idx] = 2*nwidth;
2722
/* set GA_Update_Flags array to zero for next update operation. */
2723
for (idx=0; idx < 2*ndim; idx++) {
2724
GA_Update_Flags[GAme][idx] = 0;
2731
/*\ UPDATE GHOST CELLS OF GLOBAL ARRAY ALONG ONE SIDE OF ARRAY
2733
#if HAVE_SYS_WEAK_ALIAS_PRAGMA
2734
# pragma weak wnga_update_ghost_dir = pnga_update_ghost_dir
2736
logical pnga_update_ghost_dir(Integer g_a, /* GA handle */
2737
Integer pdim, /* Dimension of update */
2738
Integer pdir, /* Direction of update (+/-1) */
2739
logical pflag) /* include corner cells */
2741
Integer idx, ipx, inx, np, handle=GA_OFFSET + g_a, proc_rem;
2742
Integer ntot, mask[MAXDIM],lmask[MAXDIM];
2743
Integer size, ndim, i, itmp, idim, idir;
2744
Integer width[MAXDIM], dims[MAXDIM];
2745
Integer lo_loc[MAXDIM], hi_loc[MAXDIM];
2746
Integer plo_loc[MAXDIM], phi_loc[MAXDIM];
2747
Integer tlo_rem[MAXDIM], thi_rem[MAXDIM];
2748
Integer plo_rem[MAXDIM]/*, phi_rem[MAXDIM]*/;
2749
Integer ld_loc[MAXDIM], ld_rem[MAXDIM];
2751
int stride_loc[MAXDIM], stride_rem[MAXDIM],count[MAXDIM];
2752
char *ptr_loc, *ptr_rem;
2753
Integer me = pnga_nodeid();
2756
int local_sync_begin,local_sync_end;
2758
local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
2759
_ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
2761
/* if global array has no ghost cells, just return */
2762
if (!pnga_has_ghosts(g_a))
2765
if(local_sync_begin)pnga_sync();
2770
size = GA[handle].elemsize;
2771
ndim = GA[handle].ndim;
2772
p_handle = GA[handle].p_handle;
2773
/* initialize ghost cell widths and get array dimensions */
2774
for (idx=0; idx < ndim; idx++) {
2775
width[idx] = (Integer)GA[handle].width[idx];
2776
dims[idx] = (Integer)GA[handle].dims[idx];
2779
/* Check to make sure that global array is well-behaved (all processors
2780
have data and the width of the data in each dimension is greater than
2781
the corresponding value in width[]). */
2783
for (idx = 0; idx < ndim; idx++) {
2784
for (np = 0; np < GA[handle].nblock[idx]; np++) {
2785
if (np < GA[handle].nblock[idx] - 1) {
2786
if (GA[handle].mapc[ipx+1]-GA[handle].mapc[ipx]+1<width[idx]) {
2790
if (GA[handle].dims[idx]-GA[handle].mapc[ipx]+1<width[idx]) {
2798
GA_PUSH_NAME("nga_update_ghost_dir");
2799
/* Get pointer to local memory */
2800
ptr_loc = GA[handle].ptr[GAme];
2801
/* obtain range of data that is held by local processor */
2802
pnga_distribution(g_a,me,lo_loc,hi_loc);
2804
/* evaluate total number of GET operations */
2807
for (idx=0; idx < ndim-1; idx++) ntot *= 3;
2810
/* Loop over all GET operations. */
2811
for (ipx=0; ipx < ntot; ipx++) {
2812
/* Convert ipx to corresponding mask values */
2815
for (idx = 0; idx < ndim-1; idx++) {
2821
for (idx = 0; idx < ndim-1; idx++) lmask[idx] = 0;
2824
for (idx = 0; idx < ndim; idx++) {
2825
if (idx == idim-1) {
2828
mask[idx] = lmask[inx];
2832
/* Now that mask has been determined, find processor that contains
2833
* data needed by the corresponding block of ghost cells */
2834
for (idx = 0; idx < ndim; idx++) {
2835
if (mask[idx] == 0) {
2836
tlo_rem[idx] = lo_loc[idx];
2837
thi_rem[idx] = hi_loc[idx];
2838
} else if (mask[idx] == -1) {
2839
if (lo_loc[idx] > 1) {
2840
tlo_rem[idx] = lo_loc[idx]-width[idx];
2841
thi_rem[idx] = lo_loc[idx]-1;
2843
tlo_rem[idx] = dims[idx]-width[idx]+1;
2844
thi_rem[idx] = dims[idx];
2846
} else if (mask[idx] == 1) {
2847
if (hi_loc[idx] < dims[idx]) {
2848
tlo_rem[idx] = hi_loc[idx] + 1;
2849
thi_rem[idx] = hi_loc[idx] + width[idx];
2852
thi_rem[idx] = width[idx];
2855
fprintf(stderr,"Illegal mask value found\n");
2858
/* Locate remote processor to which data must be sent */
2859
if (!pnga_locate_region(g_a, tlo_rem, thi_rem, _ga_map,
2860
GA_proclist, &np)) ga_RegionError(pnga_ndim(g_a),
2861
tlo_rem, thi_rem, g_a);
2863
fprintf(stderr,"More than one remote processor found\n");
2865
/* Remote processor has been identified, now get ready to get
2866
data from it. Start by getting distribution on remote
2868
proc_rem = GA_proclist[0];
2869
pnga_distribution(g_a, proc_rem, tlo_rem, thi_rem);
2870
for (idx = 0; idx < ndim; idx++) {
2871
if (mask[idx] == 0) {
2872
plo_loc[idx] = width[idx];
2873
phi_loc[idx] = hi_loc[idx]-lo_loc[idx]+width[idx];
2874
plo_rem[idx] = plo_loc[idx];
2875
/*phi_rem[idx] = phi_loc[idx];*/
2876
} else if (mask[idx] == -1) {
2878
phi_loc[idx] = width[idx]-1;
2879
plo_rem[idx] = thi_rem[idx]-tlo_rem[idx]+1;
2880
/*phi_rem[idx] = thi_rem[idx]-tlo_rem[idx]+width[idx];*/
2881
} else if (mask[idx] == 1) {
2882
plo_loc[idx] = hi_loc[idx]-lo_loc[idx]+width[idx]+1;
2883
phi_loc[idx] = hi_loc[idx]-lo_loc[idx]+2*width[idx];
2884
plo_rem[idx] = width[idx];
2885
/*phi_rem[idx] = 2*width[idx]-1;*/
2888
/* Get pointer to local data buffer and remote data
2889
buffer as well as lists of leading dimenstions */
2890
gam_LocationWithGhosts(me, handle, plo_loc, &ptr_loc, ld_loc);
2891
gam_LocationWithGhosts(proc_rem, handle, plo_rem, &ptr_rem, ld_rem);
2893
/* Evaluate strides on local and remote processors */
2894
gam_setstride(ndim, size, ld_loc, ld_rem, stride_rem,
2897
/* Compute the number of elements in each dimension and store
2898
result in count. Scale the first element in count by the
2900
gam_ComputeCount(ndim, plo_loc, phi_loc, count);
2903
/* get data from remote processor */
2904
if (p_handle >= 0) {
2905
proc_rem = PGRP_LIST[p_handle].inv_map_proc_list[proc_rem];
2907
ARMCI_GetS(ptr_rem, stride_rem, ptr_loc, stride_loc, count,
2908
(int)(ndim - 1), (int)proc_rem);
2912
if(local_sync_end)pnga_sync();
2916
/*uncomment for using message passing sendrecv in north south direction */
2917
/*#define USE_MP_NORTHSOUTH */
2920
/*\ UPDATE GHOST CELLS OF GLOBAL ARRAY USING PUT CALLS WITHOUT CORNERS AND
2921
* WITHOUT ANY BARRIERS
2923
#if HAVE_SYS_WEAK_ALIAS_PRAGMA
2924
# pragma weak wnga_update5_ghosts = pnga_update5_ghosts
2926
/*
 * pnga_update5_ghosts: update the ghost-cell regions of global array g_a
 * using the shift algorithm (described in the retained block comment below),
 * implemented with ARMCI_PutS_flag calls driven from the per-array cache
 * set up by pnga_set_update5_info. Returns TRUE on success, FALSE if the
 * cached remote pointer is NULL or the distribution check fails.
 *
 * NOTE(review): this fragment appears damaged by an automated extraction —
 * code lines alternate with bare line-number tokens (e.g. "2928") and gaps
 * in that numbering show that source lines (including closing braces and
 * the initialization/increment of msgcnt) are missing. The code below is
 * preserved byte-for-byte; restore this routine from version control
 * before attempting to compile it.
 */
logical pnga_update5_ghosts(Integer g_a)
2928
Integer idx, i, handle=GA_OFFSET + g_a;
2929
Integer /*size,*/ ndim, nwidth;
2930
Integer width[MAXDIM];
2931
Integer* proc_rem_ptr;
2932
int *stride_loc, *stride_rem,*count;
2933
/* NOTE(review): msgcnt is read below (GA_Update_Flags[...]+msgcnt) but no
   initialization or increment of it survives in this fragment — confirm
   against the original source. */
int msgcnt, corner_flag, proc_rem;
2935
char *ptr_loc, *ptr_rem,*cache;
2936
int local_sync_begin,local_sync_end;
2938
#ifdef USE_MP_NORTHSOUTH
2939
char send_name[32], rcv_name[32];
2940
void *snd_ptr, *rcv_ptr;
2942
/* This routine makes use of the shift algorithm to update data in the
2943
* ghost cells bounding the local block of visible data. The shift
2944
* algorithm starts by updating the blocks of data along the first
2945
* dimension by grabbing a block of data that is width[0] deep but
2946
* otherwise matches the dimensions of the data residing on the
2947
* calling processor. The update of the second dimension, however,
2948
* grabs a block that is width[1] deep in the second dimension but is
2949
* ldim0 + 2*width[0] in the first dimensions where ldim0 is the
2950
* size of the visible data along the first dimension. The remaining
2951
* dimensions are left the same. For the next update, the width of the
2952
* second dimension is also increased by 2*width[1] and so on. This
2953
* algorith makes use of the fact that data for the dimensions that
2954
* have already been updated is available on each processor and can be
2955
* used in the updates of subsequent dimensions. The total number of
2956
* separate updates is 2*ndim, an update in the negative and positive
2957
* directions for each dimension.
2959
* This operation is implemented using put calls to place the
2960
* appropriate data on remote processors. To signal the remote
2961
* processor that it has received the data, a second put call
2962
* consisting of a single integer is sent after the first put call and
2963
* used to update a signal buffer on the remote processor. Each
2964
* processor can determine how much data it has received by checking
2965
* its signal buffer.
2968
local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
2969
_ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
2970
if(local_sync_begin)pnga_sync();
2972
#ifdef USE_MP_NORTHSOUTH
2973
strcpy(send_name,"send_buffer");
2974
strcpy(rcv_name,"receive_buffer");
2976
snd_ptr = ga_malloc(buflen, GA[handle].type, send_name);
2977
rcv_ptr = ga_malloc(buflen, GA[handle].type, rcv_name);
2980
/* walk the precomputed transfer descriptors stored in the array's cache */
cache = (char *)GA[handle].cache;
2981
/* if global array has no ghost cells, just return */
2982
if (!pnga_has_ghosts(g_a)) return TRUE;
2984
/*size = GA[handle].elemsize;*/
2985
ndim = GA[handle].ndim;
2986
p_handle = GA[handle].p_handle;
2987
for (i=0; i<ndim; i++) {
2988
width[i] = (Integer)GA[handle].width[i];
2991
if (!gai_check_ghost_distr(g_a)) return FALSE;
2993
GA_PUSH_NAME("pnga_update5_ghosts");
2995
/* loop over dimensions for sequential update using shift algorithm */
2997
corner_flag = GA[handle].corner_flag;
2998
(*GA_Update_Signal) = 1;
2999
for (idx=0; idx < ndim; idx++) {
3000
nwidth = width[idx];
3003
/* Perform update in negative direction. */
3004
/* unpack one cached descriptor: remote ptr, local ptr, strides, counts,
   and remote processor id, laid out contiguously (see pnga_set_update5_info) */
ptr_rem = *(char **)(cache);
3005
if(ptr_rem==NULL) return FALSE;
3006
ptr_loc = *(char **)(cache+sizeof(char *));
3007
stride_loc = (int *)(cache+2*sizeof(char *));
3008
stride_rem = (int *)(stride_loc+ndim);
3009
count = (int *)(stride_rem+ndim);
3010
proc_rem_ptr = (Integer *)(count+ndim);
3011
proc_rem = (int)(*proc_rem_ptr);
3012
cache = (char *)(proc_rem_ptr+1);
3014
if (p_handle >= 0) {
3015
proc_rem = PGRP_LIST[p_handle].inv_map_proc_list[proc_rem];
3017
if(count[0]>1000000){
3018
/*tries to use armci direct put when possible */
3019
ARMCI_PutS_flag(ptr_loc, stride_loc, ptr_rem, stride_rem, count,
3020
(int)(ndim - 1), GA_Update_Flags[proc_rem]+msgcnt,
3021
*GA_Update_Signal, proc_rem);
3024
#ifndef USE_MP_NORTHSOUTH
3025
ARMCI_PutS_flag(ptr_loc, stride_loc, ptr_rem, stride_rem, count,
3026
(int)(ndim - 1), GA_Update_Flags[proc_rem]+msgcnt,
3027
*GA_Update_Signal, proc_rem);
3034
/* Perform update in positive direction. */
3035
ptr_rem = *(char **)(cache);
3036
ptr_loc = *(char **)(cache+sizeof(char *));
3037
stride_loc = (int *)(cache+2*sizeof(char *));
3038
stride_rem = (int *)(stride_loc+ndim);
3039
count = (int *)(stride_rem+ndim);
3040
proc_rem_ptr = (Integer *)(count+ndim);
3041
proc_rem = (int)(*proc_rem_ptr);
3042
cache = (char *)(proc_rem_ptr+1);
3044
if (p_handle >= 0) {
3045
proc_rem = PGRP_LIST[p_handle].inv_map_proc_list[proc_rem];
3047
if(count[0]>1000000){
3048
/*tries to use armci direct put when possible */
3049
ARMCI_PutS_flag(ptr_loc, stride_loc, ptr_rem, stride_rem, count,
3050
(int)(ndim - 1), GA_Update_Flags[proc_rem]+msgcnt,
3051
*GA_Update_Signal, proc_rem);
3054
#ifndef USE_MP_NORTHSOUTH
3055
ARMCI_PutS_flag(ptr_loc, stride_loc, ptr_rem, stride_rem, count,
3056
(int)(ndim - 1), GA_Update_Flags[proc_rem]+msgcnt,
3057
*GA_Update_Signal, proc_rem);
3066
/* check to make sure that last two messages have been recieved
3067
before starting update along a new dimension */
3068
waitforflags((GA_Update_Flags[GAme]+msgcnt-2),
3069
(GA_Update_Flags[GAme]+msgcnt-1));
3070
GA_Update_Flags[GAme][msgcnt-1]=0;
3071
GA_Update_Flags[GAme][msgcnt-2]=0;
3077
/* check to make sure that all messages have been recieved */
3079
waitforflags((GA_Update_Flags[GAme]+msgcnt-1),
3080
(GA_Update_Flags[GAme]+msgcnt-2));
3081
GA_Update_Flags[GAme][msgcnt-1]=0;
3082
GA_Update_Flags[GAme][msgcnt-2]=0;
3088
if(local_sync_end)pnga_sync();
3092
/*#define UPDATE_SAMENODE_GHOSTS_FIRST*/
3094
#if HAVE_SYS_WEAK_ALIAS_PRAGMA
3095
# pragma weak wnga_set_update5_info = pnga_set_update5_info
3097
/*
 * pnga_set_update5_info: precompute and cache, for each dimension and each
 * shift direction (negative then positive), the transfer descriptor used by
 * pnga_update5_ghosts: remote pointer, local pointer, stride arrays, count
 * array, and remote processor id, packed contiguously into GA[handle].cache.
 * Returns TRUE on success (or when the array has no ghosts), FALSE if the
 * ghost distribution check fails.
 *
 * NOTE(review): this fragment appears damaged by an automated extraction —
 * code lines alternate with bare line-number tokens and numbering gaps show
 * that lines are missing. In the visible code, cache_size, p_handle,
 * proc_rem, i, increment[] and scope are used without visible declarations
 * or initialization; they are presumably file-scope variables or declared
 * on lost lines — confirm against the original source before editing.
 * NOTE(review): the malloc of GA[handle].cache is unchecked in the visible
 * code; a NULL return would be dereferenced immediately below.
 */
logical pnga_set_update5_info(Integer g_a)
3101
Integer size, ndim, nwidth, increment[MAXDIM],np;
3102
Integer width[MAXDIM];
3103
Integer dims[MAXDIM];
3104
Integer lo_loc[MAXDIM], hi_loc[MAXDIM];
3105
Integer plo_loc[MAXDIM]/*, phi_loc[MAXDIM]*/;
3106
Integer tlo_rem[MAXDIM], thi_rem[MAXDIM];
3107
Integer slo_rem[MAXDIM], shi_rem[MAXDIM];
3108
Integer plo_rem[MAXDIM], phi_rem[MAXDIM];
3109
Integer ld_loc[MAXDIM], ld_rem[MAXDIM];
3110
int *stride_loc, *stride_rem,*count;
3111
int idx, corner_flag;
3112
char **ptr_loc, **ptr_rem,*cache;
3113
Integer handle = GA_OFFSET + g_a;
3115
#ifdef UPDATE_SAMENODE_GHOSTS_FIRST
3118
Integer me = pnga_nodeid();
3121
/* This routine sets up the arrays that are used to transfer data
3122
* using the update5 algorithm. The arrays begining with the character
3123
* "p" represent relative indices marking the location of the data set
3124
* relative to the origin the local patch of the global array, all
3125
* other indices are in absolute coordinates and mark locations in the
3126
* total global array. The indices used by this routine are described
3129
* lo_loc[], hi_loc[]: The lower and upper indices of the visible
3130
* block of data held by the calling processor.
3132
* lo_rem[], hi_rem[]: The lower and upper indices of the block
3133
* of data on a remote processor or processors that is needed to
3134
* fill in the calling processors ghost cells. These indices are
3135
* NOT corrected for wrap-around (periodic) boundary conditions
3136
* so they can be negative or greater than the array dimension
3137
* values held in dims[].
3139
* slo_rem[], shi_rem[]: Similar to lo_rem[] and hi_rem[], except
3140
* that these indices have been corrected for wrap-around
3141
* boundary conditions.
3143
* thi_rem[], thi_rem[]: The lower and upper indices of the visible
3144
* data on a remote processor.
3146
* plo_loc[], phi_loc[]: The indices of the local data patch that
3147
* is going to be updated.
3149
* plo_rem[], phi_rem[]: The indices of the data patch on the
3150
* remote processor that will be used to update the data on the
3151
* calling processor. Note that the dimensions of the patches
3152
* represented by plo_loc[], plo_rem[] and plo_loc[], phi_loc[]
3156
/* if global array has no ghost cells, just return */
3157
if (!pnga_has_ghosts(g_a)) return TRUE;
3159
/* Check to make sure that global array is well-behaved (all processors
3160
have data and the width of the data in each dimension is greater
3161
than the corresponding value in width[]. */
3162
if (!gai_check_ghost_distr(g_a)) return FALSE;
3164
ndim = GA[handle].ndim;
3165
p_handle = GA[handle].p_handle;
3166
size = GA[handle].elemsize;
3167
/* size one descriptor, then round up to whole doubles and allocate
   2*ndim descriptors (negative and positive direction per dimension) */
cache_size = 2*sizeof(char *)+3*sizeof(int)+sizeof(Integer);
3168
cache_size = 2*ndim*((cache_size/sizeof(double)) + 1);
3169
GA[handle].cache = (double *)malloc(sizeof(double)*cache_size);
3170
cache = (char *)GA[handle].cache;
3171
corner_flag = GA[handle].corner_flag;
3173
pnga_distribution(g_a,me,lo_loc,hi_loc);
3174
for (idx=0; idx < ndim; idx++) {
3176
width[idx] = (Integer)GA[handle].width[idx];
3177
dims[idx] = (Integer)GA[handle].dims[idx];
3178
/* lo==0 && hi==-1 is the "no data on this processor" convention */
if (lo_loc[idx] == 0 && hi_loc[idx] == -1){
3179
*(char **)cache = NULL;
3183
#ifdef UPDATE_SAMENODE_GHOSTS_FIRST
3184
for(scope=0;scope < 2; scope ++)
3186
for (idx=0; idx < ndim; idx++) {
3187
nwidth = width[idx];
3190
/* lay out the negative-direction descriptor at the current cache cursor */
ptr_rem = (char **)cache;
3191
ptr_loc = (char **)(cache+sizeof(char *));
3192
stride_loc = (int *)(cache+2*sizeof(char *));
3193
stride_rem = (int *)(stride_loc+ndim);
3194
count = (int *)(stride_rem+ndim);
3195
proc_rem = (Integer *)(count+ndim);
3197
get_remote_block_neg(idx, ndim, lo_loc, hi_loc, slo_rem, shi_rem,
3199
if (!pnga_locate_region(g_a, slo_rem, shi_rem, _ga_map,
3200
GA_proclist, &np)) ga_RegionError(pnga_ndim(g_a),
3201
slo_rem, shi_rem, g_a);
3203
*proc_rem = (Integer)GA_proclist[0];
3204
if (p_handle >= 0) {
3205
*proc_rem = PGRP_LIST[p_handle].inv_map_proc_list[*proc_rem];
3208
#ifdef UPDATE_SAMENODE_GHOSTS_FIRST
3209
if(scope == 0 && ARMCI_Same_node(*proc_rem))
3213
cache = (char *)(proc_rem+1);
3215
pnga_distribution(g_a, *proc_rem, tlo_rem, thi_rem);
3218
for (i = 0; i < ndim; i++) {
3219
if (increment[i] == 0) {
3221
plo_rem[i] = thi_rem[i] - tlo_rem[i] + width[i] + 1;
3222
phi_rem[i] = thi_rem[i] - tlo_rem[i] + 2*width[i];
3223
plo_loc[i] = width[i];
3224
/*phi_loc[i] = 2*width[i] - 1;*/
3226
plo_rem[i] = width[i];
3227
phi_rem[i] = thi_rem[i] - tlo_rem[i] + width[i];
3228
plo_loc[i] = width[i];
3229
/*phi_loc[i] = hi_loc[i] - lo_loc[i] + width[i];*/
3233
phi_rem[i] = thi_rem[i] - tlo_rem[i] + increment[i];
3235
/*phi_loc[i] = hi_loc[i] - lo_loc[i] + increment[i];*/
3238
gam_LocationWithGhosts(me, handle, plo_loc, ptr_loc, ld_loc);
3239
gam_LocationWithGhosts(*proc_rem, handle, plo_rem, ptr_rem, ld_rem);
3241
/* Evaluate strides on local and remote processors */
3242
gam_setstride(ndim, size, ld_loc, ld_rem, stride_rem,
3244
gam_ComputeCount(ndim, plo_rem, phi_rem, count);
3246
if (p_handle >= 0) {
3247
*proc_rem = PGRP_LIST[p_handle].inv_map_proc_list[*proc_rem];
3250
#ifdef UPDATE_SAMENODE_GHOSTS_FIRST
3254
/*BJP proc_rem++; */
3255
/* lay out the positive-direction descriptor at the current cache cursor */
ptr_rem = (char **)cache;
3256
ptr_loc = (char **)(cache+sizeof(char *));
3257
stride_loc = (int *)(cache+2*sizeof(char *));
3258
stride_rem = (int *)(stride_loc+ndim);
3259
count = (int *)(stride_rem+ndim);
3260
proc_rem = (Integer *)(count+ndim);
3262
get_remote_block_pos(idx, ndim, lo_loc, hi_loc, slo_rem, shi_rem,
3265
if (!pnga_locate_region(g_a, slo_rem, shi_rem, _ga_map,
3266
GA_proclist, &np)) ga_RegionError(pnga_ndim(g_a),
3267
slo_rem, shi_rem, g_a);
3269
*proc_rem = (Integer)GA_proclist[0];
3270
if (p_handle >= 0) {
3271
*proc_rem = PGRP_LIST[p_handle].inv_map_proc_list[*proc_rem];
3274
#ifdef UPDATE_SAMENODE_GHOSTS_FIRST
3275
if(scope == 0 && ARMCI_Same_node(*proc_rem))
3279
cache = (char *)(proc_rem+1);
3281
pnga_distribution(g_a, *proc_rem, tlo_rem, thi_rem);
3285
for (i = 0; i < ndim; i++) {
3286
if (increment[i] == 0) {
3289
phi_rem[i] = width[i] - 1;
3290
plo_loc[i] = hi_loc[i] - lo_loc[i] + width[i] - 1;
3291
/*phi_loc[i] = hi_loc[i] - lo_loc[i] + 2*width[i] - 1;*/
3293
plo_rem[i] = width[i];
3294
phi_rem[i] = thi_rem[i] - tlo_rem[i] + width[i];
3295
plo_loc[i] = width[i];
3296
/*phi_loc[i] = hi_loc[i] - lo_loc[i] + width[i];*/
3300
phi_rem[i] = thi_rem[i] - tlo_rem[i] + increment[i];
3302
/*phi_loc[i] = hi_loc[i] - lo_loc[i] + increment[i];*/
3307
gam_LocationWithGhosts(GAme, handle, plo_loc, ptr_loc, ld_loc);
3308
gam_LocationWithGhosts(*proc_rem, handle, plo_rem, ptr_rem, ld_rem);
3310
gam_setstride(ndim, size, ld_loc, ld_rem, stride_rem,
3313
gam_ComputeCount(ndim, plo_rem, phi_rem, count);
3315
if (p_handle >= 0) {
3316
*proc_rem = PGRP_LIST[p_handle].inv_map_proc_list[*proc_rem];
3320
/* widen the already-updated dimension for subsequent passes */
increment[idx] = 2*nwidth;
3326
/*\ UPDATE GHOST CELLS OF GLOBAL ARRAY USING SHIFT ALGORITHM
3328
#if HAVE_SYS_WEAK_ALIAS_PRAGMA
3329
# pragma weak wnga_update_ghosts = pnga_update_ghosts
3331
/*
 * pnga_update_ghosts: top-level wrapper for ghost-cell updates. Tries the
 * optimized update5 algorithm first, falls back to update4, and finally to
 * the slow-but-robust update1 if both fail. Honors the _ga_sync_begin /
 * _ga_sync_end masking flags and resets them for the next call.
 *
 * NOTE(review): this fragment appears damaged by an automated extraction —
 * code lines alternate with bare line-number tokens and the closing braces
 * of the fallback chain are missing. Code is preserved byte-for-byte.
 */
void pnga_update_ghosts(Integer g_a)
3333
/* Wrapper program for ghost cell update operations. If optimized
3334
update operation fails then use slow but robust version of
3336
int local_sync_begin,local_sync_end;
3338
local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
3339
_ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
3340
if(local_sync_begin)pnga_sync();
3343
/* fallback chain: update5 -> update4 -> update1 */
if (!pnga_update5_ghosts(g_a))
3345
if (!pnga_update4_ghosts(g_a))
3348
pnga_update1_ghosts(g_a);
3351
if(local_sync_end)pnga_sync();
3354
/* Utility function for ga_update6_ghosts routine */
3355
/*
 * waitformixedflags: utility for pnga_update6_ghosts. Spin-waits until each
 * flagged location becomes nonzero: while flag1 is set and *ptr1 is still 0,
 * or flag2 is set and *ptr2 is still 0, keep polling. The exp() call appears
 * to be busy-work to keep the loop from being optimized away and to return
 * a value — TODO confirm against the original source.
 *
 * NOTE(review): this fragment appears damaged by extraction — `val` and `i`
 * are used without visible declarations and no return statement or closing
 * brace survives here. Code is preserved byte-for-byte.
 */
static double waitformixedflags (int flag1, int flag2, int *ptr1, int *ptr2) {
3358
while ((flag1 && *ptr1 == 0) ||
3359
(flag2 && *ptr2 == 0)) {
3360
val = exp(-(double)i++);
3365
/*\ UPDATE GHOST CELLS OF GLOBAL ARRAY USING SHIFT ALGORITHM AND
3368
#if HAVE_SYS_WEAK_ALIAS_PRAGMA
3369
# pragma weak wnga_update6_ghosts = pnga_update6_ghosts
3371
/*
 * pnga_update6_ghosts: update ghost cells using the shift algorithm with a
 * hybrid transport — explicit message passing (armci_msg_snd/armci_msg_rcv
 * through staging buffers) between processors on different nodes, and
 * ARMCI strided puts plus a completion flag between processors on the same
 * node (see the retained block comment below for the full description).
 * Returns TRUE on success, FALSE if the ghost distribution check fails.
 *
 * NOTE(review): this fragment appears damaged by an automated extraction —
 * code lines alternate with bare line-number tokens and numbering gaps show
 * that source lines are missing. In the visible code, buftot, buflen,
 * msgcnt, increment[], bytes and msglen are used without visible
 * initialization or declaration; closing braces and several else branches
 * are also missing. Code is preserved byte-for-byte; restore this routine
 * from version control before attempting to compile it.
 */
logical pnga_update6_ghosts(Integer g_a)
3373
Integer idx, idir, i, np, handle=GA_OFFSET + g_a;
3374
Integer size, buflen, buftot, bufsize, ndim, increment[MAXDIM];
3375
Integer proc_rem_snd, proc_rem_rcv, pmax;
3376
Integer msgcnt, length;
3377
Integer width[MAXDIM], dims[MAXDIM], index[MAXDIM];
3378
Integer lo_loc[MAXDIM], hi_loc[MAXDIM];
3379
Integer plo_rem[MAXDIM]/*, phi_rem[MAXDIM]*/;
3380
Integer tlo_rem[MAXDIM], thi_rem[MAXDIM];
3381
Integer plo_snd[MAXDIM], phi_snd[MAXDIM];
3382
Integer lo_rcv[MAXDIM], hi_rcv[MAXDIM];
3383
Integer slo_rcv[MAXDIM], shi_rcv[MAXDIM];
3384
Integer plo_rcv[MAXDIM], phi_rcv[MAXDIM];
3385
Integer ld_loc[MAXDIM], ld_rem[MAXDIM];
3387
int stride_snd[MAXDIM], stride_rcv[MAXDIM],count[MAXDIM];
3388
int stride_rem[MAXDIM];
3389
int flag1=0, flag2=0, sprocflag, rprocflag;
3390
char *ptr_snd, *ptr_rcv;
3391
char /* *ptr_loc,*/ *ptr_rem;
3392
char send_name[32], rcv_name[32];
3393
void *snd_ptr, *rcv_ptr, *snd_ptr_orig, *rcv_ptr_orig;
3394
Integer me = pnga_nodeid();
3395
Integer p_handle, wproc;
3397
/* This routine makes use of the shift algorithm to update data in the
3398
* ghost cells bounding the local block of visible data. The shift
3399
* algorithm starts by updating the blocks of data along the first
3400
* dimension by grabbing a block of data that is width[0] deep but
3401
* otherwise matches the dimensions of the data residing on the
3402
* calling processor. The update of the second dimension, however,
3403
* grabs a block that is width[1] deep in the second dimension but is
3404
* ldim0 + 2*width[0] in the first dimensions where ldim0 is the
3405
* size of the visible data along the first dimension. The remaining
3406
* dimensions are left the same. For the next update, the width of the
3407
* second dimension is also increased by 2*width[1] and so on. This
3408
* algorith makes use of the fact that data for the dimensions that
3409
* have already been updated is available on each processor and can be
3410
* used in the updates of subsequent dimensions. The total number of
3411
* separate updates is 2*ndim, an update in the negative and positive
3412
* directions for each dimension.
3414
* This implementation make use of a combination of explicit message
3415
* passing between processors on different nodes and shared memory
3416
* copies with an additional flag between processors on the same node
3417
* to perform the update. Separate message types for the messages and
3418
* the use of the additional flag are for the updates in each
3419
* coordinate direction are used to maintain synchronization locally
3420
* and to guarantee that the data is present before the updates in a
3421
* new coordinate direction take place.
3423
* To perform the update, this routine makes use of several copies of
3424
* indices marking the upper and lower limits of data. Indices
3425
* beginning with the character "p" are relative indices marking the
3426
* location of the data set relative to the origin the local patch of
3427
* the global array, all other indices are in absolute coordinates and
3428
* mark locations in the total global array. The indices used by this
3429
* routine are described below.
3431
* lo_loc[], hi_loc[]: The lower and upper indices of the visible
3432
* block of data held by the calling processor.
3434
* lo_rcv[], hi_rcv[]: The lower and upper indices of the blocks
3435
* of data that will be either sent to or received from a remote
3436
* processor. These indices are NOT corrected for wrap-around
3437
* (periodic) boundary conditions so they can be negative or greater
3438
* than the array dimension values held in dims[].
3440
* slo_rcv[], shi_rcv[]: Similar to lo_rcv[] and hi_rcv[], except
3441
* that these indices have been corrected for wrap-around
3442
* boundary conditions.
3444
* plo_rcv[], phi_rcv[]: The local indices of the local data patch
3445
* that receive that message from the remote processor.
3447
* plo_snd[], phi_snd[]: The local indices of the data patch
3448
* that will be sent to the remote processor. Note that the
3449
* dimensions of the patches represented by plo_rec[], plo_rec[] and
3450
* plo_snd[], phi_snd[] must be the same.
3452
* tlo_rem[], thi_rem[]: The indices of the locally held visible
3453
* portion of the global array on the remote processor that will be
3454
* receiving the data using a shared memory copy.
3456
* plo_rem[], phi_rem[]: The local indices of the coordinate patch
3457
* that will be put on the remote processor using a shared memory
3461
/* if global array has no ghost cells, just return */
3462
if (!pnga_has_ghosts(g_a)) return TRUE;
3464
size = GA[handle].elemsize;
3465
ndim = GA[handle].ndim;
3466
p_handle = GA[handle].p_handle;
3468
/* initialize range increments and get array dimensions */
3469
for (idx=0; idx < ndim; idx++) {
3471
width[idx] = (Integer)GA[handle].width[idx];
3472
dims[idx] = (Integer)GA[handle].dims[idx];
3475
/* Check to make sure that global array is well-behaved (all processors
3476
have data and the width of the data in each dimension is greater
3477
than the corresponding value in width[]. */
3478
if (!gai_check_ghost_distr(g_a)) return FALSE;
3480
GA_PUSH_NAME("ga_update6_ghosts");
3483
/* Get pointer to local memory */
3484
/*ptr_loc = GA[handle].ptr[me];*/
3485
/* obtain range of data that is held by local processor */
3486
pnga_distribution(g_a,me,lo_loc,hi_loc);
3487
/* Get indices of processor in virtual grid */
3488
pnga_proc_topology(g_a, me, index);
3490
/* Try to find maximum size of message that will be sent during
3491
* update operations and use this to allocate memory for message
3492
* passing buffers. */
3494
for (i=0; i<ndim; i++) {
3495
buftot *= (hi_loc[i]-lo_loc[i] + 1 + 2*width[i]);
3498
for (i = 0; i < ndim; i++) {
3499
idir = hi_loc[i] - lo_loc[i] + 1;
3500
if (buflen < (buftot/(idir + 2*width[i]))*width[i]) {
3501
buflen = (buftot/(idir + 2*width[i]))*width[i];
3504
bufsize = size*buflen;
3505
strcpy(send_name,"send_buffer");
3506
strcpy(rcv_name,"receive_buffer");
3507
/* *_orig pointers are kept so the buffers can be freed at the end */
snd_ptr_orig = snd_ptr = ga_malloc(buflen, GA[handle].type, send_name);
3508
rcv_ptr_orig = rcv_ptr = ga_malloc(buflen, GA[handle].type, rcv_name);
3510
/* loop over dimensions for sequential update using shift algorithm */
3512
(*GA_Update_Signal) = 1;
3513
for (idx=0; idx < ndim; idx++) {
3515
/* Do not bother with update if nwidth is zero */
3516
if (width[idx] != 0) {
3518
/* Perform update in negative direction. */
3519
get_remote_block_neg(idx, ndim, lo_loc, hi_loc, slo_rcv, shi_rcv,
3521
/* locate processor with this data */
3522
if (!pnga_locate_region(g_a, slo_rcv, shi_rcv, _ga_map,
3523
GA_proclist, &np)) ga_RegionError(pnga_ndim(g_a),
3524
slo_rcv, shi_rcv, g_a);
3525
/* find out if this processor is on the same node */
3526
wproc = GA_proclist[0];
3527
if (p_handle >= 0) {
3528
wproc = PGRP_LIST[p_handle].inv_map_proc_list[wproc];
3530
rprocflag = ARMCI_Same_node(wproc);
3531
proc_rem_snd = GA_proclist[0];
3533
/* Find processor from which data will be received */
3534
for (i = 0; i < ndim; i++) {
3536
lo_rcv[i] = hi_loc[i] + 1;
3537
hi_rcv[i] = hi_loc[i] + width[i];
3539
lo_rcv[i] = lo_loc[i];
3540
hi_rcv[i] = hi_loc[i];
3544
/* Account for boundaries, if necessary. */
3545
for (i=0; i<ndim; i++) {
3547
if (hi_rcv[i] > dims[i]) {
3549
shi_rcv[i] = width[i];
3551
slo_rcv[i] = lo_rcv[i];
3552
shi_rcv[i] = hi_rcv[i];
3555
slo_rcv[i] = lo_rcv[i];
3556
shi_rcv[i] = hi_rcv[i];
3559
/* locate processor with this data */
3560
if (!pnga_locate_region(g_a, slo_rcv, shi_rcv, _ga_map,
3561
GA_proclist, &np)) ga_RegionError(pnga_ndim(g_a),
3562
slo_rcv, shi_rcv, g_a);
3563
wproc = GA_proclist[0];
3564
if (p_handle >= 0) {
3565
wproc = PGRP_LIST[p_handle].inv_map_proc_list[wproc];
3567
sprocflag = ARMCI_Same_node(wproc);
3568
proc_rem_rcv = GA_proclist[0];
3569
pnga_distribution(g_a, proc_rem_rcv, tlo_rem, thi_rem);
3571
/* Get actual coordinates of chunk of data that will be sent to
3572
* remote processor as well as coordinates of the array space that
3573
* will receive data from remote processor. */
3574
for (i = 0; i < ndim; i++) {
3575
if (increment[i] == 0) {
3577
plo_snd[i] = width[i];
3578
phi_snd[i] = 2*width[i] - 1;
3579
plo_rcv[i] = hi_loc[i] - lo_loc[i] + width[i] + 1;
3580
phi_rcv[i] = hi_loc[i] - lo_loc[i] + 2*width[i];
3581
plo_rem[i] = thi_rem[i] - tlo_rem[i] + width[i] + 1;
3582
/*phi_rem[i] = thi_rem[i] - tlo_rem[i] + 2*width[i];*/
3584
plo_snd[i] = width[i];
3585
phi_snd[i] = hi_loc[i] - lo_loc[i] + width[i];
3586
plo_rcv[i] = width[i];
3587
phi_rcv[i] = hi_loc[i] - lo_loc[i] + width[i];
3588
plo_rem[i] = width[i];
3589
/*phi_rem[i] = thi_rem[i] - tlo_rem[i] + width[i];*/
3593
phi_snd[i] = hi_loc[i] - lo_loc[i] + increment[i];
3595
phi_rcv[i] = hi_loc[i] - lo_loc[i] + increment[i];
3597
/*phi_rem[i] = thi_rem[i] - tlo_rem[i] + increment[i];*/
3601
/* Get pointer to local data buffer and remote data
3602
buffer as well as lists of leading dimenstions */
3603
gam_LocationWithGhosts(me, handle, plo_snd, &ptr_snd, ld_loc);
3604
gam_LocationWithGhosts(me, handle, plo_rcv, &ptr_rcv, ld_loc);
3605
gam_LocationWithGhosts(proc_rem_snd, handle, plo_rem, &ptr_rem, ld_rem);
3607
/* Evaluate strides for send and receive */
3608
gam_setstride(ndim, size, ld_loc, ld_loc, stride_rcv,
3610
gam_setstride(ndim, size, ld_loc, ld_rem, stride_rem,
3613
/* Compute the number of elements in each dimension and store
3614
result in count. Scale the first element in count by the
3616
gam_ComputeCount(ndim, plo_rcv, phi_rcv, count);
3617
gam_CountElems(ndim, plo_snd, phi_snd, &length);
3621
/* If we are sending data to another node, then use message passing */
3623
/* Fill send buffer with data. */
3624
armci_write_strided(ptr_snd, (int)ndim-1, stride_snd, count, snd_ptr);
3627
/* Send Messages. If processor has odd index in direction idx, it
3628
* sends message first, if processor has even index it receives
3629
* message first. Then process is reversed. Also need to account
3630
* for whether or not there are an odd number of processors along
3631
* update direction. */
3633
if (p_handle >= 0) {
3634
proc_rem_snd = PGRP_LIST[p_handle].inv_map_proc_list[proc_rem_snd];
3635
proc_rem_rcv = PGRP_LIST[p_handle].inv_map_proc_list[proc_rem_rcv];
3637
/* even processor count along this dimension: simple odd/even pairing */
if (GA[handle].nblock[idx]%2 == 0) {
3638
if (index[idx]%2 != 0 && !rprocflag) {
3639
armci_msg_snd(msgcnt, snd_ptr, length, proc_rem_snd);
3640
} else if (index[idx]%2 == 0 && !sprocflag) {
3641
armci_msg_rcv(msgcnt, rcv_ptr, bufsize, &msglen, proc_rem_rcv);
3645
ARMCI_PutS(ptr_snd, stride_snd, ptr_rem, stride_rem, count, ndim- 1,
3647
/* Send signal to remote processor that data transfer has been completed. */
3648
bytes = sizeof(int);
3649
ARMCI_Put(GA_Update_Signal, GA_Update_Flags[proc_rem_snd]+msgcnt, bytes,
3652
ARMCI_PutS_flag(ptr_snd, stride_snd, ptr_rem, stride_rem, count,
3653
(int)(ndim-1), GA_Update_Flags[proc_rem_snd]+msgcnt,
3654
*GA_Update_Signal, (int)proc_rem_snd);
3657
if (index[idx]%2 != 0 && !sprocflag) {
3658
armci_msg_rcv(msgcnt, rcv_ptr, bufsize, &msglen, proc_rem_rcv);
3659
} else if (index[idx]%2 == 0 && !rprocflag) {
3660
armci_msg_snd(msgcnt, snd_ptr, length, proc_rem_snd);
3663
/* account for wrap-around boundary condition, if necessary */
3664
pmax = GA[handle].nblock[idx] - 1;
3665
if (index[idx]%2 != 0 && !rprocflag) {
3666
armci_msg_snd(msgcnt, snd_ptr, length, proc_rem_snd);
3667
} else if (index[idx]%2 == 0 && index[idx] != pmax && !sprocflag) {
3668
armci_msg_rcv(msgcnt, rcv_ptr, bufsize, &msglen, proc_rem_rcv);
3672
ARMCI_PutS(ptr_snd, stride_snd, ptr_rem, stride_rem, count, ndim- 1,
3674
/* Send signal to remote processor that data transfer has been completed. */
3675
bytes = sizeof(int);
3676
ARMCI_Put(GA_Update_Signal, GA_Update_Flags[proc_rem_snd]+msgcnt, bytes,
3679
ARMCI_PutS_flag(ptr_snd, stride_snd, ptr_rem, stride_rem, count,
3680
(int)(ndim-1), GA_Update_Flags[proc_rem_snd]+msgcnt,
3681
*GA_Update_Signal, (int)proc_rem_snd);
3684
if (index[idx]%2 != 0 && !sprocflag) {
3685
armci_msg_rcv(msgcnt, rcv_ptr, bufsize, &msglen, proc_rem_rcv);
3686
} else if (index[idx]%2 == 0 && index[idx] != 0 && !rprocflag) {
3687
armci_msg_snd(msgcnt, snd_ptr, length, proc_rem_snd);
3689
/* make up for odd processor at end of string */
3690
if (index[idx] == 0 && !rprocflag) {
3691
armci_msg_snd(msgcnt, snd_ptr, length, proc_rem_snd);
3693
if (index[idx] == pmax && !sprocflag) {
3694
armci_msg_rcv(msgcnt, rcv_ptr, bufsize, &msglen, proc_rem_rcv);
3703
/* copy data back into global array */
3705
armci_read_strided(ptr_rcv, (int)ndim-1, stride_rcv, count, rcv_ptr);
3708
/* Find parameters for message in positive direction. */
3709
get_remote_block_pos(idx, ndim, lo_loc, hi_loc, slo_rcv, shi_rcv,
3711
/* locate processor with this data */
3712
if (!pnga_locate_region(g_a, slo_rcv, shi_rcv, _ga_map,
3713
GA_proclist, &np)) ga_RegionError(pnga_ndim(g_a),
3714
slo_rcv, shi_rcv, g_a);
3715
wproc = GA_proclist[0];
3716
if (p_handle >= 0) {
3717
wproc = PGRP_LIST[p_handle].inv_map_proc_list[wproc];
3719
rprocflag = ARMCI_Same_node(wproc);
3720
proc_rem_snd = GA_proclist[0];
3722
/* Find processor from which data will be recieved */
3723
for (i = 0; i < ndim; i++) {
3725
lo_rcv[i] = lo_loc[i] - width[i];
3726
hi_rcv[i] = lo_loc[i] - 1;
3728
lo_rcv[i] = lo_loc[i];
3729
hi_rcv[i] = hi_loc[i];
3733
/* Account for boundaries, if necessary. */
3734
for (i=0; i<ndim; i++) {
3736
if (hi_rcv[i] < 1) {
3737
slo_rcv[i] = dims[i] - width[i] + 1;
3738
shi_rcv[i] = dims[i];
3740
slo_rcv[i] = lo_rcv[i];
3741
shi_rcv[i] = hi_rcv[i];
3744
slo_rcv[i] = lo_rcv[i];
3745
shi_rcv[i] = hi_rcv[i];
3748
/* locate processor with this data */
3749
if (!pnga_locate_region(g_a, slo_rcv, shi_rcv, _ga_map,
3750
GA_proclist, &np)) ga_RegionError(pnga_ndim(g_a),
3751
slo_rcv, shi_rcv, g_a);
3752
wproc = GA_proclist[0];
3753
if (p_handle >= 0) {
3754
wproc = PGRP_LIST[p_handle].inv_map_proc_list[wproc];
3756
sprocflag = ARMCI_Same_node(wproc);
3757
proc_rem_rcv = GA_proclist[0];
3758
pnga_distribution(g_a, proc_rem_rcv, tlo_rem, thi_rem);
3760
/* Get actual coordinates of chunk of data that will be sent to
3761
* remote processor as well as coordinates of the array space that
3762
* will receive data from remote processor. */
3763
for (i = 0; i < ndim; i++) {
3764
if (increment[i] == 0) {
3766
plo_snd[i] = hi_loc[i] - lo_loc[i] + 1;
3767
phi_snd[i] = hi_loc[i] - lo_loc[i] + width[i];
3769
phi_rcv[i] = width[i] - 1;
3771
/*phi_rem[i] = width[i] - 1;*/
3773
plo_snd[i] = width[i];
3774
phi_snd[i] = hi_loc[i] - lo_loc[i] + width[i];
3775
plo_rcv[i] = width[i];
3776
phi_rcv[i] = hi_loc[i] - lo_loc[i] + width[i];
3777
plo_rem[i] = width[i];
3778
/*phi_rem[i] = thi_rem[i] - tlo_rem[i] + width[i];*/
3782
phi_snd[i] = hi_loc[i] - lo_loc[i] + increment[i];
3784
phi_rcv[i] = hi_loc[i] - lo_loc[i] + increment[i];
3786
/*phi_rem[i] = thi_rem[i] - tlo_rem[i] + increment[i];*/
3790
/* Get pointer to local data buffer and remote data
3791
buffer as well as lists of leading dimenstions */
3792
gam_LocationWithGhosts(me, handle, plo_snd, &ptr_snd, ld_loc);
3793
gam_LocationWithGhosts(me, handle, plo_rcv, &ptr_rcv, ld_loc);
3794
gam_LocationWithGhosts(proc_rem_snd, handle, plo_rem, &ptr_rem, ld_rem);
3796
/* Evaluate strides for send and recieve */
3797
gam_setstride(ndim, size, ld_loc, ld_loc, stride_rcv,
3799
gam_setstride(ndim, size, ld_loc, ld_rem, stride_rem,
3802
/* Compute the number of elements in each dimension and store
3803
result in count. Scale the first element in count by the
3805
gam_ComputeCount(ndim, plo_rcv, phi_rcv, count);
3806
gam_CountElems(ndim, plo_snd, phi_snd, &length);
3810
/* if we are sending data to another node, use message passing */
3812
/* Fill send buffer with data. */
3813
armci_write_strided(ptr_snd, (int)ndim-1, stride_snd, count, snd_ptr);
3816
/* Send Messages. If processor has odd index in direction idx, it
3817
* sends message first, if processor has even index it receives
3818
* message first. Then process is reversed. Also need to account
3819
* for whether or not there are an odd number of processors along
3820
* update direction. */
3822
if (p_handle >= 0) {
3823
proc_rem_snd = PGRP_LIST[p_handle].inv_map_proc_list[proc_rem_snd];
3824
proc_rem_rcv = PGRP_LIST[p_handle].inv_map_proc_list[proc_rem_rcv];
3826
if (GA[handle].nblock[idx]%2 == 0) {
3827
if (index[idx]%2 != 0 && !rprocflag) {
3828
armci_msg_snd(msgcnt, snd_ptr, length, proc_rem_snd);
3829
} else if (index[idx]%2 == 0 && !sprocflag) {
3830
armci_msg_rcv(msgcnt, rcv_ptr, bufsize, &msglen, proc_rem_rcv);
3834
ARMCI_PutS(ptr_snd, stride_snd, ptr_rem, stride_rem, count, ndim- 1,
3836
/* Send signal to remote processor that data transfer has been completed. */
3837
bytes = sizeof(int);
3838
ARMCI_Put(GA_Update_Signal, GA_Update_Flags[proc_rem_snd]+msgcnt, bytes,
3841
ARMCI_PutS_flag(ptr_snd, stride_snd, ptr_rem, stride_rem, count,
3842
(int)(ndim-1), GA_Update_Flags[proc_rem_snd]+msgcnt,
3843
*GA_Update_Signal, (int)proc_rem_snd);
3846
if (index[idx]%2 != 0 && !sprocflag) {
3847
armci_msg_rcv(msgcnt, rcv_ptr, bufsize, &msglen, proc_rem_rcv);
3848
} else if (index[idx]%2 == 0 && !rprocflag) {
3849
armci_msg_snd(msgcnt, snd_ptr, length, proc_rem_snd);
3852
/* account for wrap-around boundary condition, if necessary */
3853
pmax = GA[handle].nblock[idx] - 1;
3854
if (index[idx]%2 != 0 && !rprocflag) {
3855
armci_msg_snd(msgcnt, snd_ptr, length, proc_rem_snd);
3856
} else if (index[idx]%2 == 0 && index[idx] != 0 && !sprocflag) {
3857
armci_msg_rcv(msgcnt, rcv_ptr, bufsize, &msglen, proc_rem_rcv);
3861
ARMCI_PutS(ptr_snd, stride_snd, ptr_rem, stride_rem, count, ndim- 1,
3863
/* Send signal to remote processor that data transfer has been completed. */
3864
bytes = sizeof(int);
3865
ARMCI_Put(GA_Update_Signal, GA_Update_Flags[proc_rem_snd]+msgcnt, bytes,
3868
ARMCI_PutS_flag(ptr_snd, stride_snd, ptr_rem, stride_rem, count,
3869
(int)(ndim-1), GA_Update_Flags[proc_rem_snd]+msgcnt,
3870
*GA_Update_Signal, (int)proc_rem_snd);
3873
if (index[idx]%2 != 0 && !sprocflag) {
3874
armci_msg_rcv(msgcnt, rcv_ptr, bufsize, &msglen, proc_rem_rcv);
3875
} else if (index[idx]%2 == 0 && index[idx] != pmax && !rprocflag) {
3876
armci_msg_snd(msgcnt, snd_ptr, length, proc_rem_snd);
3878
/* make up for odd processor at end of string */
3879
if (index[idx] == pmax && !rprocflag) {
3880
armci_msg_snd(msgcnt, snd_ptr, length, proc_rem_snd);
3882
if (index[idx] == 0 && !sprocflag) {
3883
armci_msg_rcv(msgcnt, rcv_ptr, bufsize, &msglen, proc_rem_rcv);
3886
/* copy data back into global array */
3888
armci_read_strided(ptr_rcv, (int)ndim-1, stride_rcv, count, rcv_ptr);
3897
/* check to make sure any outstanding puts have showed up */
3898
waitformixedflags(flag1, flag2, GA_Update_Flags[GAme]+msgcnt-2,
3899
GA_Update_Flags[GAme]+msgcnt-1);
3900
/* update increment array */
3901
increment[idx] = 2*width[idx];
3904
ga_free(rcv_ptr_orig);
3905
ga_free(snd_ptr_orig);
3906
/* set update flags to zero for next operation */
3907
for (idx=0; idx < 2*ndim; idx++) {
3908
GA_Update_Flags[GAme][idx] = 0;
3915
/*\ UPDATE GHOST CELLS OF GLOBAL ARRAY USING GET CALLS
3917
#if HAVE_SYS_WEAK_ALIAS_PRAGMA
3918
# pragma weak wnga_update7_ghosts = pnga_update7_ghosts
3920
logical pnga_update7_ghosts(Integer g_a)
3922
Integer idx, ipx, np, handle=GA_OFFSET + g_a, proc_rem;
3923
Integer ntot, mask[MAXDIM];
3924
Integer size, ndim, i, itmp;
3925
Integer width[MAXDIM], dims[MAXDIM];
3926
Integer lo_loc[MAXDIM], hi_loc[MAXDIM];
3927
Integer plo_loc[MAXDIM], phi_loc[MAXDIM];
3928
Integer tlo_rem[MAXDIM], thi_rem[MAXDIM];
3929
Integer plo_rem[MAXDIM];
3930
Integer ld_loc[MAXDIM], ld_rem[MAXDIM];
3932
int stride_loc[MAXDIM], stride_rem[MAXDIM],count[MAXDIM];
3933
char *ptr_loc, *ptr_rem;
3934
Integer me = pnga_nodeid();
3937
/* if global array has no ghost cells, just return */
3938
if (!pnga_has_ghosts(g_a)) {
3942
size = GA[handle].elemsize;
3943
ndim = GA[handle].ndim;
3944
p_handle = GA[handle].p_handle;
3945
/* initialize ghost cell widths and get array dimensions */
3946
for (idx=0; idx < ndim; idx++) {
3947
width[idx] = (Integer)GA[handle].width[idx];
3948
dims[idx] = (Integer)GA[handle].dims[idx];
3951
/* Check to make sure that global array is well-behaved (all processors
3952
have data and the width of the data in each dimension is greater than
3953
the corresponding value in width[]). */
3954
if (!gai_check_ghost_distr(g_a)) return FALSE;
3956
GA_PUSH_NAME("ga_update7_ghosts");
3957
/* Get pointer to local memory */
3958
ptr_loc = GA[handle].ptr[me];
3959
/* obtain range of data that is held by local processor */
3960
pnga_distribution(g_a,me,lo_loc,hi_loc);
3962
/* evaluate total number of GET operations that will be required */
3964
for (idx=0; idx < ndim; idx++) ntot *= 3;
3966
/* Loop over all GET operations. The operation corresponding to the
3967
mask of all zeros is left out. */
3968
for (ipx=0; ipx < ntot; ipx++) {
3969
/* Convert ipx to corresponding mask values */
3972
for (idx = 0; idx < ndim; idx++) {
3975
if (mask[idx] != 0) mask0 = FALSE;
3978
if (mask0) continue;
3980
/* check to see if ghost cell block has zero elements*/
3983
for (idx = 0; idx < ndim; idx++) {
3984
if (mask[idx] != 0 && width[idx] == 0) mask0 = TRUE;
3985
if (mask[idx] != 0) itmp++;
3987
/*if (itmp>1) mask0 = TRUE; */
3988
if (mask0) continue;
3989
/* Now that mask has been determined, find data that is to be moved
3990
* and identify processor from which it is coming. Wrap boundaries
3991
* around, if necessary */
3992
for (idx = 0; idx < ndim; idx++) {
3993
if (mask[idx] == 0) {
3994
tlo_rem[idx] = lo_loc[idx];
3995
thi_rem[idx] = hi_loc[idx];
3996
} else if (mask[idx] == -1) {
3997
if (lo_loc[idx] > 1) {
3998
tlo_rem[idx] = lo_loc[idx]-width[idx];
3999
thi_rem[idx] = lo_loc[idx]-1;
4001
tlo_rem[idx] = dims[idx]-width[idx]+1;
4002
thi_rem[idx] = dims[idx];
4004
} else if (mask[idx] == 1) {
4005
if (hi_loc[idx] < dims[idx]) {
4006
tlo_rem[idx] = hi_loc[idx] + 1;
4007
thi_rem[idx] = hi_loc[idx] + width[idx];
4010
thi_rem[idx] = width[idx];
4013
fprintf(stderr,"Illegal mask value found\n");
4016
/* Locate remote processor to which data must be sent */
4017
if (!pnga_locate_region(g_a, tlo_rem, thi_rem, _ga_map,
4018
GA_proclist, &np)) ga_RegionError(pnga_ndim(g_a),
4019
tlo_rem, thi_rem, g_a);
4021
fprintf(stderr,"More than one remote processor found\n");
4023
/* Remote processor has been identified, now get ready to send
4024
data to it. Start by getting distribution on remote
4026
proc_rem = GA_proclist[0];
4027
pnga_distribution(g_a, proc_rem, tlo_rem, thi_rem);
4028
for (idx = 0; idx < ndim; idx++) {
4029
if (mask[idx] == 0) {
4030
plo_loc[idx] = width[idx];
4031
phi_loc[idx] = hi_loc[idx]-lo_loc[idx]+width[idx];
4032
plo_rem[idx] = plo_loc[idx];
4033
} else if (mask[idx] == -1) {
4035
phi_loc[idx] = width[idx]-1;
4036
plo_rem[idx] = thi_rem[idx]-tlo_rem[idx]+1;
4037
} else if (mask[idx] == 1) {
4038
plo_loc[idx] = hi_loc[idx]-lo_loc[idx]+width[idx]+1;
4039
phi_loc[idx] = hi_loc[idx]-lo_loc[idx]+2*width[idx];
4040
plo_rem[idx] = width[idx];
4043
/* Get pointer to local data buffer and remote data
4044
buffer as well as lists of leading dimenstions */
4045
gam_LocationWithGhosts(me, handle, plo_loc, &ptr_loc, ld_loc);
4046
gam_LocationWithGhosts(proc_rem, handle, plo_rem, &ptr_rem, ld_rem);
4048
/* Evaluate strides on local and remote processors */
4049
gam_setstride(ndim, size, ld_loc, ld_rem, stride_rem,
4052
/* Compute the number of elements in each dimension and store
4053
result in count. Scale the first element in count by the
4055
gam_ComputeCount(ndim, plo_loc, phi_loc, count);
4058
if (p_handle >= 0) {
4059
proc_rem = PGRP_LIST[p_handle].inv_map_proc_list[proc_rem];
4061
/* put data on remote processor */
4062
/* ARMCI_GetS(ptr_rem, stride_rem, ptr_loc, stride_loc, count,
4063
(int)(ndim - 1), (int)proc_rem); */
4064
ARMCI_NbGetS(ptr_rem, stride_rem, ptr_loc, stride_loc, count,
4065
(int)(ndim - 1), (int)proc_rem, NULL);
#if HAVE_SYS_WEAK_ALIAS_PRAGMA
#   pragma weak wnga_ghost_barrier = pnga_ghost_barrier
#endif
/* Synchronize all processes so that outstanding ghost-cell transfers have
 * completed before anyone proceeds.
 * NOTE(review): this function was corrupted in the extracted text; both the
 * armci_msg_igop path and the armci_msg_barrier path survive, so the lost
 * structure is presumed to be a platform #ifdef (LAPI uses a global integer
 * sum as the fence) -- confirm against version control. */
void pnga_ghost_barrier()
{
#ifdef LAPI
  int signal = 1, n = 1;
  int *ptr = &signal;
  armci_msg_igop(ptr,n,"+");
#else
  armci_msg_barrier();
#endif
}
4087
/*\ UPDATE THE GHOST CELLS ON A PROCESSOR IN A SPECIFIC DIRECTION
4088
* USING NON-BLOCKING GET CALLS
4090
#if HAVE_SYS_WEAK_ALIAS_PRAGMA
4091
# pragma weak wnga_nbget_ghost_dir = pnga_nbget_ghost_dir
4093
void pnga_nbget_ghost_dir(Integer g_a,
4097
Integer handle = GA_OFFSET + g_a;
4098
Integer lo_loc[MAXDIM], hi_loc[MAXDIM], lo_rem[MAXDIM], hi_rem[MAXDIM];
4099
Integer subscript[MAXDIM], ld[MAXDIM];
4100
Integer i, ndim, dim, width;
4102
Integer me = pnga_nodeid();
4103
/*Integer p_handle;*/
4104
GA_PUSH_NAME("nga_nbget_ghost_dir");
4105
ndim = GA[handle].ndim;
4106
/*p_handle = GA[handle].p_handle;*/
4107
/* check mask to see that it corresponds to a valid direction */
4108
for (i=0; i<ndim; i++) {
4109
if (abs(mask[i]) != 0 && abs(mask[i]) != 1)
4110
pnga_error("nga_nbget_ghost_dir: invalid mask entry", mask[i]);
4113
/* get range of data on local processor */
4114
pnga_distribution(g_a,me,lo_loc,hi_loc);
4116
/* locate data on remote processor */
4117
for (i=0; i<ndim; i++) {
4118
dim = (Integer)GA[handle].dims[i];
4119
width = (Integer)GA[handle].width[i];
4121
if (hi_loc[i] == dim) {
4125
lo_rem[i] = hi_loc[i]+1;
4126
hi_rem[i] = hi_loc[i]+width;
4128
} else if (mask[i] == -1) {
4129
if (lo_loc[i] == 1) {
4130
lo_rem[i] = dim - width + 1;
4133
lo_rem[i] = lo_loc[i] - width;
4134
hi_rem[i] = lo_loc[i] - 1;
4137
lo_rem[i] = lo_loc[i];
4138
hi_rem[i] = hi_loc[i];
4142
/* Get pointer to data destination on local block. Start by
4143
by finding subscript to origin of destination block */
4144
for (i=0; i<ndim; i++) {
4146
subscript[i] = hi_loc[i]-lo_loc[i]+1+GA[handle].width[i];
4147
} else if (mask[i] == -1) {
4150
subscript[i] = GA[handle].width[i];
4152
ld[i] = hi_loc[i]-lo_loc[i]+1+2*GA[handle].width[i];
4154
gam_LocationWithGhosts(me, handle, subscript, &ptr_loc, ld);
4156
pnga_nbget(g_a,lo_rem,hi_rem,ptr_loc,ld,nbhandle);
4160
/*\ SET PRECOMPUTED INFO FOR UPDATING GHOST CELLS
4162
#if HAVE_SYS_WEAK_ALIAS_PRAGMA
4163
# pragma weak wnga_set_ghost_info = pnga_set_ghost_info
4165
logical pnga_set_ghost_info(Integer g_a)
4167
Integer handle = g_a + GA_OFFSET;
4168
if (GA[handle].cache != NULL)
4169
free(GA[handle].cache);
4170
GA[handle].cache = NULL;
4171
if (GA[handle].actv == 1) {
4173
return pnga_set_update5_info(g_a);
4175
return pnga_set_update4_info(g_a);
4181
/*\ SET FLAG ON WHETHER OR NOT TO UPDATE GHOST CELL CORNER DATA
4183
#if HAVE_SYS_WEAK_ALIAS_PRAGMA
4184
# pragma weak wnga_set_ghost_corner_flag = pnga_set_ghost_corner_flag
4186
void pnga_set_ghost_corner_flag(Integer g_a, logical flag)
4188
Integer handle = g_a + GA_OFFSET;
4189
GA[handle].corner_flag = (int)flag;
4190
if (GA[handle].actv == 1) {
4191
(void)pnga_set_ghost_info(g_a);