1
/* $Id: ghosts.c,v 1.47.4.2 2007-05-02 16:23:39 d3g293 Exp $ */
5
* description: implements GA collective communication operations to
6
* update ghost cell regions.
10
* This material was prepared as an account of work sponsored by an
11
* agency of the United States Government. Neither the United States
12
* Government nor the United States Department of Energy, nor Battelle,
13
* nor any of their employees, MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
14
* ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE ACCURACY,
15
* COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, APPARATUS, PRODUCT,
16
* SOFTWARE, OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT
17
* INFRINGE PRIVATELY OWNED RIGHTS.
22
* This software and its documentation were produced with United States
23
* Government support under Contract Number DE-AC06-76RLO-1830 awarded by
24
* the United States Department of Energy. The United States Government
25
* retains a paid-up non-exclusive, irrevocable worldwide license to
26
* reproduce, prepare derivative works, perform publicly and display
27
* publicly by or for the US Government, including the right to
28
* distribute to other US Government contractors.
32
/*#define PERMUTE_PIDS */
45
#define INVALID_MA_HANDLE -1 /* sentinel value: no MA (Memory Allocator) handle assigned */
46
/* Round x to the nearest integer value (halves round away from zero).
 * The whole expansion is parenthesized so the macro composes safely in
 * larger expressions (e.g. 2*NEAR_INT(x), NEAR_INT(x)+1); the original
 * unparenthesized conditional would mis-associate in those contexts. */
#define NEAR_INT(x) ((x) < 0.0 ? ceil((x) - 0.5) : floor((x) + 0.5))
48
#if !defined(CRAY_YMP)
49
#define BYTE_ADDRESSABLE_MEMORY
53
/*uncomment line below to verify consistency of MA in every sync */
54
/*#define CHECK_MA yes */
56
/***************************************************************************/
58
/*\ Return a pointer to the location indicated by subscript and and an array
59
* of leading dimensions (ld). Assume that subscript refers to a set of local
60
* coordinates relative to the origin of the array and account for the
61
* presence of ghost cells.
63
#define gam_LocationWithGhosts(proc, handle, subscript, ptr_loc, ld) \
65
Integer _d, _factor = 1, _last=GA[handle].ndim - 1, _offset=0; \
66
Integer _lo[MAXDIM], _hi[MAXDIM]; \
67
ga_ownsM(handle, proc, _lo, _hi); \
68
if (_last == 0) ld[0] = _hi[0] - _lo[0] + 1 + 2*(Integer)GA[handle].width[0];\
69
for (_d = 0; _d < _last; _d++) { \
70
_offset += subscript[_d] * _factor; \
71
ld[_d] = _hi[_d] - _lo[_d] + 1 + 2*(Integer)GA[handle].width[_d]; \
74
_offset += subscript[_last] * _factor; \
75
*(ptr_loc) = GA[handle].ptr[proc] + _offset*GA[handle].elemsize; \
78
void nga_access_ghost_ptr(Integer* g_a, Integer dims[],
79
void* ptr, Integer ld[])
83
Integer handle = GA_OFFSET + *g_a;
84
Integer i, lo[MAXDIM], hi[MAXDIM];
85
Integer ndim = GA[handle].ndim;
86
Integer me = ga_nodeid_();
88
GA_PUSH_NAME("nga_access_ghost_ptr");
90
nga_distribution_(g_a, &me, lo, hi);
92
for (i=0; i < ndim; i++) {
96
gam_LocationWithGhosts(me, handle, dims, &lptr, ld);
98
for (i=0; i < ndim; i++)
99
dims[i] = hi[i] - lo[i] + 1 + 2*(Integer)GA[handle].width[i];
103
/*\ PROVIDE INDEX TO LOCALLY HELD DATA, ACCOUNTING FOR
104
* PRESENCE OF GHOST CELLS
106
void FATR nga_access_ghost_element_(Integer* g_a, Integer* index,
107
Integer subscript[], Integer ld[])
110
Integer handle = GA_OFFSET + *g_a;
112
Integer tmp_sub[MAXDIM];
113
unsigned long elemsize;
114
unsigned long lref, lptr;
115
Integer me = ga_nodeid_();
116
GA_PUSH_NAME("nga_access_ghost_element");
117
/* Indices conform to Fortran convention. Shift them down 1 so that
118
gam_LocationWithGhosts works. */
119
for (i=0; i<GA[handle].ndim; i++) tmp_sub[i] = subscript[i] - 1;
120
gam_LocationWithGhosts(me, handle, tmp_sub, &ptr, ld);
122
* return patch address as the distance elements from the reference address
124
* .in Fortran we need only the index to the type array: dbl_mb or int_mb
125
* that are elements of COMMON in the the mafdecls.h include file
126
* .in C we need both the index and the pointer
129
elemsize = (unsigned long)GA[handle].elemsize;
131
/* compute index and check if it is correct */
132
switch (ga_type_c2f(GA[handle].type)){
134
*index = (Integer) ((DoublePrecision*)ptr - DBL_MB);
135
lref = (unsigned long)DBL_MB;
139
*index = (Integer) ((DoubleComplex*)ptr - DCPL_MB);
140
lref = (unsigned long)DCPL_MB;
144
*index = (Integer) ((SingleComplex*)ptr - SCPL_MB);
145
lref = (unsigned long)SCPL_MB;
149
*index = (Integer) ((Integer*)ptr - INT_MB);
150
lref = (unsigned long)INT_MB;
154
*index = (Integer) ((float*)ptr - FLT_MB);
155
lref = (unsigned long)FLT_MB;
159
#ifdef BYTE_ADDRESSABLE_MEMORY
160
/* check the allignment */
161
lptr = (unsigned long)ptr;
162
if( lptr%elemsize != lref%elemsize ){
163
printf("%d: lptr=%lu(%lu) lref=%lu(%lu)\n",(int)GAme,lptr,lptr%elemsize,
165
ga_error("nga_access: MA addressing problem: base address misallignment",
170
/* adjust index for Fortran addressing */
177
/*\ PROVIDE POINTER TO LOCALLY HELD DATA, ACCOUNTING FOR
178
* PRESENCE OF GHOST CELLS
180
void FATR nga_access_ghost_element_ptr(Integer* g_a, void *ptr,
181
Integer subscript[], Integer ld[])
184
Integer handle = GA_OFFSET + *g_a;
186
Integer tmp_sub[MAXDIM];
187
Integer me = ga_nodeid_();
188
GA_PUSH_NAME("nga_access_ghost_element_ptr");
189
/* Indices conform to Fortran convention. Shift them down 1 so that
190
gam_LocationWithGhosts works. */
191
for (i=0; i<GA[handle].ndim; i++) tmp_sub[i] = subscript[i] - 1;
192
gam_LocationWithGhosts(me, handle, tmp_sub, &ptr, ld);
198
/*\ PROVIDE ACCESS TO LOCAL PATCH OF A GLOBAL ARRAY WITH GHOST CELLS
200
void FATR nga_access_ghosts_(Integer* g_a, Integer dims[],
201
Integer* index, Integer ld[])
204
Integer handle = GA_OFFSET + *g_a;
205
unsigned long elemsize;
206
unsigned long lref, lptr;
208
GA_PUSH_NAME("nga_access_ghosts");
209
nga_access_ghost_ptr(g_a, dims, &ptr, ld);
212
* return patch address as the distance elements from the reference address
214
* .in Fortran we need only the index to the type array: dbl_mb or int_mb
215
* that are elements of COMMON in the the mafdecls.h include file
216
* .in C we need both the index and the pointer
219
elemsize = (unsigned long)GA[handle].elemsize;
221
/* compute index and check if it is correct */
222
switch (ga_type_c2f(GA[handle].type)){
224
*index = (Integer) ((DoublePrecision*)ptr - DBL_MB);
225
lref = (unsigned long)DBL_MB;
229
*index = (Integer) ((DoubleComplex*)ptr - DCPL_MB);
230
lref = (unsigned long)DCPL_MB;
234
*index = (Integer) ((SingleComplex*)ptr - SCPL_MB);
235
lref = (unsigned long)SCPL_MB;
239
*index = (Integer) ((Integer*)ptr - INT_MB);
240
lref = (unsigned long)INT_MB;
244
*index = (Integer) ((float*)ptr - FLT_MB);
245
lref = (unsigned long)FLT_MB;
250
#ifdef BYTE_ADDRESSABLE_MEMORY
251
/* check the allignment */
252
lptr = (unsigned long)ptr;
253
if( lptr%elemsize != lref%elemsize ){
254
printf("%d: lptr=%lu(%lu) lref=%lu(%lu)\n",(int)GAme,lptr,lptr%elemsize,
256
ga_error("nga_access: MA addressing problem: base address misallignment",
261
/* adjust index for Fortran addressing */
268
/*\ RELEASE ACCESS TO A GHOST ELEMENT
270
void FATR nga_release_ghost_element_(Integer* g_a, Integer subscript[])
274
/*\ RELEASE ACCESS & UPDATE A GHOST ELEMENT
276
void FATR nga_release_update_ghost_element_(Integer* g_a, Integer subscript[])
280
/*\ RELEASE ACCESS TO A GHOST BLOCK
282
void FATR nga_release_ghosts_(Integer* g_a)
286
/*\ RELEASE ACCESS & UPDATE A GHOST BLOCK
288
void FATR nga_release_update_ghosts_(Integer* g_a)
292
/*\ GET DATA FROM LOCAL BLOCK
294
void FATR nga_get_ghost_block_(Integer *g_a,
300
/* g_a: Global array handle
301
lo[]: Array of lower indices of patch of global array
302
hi[]: Array of upper indices of patch of global array
303
buf[]: Local buffer that array patch will be copied into
304
ld[]: Array of physical ndim-1 dimensions of local buffer */
305
Integer handle=GA_OFFSET + *g_a, ndim;
306
Integer i, glo[MAXDIM], ghi[MAXDIM], ichk, me, grp_id;
308
int stride_rem[MAXDIM], stride_loc[MAXDIM], count[MAXDIM];
309
Integer ldrem[MAXDIM];
310
Integer offset, factor, size;
314
grp_id = (Integer)GA[handle].p_handle;
315
if (grp_id>0) me = PGRP_LIST[grp_id].map_proc_list[me];
316
ndim = GA[handle].ndim;
318
/* Figure out whether or not lo and hi can be accessed completely
320
nga_distribution_(g_a, &me, glo, ghi);
322
for (i=0; i<ndim; i++) {
323
if (lo[i] < glo[i]-(Integer)GA[handle].width[i]) ichk = 0;
324
if (hi[i] > ghi[i]+(Integer)GA[handle].width[i]) ichk = 0;
325
llo[i] = glo[i] - (Integer)GA[handle].width[i];
326
if (i<ndim-1) ldrem[i] = ghi[i] - glo[i] + 1
327
+ 2*(Integer)GA[handle].width[i];
330
/* Get data. Use local copy if possible, otherwise use a periodic get */
334
size = GA[handle].elemsize;
335
for (i=0; i<ndim-1; i++) {
336
offset += (lo[i]-llo[i])*factor;
337
factor *= ghi[i] - glo[i] + 1 + 2*(Integer)GA[handle].width[i];
339
offset += (lo[ndim-1]-llo[ndim-1])*factor;
340
ptr = GA[handle].ptr[me] + size*offset;
341
/* compute number of elements in each dimension and store result in count */
342
gam_ComputeCount(ndim, lo, hi, count);
344
/* scale first element in count by element size. The ARMCI_GetS
345
routine uses this convention to figure out memory sizes.*/
348
/* Return strides for memory containing global array on remote
349
processor indexed by proc (stride_rem) and for local buffer
351
gam_setstride(ndim, size, ld, ldrem, stride_rem, stride_loc);
352
ARMCI_GetS(ptr,stride_rem,buf,stride_loc,count,ndim-1,me);
354
nga_periodic_get_(g_a,lo,hi,buf,ld);
358
/*\ UPDATE GHOST CELLS OF GLOBAL ARRAY USING SHIFT ALGORITHM
360
void FATR ga_update1_ghosts_(Integer *g_a)
362
Integer idx, ipx, inx, i, np, handle=GA_OFFSET + *g_a, proc_rem;
363
Integer size, ndim, nwidth, offset, slice, increment[MAXDIM];
364
Integer width[MAXDIM];
365
Integer dims[MAXDIM], imax;
366
Integer lo_loc[MAXDIM], hi_loc[MAXDIM];
367
Integer plo_loc[MAXDIM], phi_loc[MAXDIM];
368
Integer lo_rem[MAXDIM], hi_rem[MAXDIM];
369
Integer tlo_rem[MAXDIM], thi_rem[MAXDIM];
370
Integer slo_rem[MAXDIM], shi_rem[MAXDIM];
371
Integer plo_rem[MAXDIM], phi_rem[MAXDIM];
372
Integer ld_loc[MAXDIM], ld_rem[MAXDIM];
374
int stride_loc[MAXDIM], stride_rem[MAXDIM], count[MAXDIM];
375
char *ptr_loc, *ptr_rem;
376
logical hasData = TRUE;
377
Integer me = ga_nodeid_();
380
/* This routine makes use of the shift algorithm to update data in the
381
* ghost cells bounding the local block of visible data. The shift
382
* algorithm starts by updating the blocks of data along the first
383
* dimension by grabbing a block of data that is width[0] deep but
384
* otherwise matches the dimensions of the data residing on the
385
* calling processor. The update of the second dimension, however,
386
* grabs a block that is width[1] deep in the second dimension but is
387
* ldim0 + 2*width[0] in the first dimensions where ldim0 is the
388
* size of the visible data along the first dimension. The remaining
389
* dimensions are left the same. For the next update, the width of the
390
* second dimension is also increased by 2*width[1] and so on. This
391
* algorith makes use of the fact that data for the dimensions that
392
* have already been updated is available on each processor and can be
393
* used in the updates of subsequent dimensions. The total number of
394
* separate updates is 2*ndim, an update in the negative and positive
395
* directions for each dimension.
397
* To perform the update, this routine makes use of several copies of
398
* indices marking the upper and lower limits of data. Indices
399
* beginning with the character "p" are relative indices marking the
400
* location of the data set relative to the origin the local patch of
401
* the global array, all other indices are in absolute coordinates and
402
* mark locations in the total global array. The indices used by this
403
* routine are described below.
405
* lo_loc[], hi_loc[]: The lower and upper indices of the visible
406
* block of data held by the calling processor.
408
* lo_rem[], hi_rem[]: The lower and upper indices of the block
409
* of data on a remote processor or processors that is needed to
410
* fill in the calling processors ghost cells. These indices are
411
* NOT corrected for wrap-around (periodic) boundary conditions
412
* so they can be negative or greater than the array dimension
413
* values held in dims[].
415
* slo_rem[], shi_rem[]: Similar to lo_rem[] and hi_rem[], except
416
* that these indices have been corrected for wrap-around
417
* boundary conditions. If lo_rem[] and hi_rem[] cross a global
418
* array boundary, as opposed to being entirely located on one
419
* side or the other of the array, then two sets of slo_rem[] and
420
* shi_rem[] will be created. One set will correspond to the
421
* block of data on one side of the global array boundary and the
422
* other set will correspond to the remaining block. This
423
* situation will only occur if the value of the ghost cell width
424
* is greater than the dimension of the visible global array
425
* data on a single processor.
427
* thi_rem[], thi_rem[]: The lower and upper indices of the visible
428
* data on a remote processor.
430
* plo_loc[], phi_loc[]: The indices of the local data patch that
431
* is going to be updated.
433
* plo_rem[], phi_rem[]: The indices of the data patch on the
434
* remote processor that will be used to update the data on the
435
* calling processor. Note that the dimensions of the patches
436
* represented by plo_loc[], plo_rem[] and plo_loc[], phi_loc[]
439
* For the case where the width of the ghost cells is more than the
440
* width of the visible data held on a processor, special problems
441
* arise. It now takes several updates to fill in one block of boundary
442
* data and it is now necessary to keep track of where each of these
443
* blocks of data go in the ghost cell region. To do this two extra
444
* variables are needed. These are offset and slice. Slice is equal to
445
* the width of the visible data along the dimension being updated
446
* minus one coming from the remote processor. Offset is the amount
447
* that this data must be moved inward from the lower boundary of the
448
* ghost cell region. Another variable that is also used to handle
449
* this case is imax. If this variable is set to 2, then this means
450
* that the block of data that is needed to update the ghost cells
451
* crosses a global array boundary and the block needs to be broken
452
* up into two pieces. */
454
/* if global array has no ghost cells, just return */
455
if (!ga_has_ghosts_(g_a)) return;
457
GA_PUSH_NAME("ga_update1_ghosts");
459
size = GA[handle].elemsize;
460
ndim = GA[handle].ndim;
461
corner_flag = GA[handle].corner_flag;
462
p_handle = GA[handle].p_handle;
464
/* Get pointer to local memory */
465
ptr_loc = GA[handle].ptr[me];
466
/* obtain range of data that is held by local processor */
467
nga_distribution_(g_a,&me,lo_loc,hi_loc);
468
/* initialize range increments and get array dimensions */
469
for (idx=0; idx < ndim; idx++) {
471
width[idx] = (Integer)GA[handle].width[idx];
472
dims[idx] = (Integer)GA[handle].dims[idx];
473
if (lo_loc[idx] == 0 && hi_loc[idx] == -1) hasData = FALSE;
476
/* loop over dimensions for sequential update using shift algorithm */
477
for (idx=0; idx < ndim; idx++) {
478
nwidth = (Integer)width[idx];
480
/* Do not bother with update if nwidth is zero or processor has
482
if (nwidth != 0 && hasData) {
484
/* Perform update in negative direction. Start by getting rough
485
estimate of block of needed data*/
486
for (i = 0; i < ndim; i++) {
488
lo_rem[i] = lo_loc[i] - nwidth;
489
hi_rem[i] = lo_loc[i] - 1;
490
/* Check to see if we will need to update ghost cells using
491
one or two major patches of the global array. */
502
lo_rem[i] = lo_loc[i];
503
hi_rem[i] = hi_loc[i];
507
for (inx = 0; inx < imax; inx++) {
508
/* Check to see if boundary is being updated in one patch or two,
509
adjust lower boundary accordingly. */
510
for (i=0; i<ndim; i++) {
511
if (imax == 2 && i == idx) {
514
shi_rem[i] = hi_rem[i];
516
slo_rem[i] = lo_rem[i] + dims[i];
517
shi_rem[i] = dims[i];
519
} else if (i == idx) {
521
slo_rem[i] = dims[i] - nwidth + 1;
522
shi_rem[i] = dims[i];
524
slo_rem[i] = lo_rem[i];
525
shi_rem[i] = hi_rem[i];
528
slo_rem[i] = lo_rem[i];
529
shi_rem[i] = hi_rem[i];
532
/* locate processor with this data */
533
if (!nga_locate_region_(g_a, slo_rem, shi_rem, _ga_map,
534
GA_proclist, &np)) ga_RegionError(ga_ndim_(g_a),
535
slo_rem, shi_rem, *g_a);
537
for (ipx = 0; ipx < np; ipx++) {
538
/* Get actual coordinates of desired chunk of remote
539
data as well as the actual coordinates of the local chunk
540
of data that will receive the remote data (these
541
coordinates take into account the presence of ghost
542
cells). Start by finding out what data is actually held by
544
proc_rem = GA_proclist[ipx];
545
nga_distribution_(g_a, &proc_rem, tlo_rem, thi_rem);
546
for (i = 0; i < ndim; i++) {
547
if (increment[i] == 0) {
549
if (np == 1 && imax == 1) {
550
plo_rem[i] = thi_rem[i] - tlo_rem[i] + 1;
551
phi_rem[i] = thi_rem[i] - tlo_rem[i] + width[i];
553
phi_loc[i] = width[i] - 1;
555
if (tlo_rem[i] >= slo_rem[i]) {
556
offset = tlo_rem[i] - lo_rem[i];
557
slice = thi_rem[i] - tlo_rem[i];
560
slice = thi_rem[i] - slo_rem[i];
562
if (offset < 0) offset = offset + dims[i];
563
if (offset >= dims[i]) offset = offset - dims[i];
564
plo_rem[i] = thi_rem[i] - tlo_rem[i] + width[i] - slice;
565
phi_rem[i] = thi_rem[i] - tlo_rem[i] + width[i];
567
phi_loc[i] = offset + slice;
570
plo_rem[i] = width[i];
571
phi_rem[i] = thi_rem[i] - tlo_rem[i] + width[i];
572
plo_loc[i] = width[i];
573
phi_loc[i] = hi_loc[i] - lo_loc[i] + width[i];
577
phi_rem[i] = thi_rem[i] - tlo_rem[i] + increment[i];
579
phi_loc[i] = hi_loc[i] - lo_loc[i] + increment[i];
583
/* Get pointer to local data buffer and remote data
584
buffer as well as lists of leading dimenstions */
585
gam_LocationWithGhosts(me, handle, plo_loc, &ptr_loc, ld_loc);
586
gam_LocationWithGhosts(proc_rem, handle, plo_rem, &ptr_rem, ld_rem);
588
/* Evaluate strides on local and remote processors */
589
gam_setstride(ndim, size, ld_loc, ld_rem, stride_rem,
592
/* Compute the number of elements in each dimension and store
593
result in count. Scale the first element in count by the
595
gam_ComputeCount(ndim, plo_rem, phi_rem, count);
598
/* get remote data */
600
proc_rem = PGRP_LIST[p_handle].inv_map_proc_list[proc_rem];
602
ARMCI_GetS(ptr_rem, stride_rem, ptr_loc, stride_loc, count,
603
(int)(ndim - 1), (int)proc_rem);
607
/* Perform update in positive direction. Start by getting rough
608
estimate of block of needed data*/
609
for (i = 0; i < ndim; i++) {
611
lo_rem[i] = hi_loc[i] + 1;
612
hi_rem[i] = hi_loc[i] + nwidth;
613
/* Check to see if we will need to update ghost cells using
614
one or two major patches of the global array. */
615
if (hi_rem[i] > dims[i]) {
616
if (lo_rem[i] <= dims[i]) {
625
lo_rem[i] = lo_loc[i];
626
hi_rem[i] = hi_loc[i];
630
for (inx = 0; inx < imax; inx++) {
631
/* Check to see if boundary is being updated in one patch or two,
632
adjust lower boundary accordingly. */
633
for (i=0; i<ndim; i++) {
634
if (imax == 2 && i == idx) {
636
slo_rem[i] = lo_rem[i];
637
shi_rem[i] = dims[i];
640
shi_rem[i] = hi_rem[i] - dims[i];
642
} else if (i == idx) {
643
if (hi_rem[i] > dims[i]) {
647
slo_rem[i] = lo_rem[i];
648
shi_rem[i] = hi_rem[i];
651
slo_rem[i] = lo_rem[i];
652
shi_rem[i] = hi_rem[i];
655
/* locate processor with this data */
656
if (!nga_locate_region_(g_a, slo_rem, shi_rem, _ga_map,
657
GA_proclist, &np)) ga_RegionError(ga_ndim_(g_a),
658
slo_rem, shi_rem, *g_a);
660
for (ipx = 0; ipx < np; ipx++) {
661
/* Get actual coordinates of desired chunk of remote
662
data as well as the actual coordinates of the local chunk
663
of data that will receive the remote data (these
664
coordinates take into account the presence of ghost
665
cells). Start by finding out what data is actually held by
667
proc_rem = GA_proclist[ipx];
668
nga_distribution_(g_a, &proc_rem, tlo_rem, thi_rem);
669
for (i = 0; i < ndim; i++) {
670
if (increment[i] == 0) {
672
if (np == 1 && imax == 1) {
673
plo_rem[i] = width[i];
674
phi_rem[i] = 2*width[i] - 1;
675
plo_loc[i] = hi_loc[i] - lo_loc[i] + 1 + width[i];
676
phi_loc[i] = hi_loc[i] - lo_loc[i] + 2*width[i];
678
offset = tlo_rem[i] - hi_loc[i] - 1;
679
if (thi_rem[i] <= shi_rem[i]) {
680
slice = thi_rem[i] - tlo_rem[i];
682
slice = shi_rem[i] - tlo_rem[i];
684
if (offset < 0) offset = offset + dims[i];
685
if (offset >= dims[i]) offset = offset - dims[i];
686
plo_rem[i] = width[i];
687
phi_rem[i] = width[i] + slice;
688
plo_loc[i] = hi_loc[i] - lo_loc[i] + width[i] + 1 + offset;
689
phi_loc[i] = hi_loc[i] - lo_loc[i] + width[i] + 1
693
plo_rem[i] = width[i];
694
phi_rem[i] = thi_rem[i] - tlo_rem[i] + width[i];
695
plo_loc[i] = width[i];
696
phi_loc[i] = hi_loc[i] - lo_loc[i] + width[i];
700
phi_rem[i] = thi_rem[i] - tlo_rem[i] + increment[i];
702
phi_loc[i] = hi_loc[i] - lo_loc[i] + increment[i];
706
/* Get pointer to local data buffer and remote data
707
buffer as well as lists of leading dimenstions */
708
gam_LocationWithGhosts(me, handle, plo_loc, &ptr_loc, ld_loc);
709
gam_LocationWithGhosts(proc_rem, handle, plo_rem, &ptr_rem, ld_rem);
711
/* Evaluate strides on local and remote processors */
712
gam_setstride(ndim, size, ld_loc, ld_rem, stride_rem,
715
/* Compute the number of elements in each dimension and store
716
result in count. Scale the first element in count by the
718
gam_ComputeCount(ndim, plo_rem, phi_rem, count);
721
/* get remote data */
723
proc_rem = PGRP_LIST[p_handle].inv_map_proc_list[proc_rem];
725
ARMCI_GetS(ptr_rem, stride_rem, ptr_loc, stride_loc, count,
726
(int)(ndim - 1), (int)proc_rem);
730
/* synchronize all processors and update increment array */
731
if (idx < ndim-1) ga_sync_();
733
increment[idx] = 2*nwidth;
739
/*\ UTILITY FUNCTION TO MAKE SURE GHOST CELLS WIDTHS ARE
740
* LESS THAN VISIBLE DATA WIDTHS
742
logical gai_check_ghost_distr(Integer *g_a)
744
Integer handle=GA_OFFSET + *g_a;
745
Integer idx, ndim, np, ipx;
746
ndim = GA[handle].ndim;
748
for (idx = 0; idx < ndim; idx++) {
749
for (np = 0; np < GA[handle].nblock[idx]; np++) {
750
if (np < GA[handle].nblock[idx] - 1) {
751
if (GA[handle].mapc[ipx+1]-GA[handle].mapc[ipx]+1
752
<GA[handle].width[idx]) {
756
if (GA[handle].dims[idx]-GA[handle].mapc[ipx]+1
757
<GA[handle].width[idx]) {
767
/*\ UPDATE GHOST CELLS OF GLOBAL ARRAY USING PUT CALLS
769
logical FATR ga_update2_ghosts_(Integer *g_a)
771
Integer idx, ipx, np, handle=GA_OFFSET + *g_a, proc_rem;
772
Integer ntot, mask[MAXDIM];
773
Integer size, ndim, i, itmp;
774
Integer width[MAXDIM], dims[MAXDIM];
775
Integer lo_loc[MAXDIM], hi_loc[MAXDIM];
776
Integer tlo_loc[MAXDIM], thi_loc[MAXDIM];
777
Integer plo_loc[MAXDIM], phi_loc[MAXDIM];
778
Integer tlo_rem[MAXDIM], thi_rem[MAXDIM];
779
Integer plo_rem[MAXDIM];
780
Integer ld_loc[MAXDIM], ld_rem[MAXDIM];
782
int stride_loc[MAXDIM], stride_rem[MAXDIM],count[MAXDIM];
783
char *ptr_loc, *ptr_rem;
784
Integer me = ga_nodeid_();
787
/* if global array has no ghost cells, just return */
788
if (!ga_has_ghosts_(g_a)) {
792
size = GA[handle].elemsize;
793
ndim = GA[handle].ndim;
794
p_handle = GA[handle].p_handle;
795
/* initialize ghost cell widths and get array dimensions */
796
for (idx=0; idx < ndim; idx++) {
797
width[idx] = (Integer)GA[handle].width[idx];
798
dims[idx] = (Integer)GA[handle].dims[idx];
801
/* Check to make sure that global array is well-behaved (all processors
802
have data and the width of the data in each dimension is greater than
803
the corresponding value in width[]). */
804
if (!gai_check_ghost_distr(g_a)) return FALSE;
806
GA_PUSH_NAME("ga_update2_ghosts");
807
/* Get pointer to local memory */
808
ptr_loc = GA[handle].ptr[me];
809
/* obtain range of data that is held by local processor */
810
nga_distribution_(g_a,&me,lo_loc,hi_loc);
812
/* evaluate total number of PUT operations that will be required */
814
for (idx=0; idx < ndim; idx++) ntot *= 3;
816
/* Loop over all PUT operations. The operation corresponding to the
817
mask of all zeros is left out. */
818
for (ipx=0; ipx < ntot; ipx++) {
819
/* Convert ipx to corresponding mask values */
822
for (idx = 0; idx < ndim; idx++) {
825
if (mask[idx] != 0) mask0 = FALSE;
830
/* check to see if ghost cell block has zero elements*/
833
for (idx = 0; idx < ndim; idx++) {
834
if (mask[idx] != 0 && width[idx] == 0) mask0 = TRUE;
835
if (mask[idx] != 0) itmp++;
837
/*if (itmp>1) mask0 = TRUE; */
839
/* Now that mask has been determined, find data that is to be moved
840
* and identify processor to which it is going. Wrap boundaries
841
* around, if necessary */
842
for (idx = 0; idx < ndim; idx++) {
843
if (mask[idx] == 0) {
844
tlo_loc[idx] = lo_loc[idx];
845
thi_loc[idx] = hi_loc[idx];
846
tlo_rem[idx] = lo_loc[idx];
847
thi_rem[idx] = hi_loc[idx];
848
} else if (mask[idx] == -1) {
849
tlo_loc[idx] = lo_loc[idx];
850
thi_loc[idx] = lo_loc[idx]+width[idx]-1;
851
if (lo_loc[idx] > 1) {
852
tlo_rem[idx] = lo_loc[idx]-width[idx];
853
thi_rem[idx] = lo_loc[idx]-1;
855
tlo_rem[idx] = dims[idx]-width[idx]+1;
856
thi_rem[idx] = dims[idx];
858
} else if (mask[idx] == 1) {
859
tlo_loc[idx] = hi_loc[idx]-width[idx]+1;
860
thi_loc[idx] = hi_loc[idx];
861
if (hi_loc[idx] < dims[idx]) {
862
tlo_rem[idx] = hi_loc[idx] + 1;
863
thi_rem[idx] = hi_loc[idx] + width[idx];
866
thi_rem[idx] = width[idx];
869
fprintf(stderr,"Illegal mask value found\n");
872
/* Locate remote processor to which data must be sent */
873
if (!nga_locate_region_(g_a, tlo_rem, thi_rem, _ga_map,
874
GA_proclist, &np)) ga_RegionError(ga_ndim_(g_a),
875
tlo_rem, thi_rem, *g_a);
877
fprintf(stderr,"More than one remote processor found\n");
879
/* Remote processor has been identified, now get ready to send
880
data to it. Start by getting distribution on remote
882
proc_rem = GA_proclist[0];
883
nga_distribution_(g_a, &proc_rem, tlo_rem, thi_rem);
884
for (idx = 0; idx < ndim; idx++) {
885
if (mask[idx] == 0) {
886
plo_loc[idx] = width[idx];
887
phi_loc[idx] = hi_loc[idx]-lo_loc[idx]+width[idx];
888
plo_rem[idx] = plo_loc[idx];
889
} else if (mask[idx] == -1) {
890
plo_loc[idx] = width[idx];
891
phi_loc[idx] = 2*width[idx]-1;
892
plo_rem[idx] = thi_rem[idx]-tlo_rem[idx]+width[idx]+1;
893
} else if (mask[idx] == 1) {
894
plo_loc[idx] = hi_loc[idx]-lo_loc[idx]+1;
895
phi_loc[idx] = hi_loc[idx]-lo_loc[idx]+width[idx];
899
/* Get pointer to local data buffer and remote data
900
buffer as well as lists of leading dimenstions */
901
gam_LocationWithGhosts(me, handle, plo_loc, &ptr_loc, ld_loc);
902
gam_LocationWithGhosts(proc_rem, handle, plo_rem, &ptr_rem, ld_rem);
904
/* Evaluate strides on local and remote processors */
905
gam_setstride(ndim, size, ld_loc, ld_rem, stride_rem,
908
/* Compute the number of elements in each dimension and store
909
result in count. Scale the first element in count by the
911
gam_ComputeCount(ndim, plo_loc, phi_loc, count);
914
/* put data on remote processor */
915
/*ARMCI_PutS(ptr_loc, stride_loc, ptr_rem, stride_rem, count,
916
(int)(ndim - 1), (int)proc_rem);*/
918
proc_rem = PGRP_LIST[p_handle].inv_map_proc_list[proc_rem];
920
ARMCI_NbPutS(ptr_loc, stride_loc, ptr_rem, stride_rem, count,
921
(int)(ndim - 1), (int)proc_rem, NULL);
929
/*\ GET INDICES ON REMOTE BLOCK IN NEGATIVE DIRECTION FOR UPDATE
931
void get_remote_block_neg(Integer idx, Integer ndim, Integer *lo_loc,
932
Integer *hi_loc, Integer *slo_rem, Integer *shi_rem,
933
Integer *dims, Integer *width)
935
Integer i, lo_rem[MAXDIM], hi_rem[MAXDIM];
936
/*Start by getting rough idea of where data needs to go. */
937
for (i = 0; i < ndim; i++) {
939
lo_rem[i] = lo_loc[i] - width[i];
940
hi_rem[i] = lo_loc[i] - 1;
942
lo_rem[i] = lo_loc[i];
943
hi_rem[i] = hi_loc[i];
947
/* Account for boundaries, if necessary. */
948
for (i=0; i<ndim; i++) {
951
slo_rem[i] = dims[i] - width[i] + 1;
952
shi_rem[i] = dims[i];
954
slo_rem[i] = lo_rem[i];
955
shi_rem[i] = hi_rem[i];
958
slo_rem[i] = lo_rem[i];
959
shi_rem[i] = hi_rem[i];
964
/*\ GET INDICES ON REMOTE BLOCK IN POSITIVE DIRECTION FOR UPDATE
966
void get_remote_block_pos(Integer idx, Integer ndim, Integer *lo_loc,
967
Integer *hi_loc, Integer *slo_rem, Integer *shi_rem,
968
Integer *dims, Integer *width)
970
Integer i, lo_rem[MAXDIM], hi_rem[MAXDIM];
971
/* Start by getting rough idea of where data needs to go. */
972
for (i = 0; i < ndim; i++) {
974
lo_rem[i] = hi_loc[i] + 1;
975
hi_rem[i] = hi_loc[i] + width[i];
977
lo_rem[i] = lo_loc[i];
978
hi_rem[i] = hi_loc[i];
982
/* Account for boundaries, if necessary. */
983
for (i=0; i<ndim; i++) {
985
if (hi_rem[i] > dims[i]) {
987
shi_rem[i] = width[i];
989
slo_rem[i] = lo_rem[i];
990
shi_rem[i] = hi_rem[i];
993
slo_rem[i] = lo_rem[i];
994
shi_rem[i] = hi_rem[i];
999
/*\ UPDATE GHOST CELLS OF GLOBAL ARRAY USING SHIFT ALGORITHM AND PUT CALLS
1001
logical FATR ga_update3_ghosts_(Integer *g_a)
1003
Integer idx, i, np, handle=GA_OFFSET + *g_a, proc_rem;
1004
Integer size, ndim, nwidth, increment[MAXDIM];
1005
Integer width[MAXDIM];
1006
Integer dims[MAXDIM];
1007
Integer lo_loc[MAXDIM], hi_loc[MAXDIM];
1008
Integer plo_loc[MAXDIM], phi_loc[MAXDIM];
1009
Integer tlo_rem[MAXDIM], thi_rem[MAXDIM];
1010
Integer slo_rem[MAXDIM], shi_rem[MAXDIM];
1011
Integer plo_rem[MAXDIM], phi_rem[MAXDIM];
1012
Integer ld_loc[MAXDIM], ld_rem[MAXDIM];
1013
int stride_loc[MAXDIM], stride_rem[MAXDIM],count[MAXDIM];
1014
char *ptr_loc, *ptr_rem;
1015
Integer me = ga_nodeid_();
1018
/* This routine makes use of the shift algorithm to update data in the
1019
* ghost cells bounding the local block of visible data. The shift
1020
* algorithm starts by updating the blocks of data along the first
1021
* dimension by grabbing a block of data that is width[0] deep but
1022
* otherwise matches the dimensions of the data residing on the
1023
* calling processor. The update of the second dimension, however,
1024
* grabs a block that is width[1] deep in the second dimension but is
1025
* ldim0 + 2*width[0] in the first dimensions where ldim0 is the
1026
* size of the visible data along the first dimension. The remaining
1027
* dimensions are left the same. For the next update, the width of the
1028
* second dimension is also increased by 2*width[1] and so on. This
1029
* algorith makes use of the fact that data for the dimensions that
1030
* have already been updated is available on each processor and can be
1031
* used in the updates of subsequent dimensions. The total number of
1032
* separate updates is 2*ndim, an update in the negative and positive
1033
* directions for each dimension. This implementation uses simple put
1034
* operations to perform the updates along each dimension with an
1035
* intervening synchronization call being used to make sure that the
1036
* necessary data is available on each processor before starting the
1037
* update along the next dimension.
1039
* To perform the update, this routine makes use of several copies of
1040
* indices marking the upper and lower limits of data. Indices
1041
* beginning with the character "p" are relative indices marking the
1042
* location of the data set relative to the origin the local patch of
1043
* the global array, all other indices are in absolute coordinates and
1044
* mark locations in the total global array. The indices used by this
1045
* routine are described below.
1047
* lo_loc[], hi_loc[]: The lower and upper indices of the visible
1048
* block of data held by the calling processor.
1050
* lo_rem[], hi_rem[]: The lower and upper indices of the block
1051
* of data on a remote processor or processors that is needed to
1052
* fill in the calling processors ghost cells. These indices are
1053
* NOT corrected for wrap-around (periodic) boundary conditions
1054
* so they can be negative or greater than the array dimension
1055
* values held in dims[].
1057
* slo_rem[], shi_rem[]: Similar to lo_rem[] and hi_rem[], except
1058
* that these indices have been corrected for wrap-around
1059
* boundary conditions.
1061
* thi_rem[], thi_rem[]: The lower and upper indices of the visible
1062
* data on a remote processor.
1064
* plo_loc[], phi_loc[]: The indices of the local data patch that
1065
* is going to be updated.
1067
* plo_rem[], phi_rem[]: The indices of the data patch on the
1068
* remote processor that will be used to update the data on the
1069
* calling processor. Note that the dimensions of the patches
1070
* represented by plo_loc[], plo_rem[] and plo_loc[], phi_loc[]
1074
/* if global array has no ghost cells, just return */
1075
if (!ga_has_ghosts_(g_a)) return TRUE;
1077
size = GA[handle].elemsize;
1078
ndim = GA[handle].ndim;
1079
p_handle = GA[handle].p_handle;
1081
/* obtain range of data that is held by local processor */
1082
nga_distribution_(g_a,&me,lo_loc,hi_loc);
1084
/* initialize range increments and get array dimensions */
1085
for (idx=0; idx < ndim; idx++) {
1087
width[idx] = (Integer)GA[handle].width[idx];
1088
dims[idx] = (Integer)GA[handle].dims[idx];
1089
if (lo_loc[idx] == 0 && hi_loc[idx] == -1) return FALSE;
1092
/* Check to make sure that global array is well-behaved (all processors
1093
have data and the width of the data in each dimension is greater
1094
than the corresponding value in width[]. */
1095
if (!gai_check_ghost_distr(g_a)) return FALSE;
1097
GA_PUSH_NAME("ga_update3_ghosts");
1099
/* Get pointer to local memory */
1100
ptr_loc = GA[handle].ptr[me];
1102
/* loop over dimensions for sequential update using shift algorithm */
1103
for (idx=0; idx < ndim; idx++) {
1104
nwidth = width[idx];
1106
/* Do not bother with update if nwidth is zero */
1109
/* Perform update in negative direction. */
1110
get_remote_block_neg(idx, ndim, lo_loc, hi_loc, slo_rem, shi_rem,
1112
/* locate processor with this data */
1113
if (!nga_locate_region_(g_a, slo_rem, shi_rem, _ga_map,
1114
GA_proclist, &np)) ga_RegionError(ga_ndim_(g_a),
1115
slo_rem, shi_rem, *g_a);
1117
/* Get actual coordinates of desired location of remote
1118
data as well as the actual coordinates of the local chunk
1119
of data that will be sent to remote processor (these
1120
coordinates take into account the presence of ghost
1121
cells). Start by finding out what data is actually held by
1122
remote processor. */
1123
proc_rem = GA_proclist[0];
1124
nga_distribution_(g_a, &proc_rem, tlo_rem, thi_rem);
1125
for (i = 0; i < ndim; i++) {
1126
if (increment[i] == 0) {
1128
plo_rem[i] = thi_rem[i] - tlo_rem[i] + width[i] + 1;
1129
phi_rem[i] = thi_rem[i] - tlo_rem[i] + 2*width[i];
1130
plo_loc[i] = width[i];
1131
phi_loc[i] = 2*width[i] - 1;
1133
plo_rem[i] = width[i];
1134
phi_rem[i] = thi_rem[i] - tlo_rem[i] + width[i];
1135
plo_loc[i] = width[i];
1136
phi_loc[i] = hi_loc[i] - lo_loc[i] + width[i];
1140
phi_rem[i] = thi_rem[i] - tlo_rem[i] + increment[i];
1142
phi_loc[i] = hi_loc[i] - lo_loc[i] + increment[i];
1146
/* Get pointer to local data buffer and remote data
1147
buffer as well as lists of leading dimenstions */
1148
gam_LocationWithGhosts(me, handle, plo_loc, &ptr_loc, ld_loc);
1149
gam_LocationWithGhosts(proc_rem, handle, plo_rem, &ptr_rem, ld_rem);
1151
/* Evaluate strides on local and remote processors */
1152
gam_setstride(ndim, size, ld_loc, ld_rem, stride_rem,
1155
/* Compute the number of elements in each dimension and store
1156
result in count. Scale the first element in count by the
1158
gam_ComputeCount(ndim, plo_rem, phi_rem, count);
1161
/* Put local data on remote processor */
1162
if (p_handle >= 0) {
1163
proc_rem = PGRP_LIST[p_handle].inv_map_proc_list[proc_rem];
1165
ARMCI_PutS(ptr_loc, stride_loc, ptr_rem, stride_rem, count,
1166
(int)(ndim - 1), (int)proc_rem);
1168
/* Perform update in positive direction */
1169
get_remote_block_pos(idx, ndim, lo_loc, hi_loc, slo_rem, shi_rem,
1171
/* locate processor with this data */
1172
if (!nga_locate_region_(g_a, slo_rem, shi_rem, _ga_map,
1173
GA_proclist, &np)) ga_RegionError(ga_ndim_(g_a),
1174
slo_rem, shi_rem, *g_a);
1176
/* Get actual coordinates of desired chunk of remote
1177
data as well as the actual coordinates of the local chunk
1178
of data that will receive the remote data (these
1179
coordinates take into account the presence of ghost
1180
cells). Start by finding out what data is actually held by
1181
remote processor. */
1182
proc_rem = GA_proclist[0];
1183
nga_distribution_(g_a, &proc_rem, tlo_rem, thi_rem);
1184
for (i = 0; i < ndim; i++) {
1185
if (increment[i] == 0) {
1188
phi_rem[i] = width[i] - 1;
1189
plo_loc[i] = hi_loc[i] - lo_loc[i] + width[i] - 1;
1190
phi_loc[i] = hi_loc[i] - lo_loc[i] + 2*width[i] - 1;
1192
plo_rem[i] = width[i];
1193
phi_rem[i] = thi_rem[i] - tlo_rem[i] + width[i];
1194
plo_loc[i] = width[i];
1195
phi_loc[i] = hi_loc[i] - lo_loc[i] + width[i];
1199
phi_rem[i] = thi_rem[i] - tlo_rem[i] + increment[i];
1201
phi_loc[i] = hi_loc[i] - lo_loc[i] + increment[i];
1205
/* Get pointer to local data buffer and remote data
1206
buffer as well as lists of leading dimenstions */
1207
gam_LocationWithGhosts(me, handle, plo_loc, &ptr_loc, ld_loc);
1208
gam_LocationWithGhosts(proc_rem, handle, plo_rem, &ptr_rem, ld_rem);
1210
/* Evaluate strides on local and remote processors */
1211
gam_setstride(ndim, size, ld_loc, ld_rem, stride_rem,
1214
/* Compute the number of elements in each dimension and store
1215
result in count. Scale the first element in count by the
1217
gam_ComputeCount(ndim, plo_rem, phi_rem, count);
1220
/* get remote data */
1221
if (p_handle >= 0) {
1222
proc_rem = PGRP_LIST[p_handle].inv_map_proc_list[proc_rem];
1224
ARMCI_PutS(ptr_loc, stride_loc, ptr_rem, stride_rem, count,
1225
(int)(ndim - 1), (int)proc_rem);
1227
/* synchronize all processors and update increment array */
1228
if (idx < ndim-1) ga_sync_();
1229
increment[idx] = 2*nwidth;
1236
#define GHOST_PRINT 0
1237
/*\ UPDATE GHOST CELLS OF GLOBAL ARRAY USING SHIFT ALGORITHM AND
1240
logical FATR ga_set_update4_info_(Integer *g_a)
1242
Integer idx, idir, i, np, handle=GA_OFFSET + *g_a;
1243
Integer size, buflen, buftot, *bufsize, ndim, increment[MAXDIM];
1244
Integer *proc_rem_snd, *proc_rem_rcv;
1246
Integer width[MAXDIM], dims[MAXDIM], index[MAXDIM];
1247
Integer lo_loc[MAXDIM], hi_loc[MAXDIM];
1248
Integer plo_snd[MAXDIM], phi_snd[MAXDIM];
1249
Integer lo_rcv[MAXDIM], hi_rcv[MAXDIM];
1250
Integer slo_rcv[MAXDIM], shi_rcv[MAXDIM];
1251
Integer plo_rcv[MAXDIM], phi_rcv[MAXDIM];
1252
Integer ld_loc[MAXDIM];
1253
int *stride_snd, *stride_rcv, *count, cache_size;
1255
char **ptr_snd, **ptr_rcv, *cache;
1257
Integer me = ga_nodeid_();
1260
/* This routine sets the arrays that are used to transfer data using
1261
* the update4. To perform the update, this routine makes use of several
1262
* copies of indices marking the upper and lower limits of data. Indices
1263
* beginning with the character "p" are relative indices marking the
1264
* location of the data set relative to the origin the local patch of
1265
* the global array, all other indices are in absolute coordinates and
1266
* mark locations in the total global array. The indices used by this
1267
* routine are described below.
1269
* lo_loc[], hi_loc[]: The lower and upper indices of the visible
1270
* block of data held by the calling processor.
1272
* lo_rcv[], hi_rcv[]: The lower and upper indices of the blocks
1273
* of data that will be either sent to or received from a remote
1274
* processor. These indices are NOT corrected for wrap-around
1275
* (periodic) boundary conditions so they can be negative or greater
1276
* than the array dimension values held in dims[].
1278
* slo_rcv[], shi_rcv[]: Similar to lo_rcv[] and hi_rcv[], except
1279
* that these indices have been corrected for wrap-around
1280
* boundary conditions.
1282
* plo_rcv[], phi_rcv[]: The local indices of the local data patch
1283
* that receive that message from the remote processor.
1285
* plo_snd[], phi_snd[]: The local indices of the data patch
1286
* that will be sent to the remote processor. Note that the
1287
* dimensions of the patches represented by plo_rec[], plo_rec[] and
1288
* plo_snd[], phi_snd[] must be the same.
1291
/* if global array has no ghost cells, just return */
1292
if (!ga_has_ghosts_(g_a)) return TRUE;
1294
/* Check to make sure that global array is well-behaved (all processors
1295
have data and the width of the data in each dimension is greater
1296
than the corresponding value in width[]. */
1297
if (!gai_check_ghost_distr(g_a)) return FALSE;
1299
size = GA[handle].elemsize;
1300
ndim = GA[handle].ndim;
1301
p_handle = GA[handle].p_handle;
1302
cache_size = 2*sizeof(char *)+3*ndim*sizeof(int)+3*sizeof(Integer);
1303
cache_size = 2* ndim *((cache_size/8) + 1) + 1;
1304
GA[handle].cache = (double *)malloc(sizeof(double)*cache_size);
1305
cache = (char *)GA[handle].cache;
1306
corner_flag = GA[handle].corner_flag;
1308
printf("p[%d]a cache_size: %d\n",GAme,cache_size);
1311
/* initialize range increments and get array dimensions */
1313
nga_distribution_(g_a,&me,lo_loc,hi_loc);
1314
for (idx=0; idx < ndim; idx++) {
1316
width[idx] = (Integer)GA[handle].width[idx];
1317
dims[idx] = (Integer)GA[handle].dims[idx];
1318
if (lo_loc[idx] == 0 && hi_loc[idx] == -1) {
1319
*(char**)cache = NULL;
1324
/* Get indices of processor in virtual grid */
1325
nga_proc_topology_(g_a, &me, index);
1327
/* Try to find maximum size of message that will be sent during
1328
* update operations and use this to allocate memory for message
1329
* passing buffers. */
1331
for (i=0; i<ndim; i++) {
1332
buftot *= (hi_loc[i]-lo_loc[i] + 1 + 2*width[i]);
1335
for (i = 0; i < ndim; i++) {
1336
idir = hi_loc[i] - lo_loc[i] + 1;
1337
if (buflen < (buftot/(idir + 2*width[i]))*width[i]) {
1338
buflen = (buftot/(idir + 2*width[i]))*width[i];
1341
bufsize = (Integer*)cache;
1343
printf("p[%d]a initial pointer: %d\n",GAme,(Integer)bufsize);
1346
current = (char*)(bufsize+1);
1348
*bufsize = size*buflen;
1350
printf("p[%d]a buflen: %d size: %d bufsize: %d\n",GAme,buflen,size,*bufsize);
1354
/* loop over dimensions for sequential update using shift algorithm */
1355
for (idx=0; idx < ndim; idx++) {
1357
/* Do not bother with update if nwidth is zero */
1358
if (width[idx] != 0) {
1360
ptr_snd = (char**)current;
1361
ptr_rcv = (char**)(ptr_snd+1);
1362
proc_rem_snd = (Integer*)(ptr_rcv+1);
1363
proc_rem_rcv = (Integer*)(proc_rem_snd+1);
1364
stride_snd = (int*)(proc_rem_rcv+1);
1365
stride_rcv = (int*)(stride_snd+ndim);
1366
length = (Integer*)(stride_rcv+ndim);
1367
count = (int*)(length+1);
1368
current = (char*)(count+ndim);
1370
/* Perform update in negative direction. */
1371
get_remote_block_neg(idx, ndim, lo_loc, hi_loc, slo_rcv, shi_rcv,
1373
/* locate processor with this data */
1374
if (!nga_locate_region_(g_a, slo_rcv, shi_rcv, _ga_map,
1375
GA_proclist, &np)) ga_RegionError(ga_ndim_(g_a),
1376
slo_rcv, shi_rcv, *g_a);
1377
*proc_rem_snd = GA_proclist[0];
1378
if (p_handle >= 0) {
1379
*proc_rem_snd = PGRP_LIST[p_handle].inv_map_proc_list[*proc_rem_snd];
1382
/* Find processor from which data will be recieved */
1383
for (i = 0; i < ndim; i++) {
1385
lo_rcv[i] = hi_loc[i] + 1;
1386
hi_rcv[i] = hi_loc[i] + width[i];
1388
lo_rcv[i] = lo_loc[i];
1389
hi_rcv[i] = hi_loc[i];
1393
/* Account for boundaries, if necessary. */
1394
for (i=0; i<ndim; i++) {
1396
if (hi_rcv[i] > dims[i]) {
1398
shi_rcv[i] = width[i];
1400
slo_rcv[i] = lo_rcv[i];
1401
shi_rcv[i] = hi_rcv[i];
1404
slo_rcv[i] = lo_rcv[i];
1405
shi_rcv[i] = hi_rcv[i];
1408
/* locate processor with this data */
1409
if (!nga_locate_region_(g_a, slo_rcv, shi_rcv, _ga_map,
1410
GA_proclist, &np)) ga_RegionError(ga_ndim_(g_a),
1411
slo_rcv, shi_rcv, *g_a);
1412
*proc_rem_rcv = GA_proclist[0];
1413
if (p_handle >= 0) {
1414
*proc_rem_rcv = PGRP_LIST[p_handle].inv_map_proc_list[*proc_rem_rcv];
1417
/* Get actual coordinates of chunk of data that will be sent to
1418
* remote processor as well as coordinates of the array space that
1419
* will receive data from remote processor. */
1420
for (i = 0; i < ndim; i++) {
1421
if (increment[i] == 0) {
1423
plo_snd[i] = width[i];
1424
phi_snd[i] = 2*width[i] - 1;
1425
plo_rcv[i] = hi_loc[i] - lo_loc[i] + width[i] + 1;
1426
phi_rcv[i] = hi_loc[i] - lo_loc[i] + 2*width[i];
1428
plo_snd[i] = width[i];
1429
phi_snd[i] = hi_loc[i] - lo_loc[i] + width[i];
1430
plo_rcv[i] = width[i];
1431
phi_rcv[i] = hi_loc[i] - lo_loc[i] + width[i];
1435
phi_rcv[i] = hi_loc[i] - lo_loc[i] + increment[i];
1437
phi_snd[i] = hi_loc[i] - lo_loc[i] + increment[i];
1441
/* Get pointer to local data buffer and remote data
1442
buffer as well as lists of leading dimenstions */
1443
gam_LocationWithGhosts(me, handle, plo_snd, ptr_snd, ld_loc);
1445
printf("p[%d]a 1: plo_snd[0]: %d plo_snd[1]: %d ptr_snd: %d\n",
1446
GAme, plo_snd[0], plo_snd[1], (Integer)*ptr_snd);
1449
gam_LocationWithGhosts(me, handle, plo_rcv, ptr_rcv, ld_loc);
1451
printf("p[%d]a 1: plo_rcv[0]: %d plo_rcv[1]: %d ptr_rcv: %d\n",
1452
GAme, plo_rcv[0], plo_rcv[1], (Integer)*ptr_rcv);
1456
/* Evaluate strides for send and recieve */
1457
gam_setstride(ndim, size, ld_loc, ld_loc, stride_rcv,
1460
/* Compute the number of elements in each dimension and store
1461
result in count. Scale the first element in count by the
1463
gam_ComputeCount(ndim, plo_rcv, phi_rcv, count);
1464
gam_CountElems(ndim, plo_snd, phi_snd, length);
1469
printf("p[%d]a 1: length: %d bufsize: %d proc_rem_snd: %d proc_rem_rcv: %d\n",
1470
GAme, *length, *bufsize, (int)*proc_rem_snd, (int)*proc_rem_rcv);
1471
printf("p[%d]a 1: count[0]: %d stride_rcv[0]: %d stride_rcv[1]: %d\n",
1472
GAme, count[0], stride_rcv[0],stride_rcv[1]);
1473
printf("p[%d]a 1: count[1]: %d stride_rcv[2]: %d stride_rcv[3]: %d\n",
1474
GAme, count[1], stride_rcv[2],stride_rcv[3]);
1475
printf("p[%d]a 1: count[2]: %d stride_snd[0]: %d stride_snd[1]: %d\n",
1476
GAme, count[2], stride_snd[0],stride_snd[1]);
1477
printf("p[%d]a 1: count[3]: %d stride_snd[2]: %d stride_snd[3]: %d\n",
1478
GAme, count[3], stride_snd[2],stride_snd[3]);
1482
ptr_snd = (char**)current;
1483
ptr_rcv = (char**)(ptr_snd+1);
1484
proc_rem_snd = (Integer*)(ptr_rcv+1);
1485
proc_rem_rcv = (Integer*)(proc_rem_snd+1);
1486
stride_snd = (int*)(proc_rem_rcv+1);
1487
stride_rcv = (int*)(stride_snd+ndim);
1488
length = (Integer*)(stride_rcv+ndim);
1489
count = (int*)(length+1);
1490
current = (char*)(count+ndim);
1492
/* Find parameters for message in positive direction. */
1493
get_remote_block_pos(idx, ndim, lo_loc, hi_loc, slo_rcv, shi_rcv,
1495
/* locate processor with this data */
1496
if (!nga_locate_region_(g_a, slo_rcv, shi_rcv, _ga_map,
1497
GA_proclist, &np)) ga_RegionError(ga_ndim_(g_a),
1498
slo_rcv, shi_rcv, *g_a);
1499
*proc_rem_snd = GA_proclist[0];
1500
if (p_handle >= 0) {
1501
*proc_rem_snd = PGRP_LIST[p_handle].inv_map_proc_list[*proc_rem_snd];
1504
/* Find processor from which data will be recieved */
1505
for (i = 0; i < ndim; i++) {
1507
lo_rcv[i] = lo_loc[i] - width[i];
1508
hi_rcv[i] = lo_loc[i] - 1;
1510
lo_rcv[i] = lo_loc[i];
1511
hi_rcv[i] = hi_loc[i];
1515
/* Account for boundaries, if necessary. */
1516
for (i=0; i<ndim; i++) {
1518
if (hi_rcv[i] < 1) {
1519
slo_rcv[i] = dims[i] - width[i] + 1;
1520
shi_rcv[i] = dims[i];
1522
slo_rcv[i] = lo_rcv[i];
1523
shi_rcv[i] = hi_rcv[i];
1526
slo_rcv[i] = lo_rcv[i];
1527
shi_rcv[i] = hi_rcv[i];
1530
/* locate processor with this data */
1531
if (!nga_locate_region_(g_a, slo_rcv, shi_rcv, _ga_map,
1532
GA_proclist, &np)) ga_RegionError(ga_ndim_(g_a),
1533
slo_rcv, shi_rcv, *g_a);
1534
*proc_rem_rcv = GA_proclist[0];
1535
if (p_handle >= 0) {
1536
*proc_rem_rcv = PGRP_LIST[p_handle].inv_map_proc_list[*proc_rem_rcv];
1538
/* Get actual coordinates of chunk of data that will be sent to
1539
* remote processor as well as coordinates of the array space that
1540
* will receive data from remote processor. */
1541
for (i = 0; i < ndim; i++) {
1542
if (increment[i] == 0) {
1544
plo_snd[i] = hi_loc[i] - lo_loc[i] + 1;
1545
phi_snd[i] = hi_loc[i] - lo_loc[i] + width[i];
1547
phi_rcv[i] = width[i] - 1;
1549
plo_snd[i] = width[i];
1550
phi_snd[i] = hi_loc[i] - lo_loc[i] + width[i];
1551
plo_rcv[i] = width[i];
1552
phi_rcv[i] = hi_loc[i] - lo_loc[i] + width[i];
1556
phi_rcv[i] = hi_loc[i] - lo_loc[i] + increment[i];
1558
phi_snd[i] = hi_loc[i] - lo_loc[i] + increment[i];
1562
/* Get pointer to local data buffer and remote data
1563
buffer as well as lists of leading dimenstions */
1564
gam_LocationWithGhosts(me, handle, plo_snd, ptr_snd, ld_loc);
1566
printf("p[%d]a 2: plo_snd[0]: %d plo_snd[1]: %d ptr_snd: %d\n",
1567
GAme, plo_snd[0], plo_snd[1], (Integer)*ptr_snd);
1570
gam_LocationWithGhosts(me, handle, plo_rcv, ptr_rcv, ld_loc);
1572
printf("p[%d]a 2: plo_rcv[0]: %d plo_rcv[1]: %d ptr_rcv: %d\n",
1573
GAme, plo_rcv[0], plo_rcv[1], (Integer)*ptr_rcv);
1577
/* Evaluate strides for send and recieve */
1578
gam_setstride(ndim, size, ld_loc, ld_loc, stride_rcv,
1581
/* Compute the number of elements in each dimension and store
1582
result in count. Scale the first element in count by the
1584
gam_ComputeCount(ndim, plo_rcv, phi_rcv, count);
1585
gam_CountElems(ndim, plo_snd, phi_snd, length);
1589
printf("p[%d]a 2: length: %d bufsize: %d proc_rem_snd: %d proc_rem_rcv: %d\n",
1590
GAme, *length, *bufsize, (int)*proc_rem_snd, (int)*proc_rem_rcv);
1591
printf("p[%d]a 2: count[0]: %d stride_rcv[0]: %d stride_rcv[1]: %d\n",
1592
GAme, count[0], stride_rcv[0],stride_rcv[1]);
1593
printf("p[%d]a 2: count[1]: %d stride_rcv[2]: %d stride_rcv[3]: %d\n",
1594
GAme, count[1], stride_rcv[2],stride_rcv[3]);
1595
printf("p[%d]a 2: count[2]: %d stride_snd[0]: %d stride_snd[1]: %d\n",
1596
GAme, count[2], stride_snd[0],stride_snd[1]);
1597
printf("p[%d]a 2: count[3]: %d stride_snd[2]: %d stride_snd[3]: %d\n",
1598
GAme, count[3], stride_snd[2],stride_snd[3]);
1603
if (GA[handle].corner_flag)
1604
increment[idx] = 2*width[idx];
1607
printf("p[%d]a final pointer: %d\n",GAme,(Integer)(Integer*)current);
1613
/*\ UPDATE GHOST CELLS OF GLOBAL ARRAY USING SHIFT ALGORITHM AND
1616
logical FATR ga_update4_ghosts_(Integer *g_a)
1618
Integer idx, i, handle=GA_OFFSET + *g_a;
1619
Integer *size, bufsize, buflen, ndim, elemsize;
1620
Integer *proc_rem_snd, *proc_rem_rcv, pmax;
1621
Integer msgcnt, *length, msglen;
1622
Integer index[MAXDIM], width[MAXDIM];
1623
int *stride_snd, *stride_rcv, *count;
1624
char **ptr_snd, **ptr_rcv, *cache, *current;
1625
char send_name[32], rcv_name[32];
1626
void *snd_ptr, *rcv_ptr, *snd_ptr_orig, *rcv_ptr_orig;
1627
Integer me = ga_nodeid_();
1630
/* This routine makes use of the shift algorithm to update data in the
1631
* ghost cells bounding the local block of visible data. The shift
1632
* algorithm starts by updating the blocks of data along the first
1633
* dimension by grabbing a block of data that is width[0] deep but
1634
* otherwise matches the dimensions of the data residing on the
1635
* calling processor. The update of the second dimension, however,
1636
* grabs a block that is width[1] deep in the second dimension but is
1637
* ldim0 + 2*width[0] in the first dimensions where ldim0 is the
1638
* size of the visible data along the first dimension. The remaining
1639
* dimensions are left the same. For the next update, the width of the
1640
* second dimension is also increased by 2*width[1] and so on. This
1641
* algorith makes use of the fact that data for the dimensions that
1642
* have already been updated is available on each processor and can be
1643
* used in the updates of subsequent dimensions. The total number of
1644
* separate updates is 2*ndim, an update in the negative and positive
1645
* directions for each dimension.
1647
* This implementation make use of explicit message passing to perform
1648
* the update. Separate message types for the updates in each coordinate
1649
* direction are used to maintain synchronization locally and to
1650
* guarantee that the data is present before the updates in a new
1651
* coordinate direction take place.
1654
/* if global array has no ghost cells, just return */
1655
if (!ga_has_ghosts_(g_a)) return TRUE;
1657
ndim = GA[handle].ndim;
1658
p_handle = GA[handle].p_handle;
1659
cache = (char *)GA[handle].cache;
1660
elemsize = GA[handle].elemsize;
1661
for (i=0; i<ndim; i++) {
1662
width[i] = (Integer)GA[handle].width[i];
1665
GA_PUSH_NAME("ga_update4_ghosts");
1668
/* Get indices of processor in virtual grid */
1669
nga_proc_topology_(g_a, &me, index);
1671
size = (Integer*)cache;
1672
current = (char*)(size+1);
1674
buflen = bufsize/elemsize;
1676
printf("p[%d] bufsize: %d buflen: %d\n", GAme, bufsize, buflen);
1680
strcpy(send_name,"send_buffer");
1681
strcpy(rcv_name,"receive_buffer");
1682
snd_ptr_orig = snd_ptr = ga_malloc(buflen, GA[handle].type, send_name);
1683
rcv_ptr_orig = rcv_ptr = ga_malloc(buflen, GA[handle].type, rcv_name);
1685
/* loop over dimensions for sequential update using shift algorithm */
1686
for (idx=0; idx < ndim; idx++) {
1688
/* Do not bother with update if nwidth is zero */
1689
if (width[idx] != 0) {
1691
/* send messages in negative direction */
1692
snd_ptr = snd_ptr_orig;
1693
rcv_ptr = rcv_ptr_orig;
1695
ptr_snd = (char**)current;
1696
ptr_rcv = (char**)(ptr_snd+1);
1697
proc_rem_snd = (Integer*)(ptr_rcv+1);
1698
proc_rem_rcv = (Integer*)(proc_rem_snd+1);
1699
stride_snd = (int*)(proc_rem_rcv+1);
1700
stride_rcv = (int*)(stride_snd+ndim);
1701
length = (Integer*)(stride_rcv+ndim);
1702
count = (int*)(length+1);
1703
current = (char*)(count+ndim);
1706
printf("p[%d] 1: ptr_snd: %d ptr_rcv: %d\n", GAme, (Integer)*ptr_snd,
1708
printf("p[%d] 1: length: %d proc_rem_snd: %d proc_rem_rcv: %d\n",
1709
GAme, (int)*length, (int)*proc_rem_snd, (int)*proc_rem_rcv);
1710
printf("p[%d] 1: count[0]: %d stride_rcv[0]: %d stride_rcv[1]: %d\n",
1711
GAme, count[0], stride_rcv[0],stride_rcv[1]);
1712
printf("p[%d] 1: count[1]: %d stride_rcv[2]: %d stride_rcv[3]: %d\n",
1713
GAme, count[1], stride_rcv[2],stride_rcv[3]);
1714
printf("p[%d] 1: count[2]: %d stride_snd[0]: %d stride_snd[1]: %d\n",
1715
GAme, count[2], stride_snd[0],stride_snd[1]);
1716
printf("p[%d] 1: count[3]: %d stride_snd[2]: %d stride_snd[3]: %d\n",
1717
GAme, count[3], stride_snd[2],stride_snd[3]);
1718
printf("p[%d] 1: snd_ptr: %d rcv_ptr: %d\n", GAme, (Integer)snd_ptr,
1722
/* Fill send buffer with data. */
1723
armci_write_strided(*ptr_snd, (int)ndim-1, stride_snd, count, snd_ptr);
1725
printf("p[%d] completed armci_write_strided\n",GAme);
1729
/* Send Messages. If processor has odd index in direction idx, it
1730
* sends message first, if processor has even index it receives
1731
* message first. Then process is reversed. Also need to account
1732
* for whether or not there are an odd number of processors along
1733
* update direction. */
1735
if (GAme != *proc_rem_snd) {
1736
if (GA[handle].nblock[idx]%2 == 0) {
1737
if (index[idx]%2 != 0) {
1738
armci_msg_snd(msgcnt, snd_ptr, *length, *proc_rem_snd);
1740
armci_msg_rcv(msgcnt, rcv_ptr, bufsize, &msglen, *proc_rem_rcv);
1742
if (index[idx]%2 != 0) {
1743
armci_msg_rcv(msgcnt, rcv_ptr, bufsize, &msglen, *proc_rem_rcv);
1745
armci_msg_snd(msgcnt, snd_ptr, *length, *proc_rem_snd);
1748
pmax = GA[handle].nblock[idx] - 1;
1749
if (index[idx]%2 != 0) {
1750
armci_msg_snd(msgcnt, snd_ptr, *length, *proc_rem_snd);
1751
} else if (index[idx] != pmax) {
1752
armci_msg_rcv(msgcnt, rcv_ptr, bufsize, &msglen, *proc_rem_rcv);
1754
if (index[idx]%2 != 0) {
1755
armci_msg_rcv(msgcnt, rcv_ptr, bufsize, &msglen, *proc_rem_rcv);
1756
} else if (index[idx] != 0) {
1757
armci_msg_snd(msgcnt, snd_ptr, *length, *proc_rem_snd);
1759
/* make up for odd processor at end of string */
1760
if (index[idx] == 0) {
1761
armci_msg_snd(msgcnt, snd_ptr, *length, *proc_rem_snd);
1763
if (index[idx] == pmax) {
1764
armci_msg_rcv(msgcnt, rcv_ptr, bufsize, &msglen, *proc_rem_rcv);
1771
/* copy data back into global array */
1772
armci_read_strided(*ptr_rcv, (int)ndim-1, stride_rcv, count, rcv_ptr);
1774
printf("p[%d] completed armci_read_strided\n",GAme);
1778
/* send messages in positive direction */
1779
snd_ptr = snd_ptr_orig;
1780
rcv_ptr = rcv_ptr_orig;
1782
ptr_snd = (char**)current;
1783
ptr_rcv = (char**)(ptr_snd+1);
1784
proc_rem_snd = (Integer*)(ptr_rcv+1);
1785
proc_rem_rcv = (Integer*)(proc_rem_snd+1);
1786
stride_snd = (int*)(proc_rem_rcv+1);
1787
stride_rcv = (int*)(stride_snd+ndim);
1788
length = (Integer*)(stride_rcv+ndim);
1789
count = (int*)(length+1);
1790
current = (char*)(count+ndim);
1793
printf("p[%d] 2: ptr_snd: %d ptr_rcv: %d\n", GAme, (Integer)*ptr_snd,
1795
printf("p[%d] 2: length: %d proc_rem_snd: %d proc_rem_rcv: %d\n",
1796
GAme, (int)*length, (int)*proc_rem_snd, (int)*proc_rem_rcv);
1797
printf("p[%d] 2: count[0]: %d stride_rcv[0]: %d stride_rcv[1]: %d\n",
1798
GAme, count[0], stride_rcv[0],stride_rcv[1]);
1799
printf("p[%d] 2: count[1]: %d stride_rcv[2]: %d stride_rcv[3]: %d\n",
1800
GAme, count[1], stride_rcv[2],stride_rcv[3]);
1801
printf("p[%d] 2: count[2]: %d stride_snd[0]: %d stride_snd[1]: %d\n",
1802
GAme, count[2], stride_snd[0],stride_snd[1]);
1803
printf("p[%d] 2: count[3]: %d stride_snd[2]: %d stride_snd[3]: %d\n",
1804
GAme, count[3], stride_snd[2],stride_snd[3]);
1805
printf("p[%d] 2: snd_ptr: %d rcv_ptr: %d\n", GAme, (Integer)snd_ptr,
1809
/* Fill send buffer with data. */
1810
armci_write_strided(*ptr_snd, (int)ndim-1, stride_snd, count, snd_ptr);
1812
/* Send Messages. If processor has odd index in direction idx, it
1813
* sends message first, if processor has even index it receives
1814
* message first. Then process is reversed. Also need to account
1815
* for whether or not there are an odd number of processors along
1816
* update direction. */
1818
if (GAme != *proc_rem_rcv) {
1819
if (GA[handle].nblock[idx]%2 == 0) {
1820
if (index[idx]%2 != 0) {
1821
armci_msg_snd(msgcnt, snd_ptr, *length, *proc_rem_snd);
1823
armci_msg_rcv(msgcnt, rcv_ptr, bufsize, &msglen, *proc_rem_rcv);
1825
if (index[idx]%2 != 0) {
1826
armci_msg_rcv(msgcnt, rcv_ptr, bufsize, &msglen, *proc_rem_rcv);
1828
armci_msg_snd(msgcnt, snd_ptr, *length, *proc_rem_snd);
1831
pmax = GA[handle].nblock[idx] - 1;
1832
if (index[idx]%2 != 0) {
1833
armci_msg_snd(msgcnt, snd_ptr, *length, *proc_rem_snd);
1834
} else if (index[idx] != 0) {
1835
armci_msg_rcv(msgcnt, rcv_ptr, bufsize, &msglen, *proc_rem_rcv);
1837
if (index[idx]%2 != 0) {
1838
armci_msg_rcv(msgcnt, rcv_ptr, bufsize, &msglen, *proc_rem_rcv);
1839
} else if (index[idx] != pmax) {
1840
armci_msg_snd(msgcnt, snd_ptr, *length, *proc_rem_snd);
1842
/* make up for odd processor at end of string */
1843
if (index[idx] == pmax) {
1844
armci_msg_snd(msgcnt, snd_ptr, length, *proc_rem_snd);
1846
if (index[idx] == 0) {
1847
armci_msg_rcv(msgcnt, rcv_ptr, bufsize, &msglen, *proc_rem_rcv);
1853
/* copy data back into global array */
1854
armci_read_strided(*ptr_rcv, (int)ndim-1, stride_rcv, count, rcv_ptr);
1859
ga_free(rcv_ptr_orig);
1860
ga_free(snd_ptr_orig);
1865
/*
 * ga_update44_ghosts_: collective update of the ghost-cell regions of global
 * array *g_a using the shift algorithm implemented with explicit two-sided
 * message passing (armci_msg_snd / armci_msg_rcv). For each dimension with a
 * nonzero ghost width it exchanges a negative-direction and a
 * positive-direction message with the neighboring processors located via
 * nga_locate_region_, staging data through ga_malloc'd send/receive buffers
 * (armci_write_strided packs the send patch, armci_read_strided unpacks the
 * received patch). Even/odd processor indices along the dimension alternate
 * send-first vs. receive-first to avoid deadlock, with extra handling for an
 * odd processor count (pmax end-of-chain cases). Returns TRUE on success,
 * FALSE if the array distribution is not ghost-compatible
 * (gai_check_ghost_distr).
 *
 * NOTE(review): this chunk is a damaged extraction — original line numbers
 * are interleaved as bare tokens and gaps in the numbering (e.g. 1869, 1974,
 * 2118, 2342-2345) show that lines were dropped: closing braces, `} else {`
 * lines, the initializations of buftot/buflen/msgcnt, the header comment's
 * terminating `*\/`, GA_POP_NAME and the final `return TRUE;` are all
 * missing. Restore from a pristine ghosts.c before compiling.
 * NOTE(review): GA_PUSH_NAME below pushes the string "ga_update4_ghosts"
 * although this function is ga_update44_ghosts_ — presumably a stale
 * copy/paste; confirm against the original file.
 * NOTE(review): p_handle is assigned but its declaration is not visible here;
 * presumably a file-scope or dropped local — verify.
 * NOTE(review): the many printf dumps of pointers/counts look like leftover
 * debug tracing (likely guarded by a dropped #ifdef DEBUG in the original).
 */
/*\ UPDATE GHOST CELLS OF GLOBAL ARRAY USING SHIFT ALGORITHM AND
1868
logical FATR ga_update44_ghosts_(Integer *g_a)
1870
Integer idx, idir, i, np, handle=GA_OFFSET + *g_a;
1871
Integer size, buflen, buftot, bufsize, ndim, increment[MAXDIM];
1872
Integer proc_rem_snd, proc_rem_rcv, pmax;
1873
Integer msgcnt, length, msglen;
1874
Integer width[MAXDIM], dims[MAXDIM], index[MAXDIM];
1875
Integer lo_loc[MAXDIM], hi_loc[MAXDIM];
1876
Integer plo_snd[MAXDIM], phi_snd[MAXDIM];
1877
Integer lo_rcv[MAXDIM], hi_rcv[MAXDIM];
1878
Integer slo_rcv[MAXDIM], shi_rcv[MAXDIM];
1879
Integer plo_rcv[MAXDIM], phi_rcv[MAXDIM];
1880
Integer ld_loc[MAXDIM];
1881
int stride_snd[MAXDIM], stride_rcv[MAXDIM],count[MAXDIM];
1882
char *ptr_snd, *ptr_rcv;
1883
char send_name[32], rcv_name[32];
1884
void *snd_ptr, *rcv_ptr, *snd_ptr_orig, *rcv_ptr_orig;
1885
Integer me = ga_nodeid_();
1888
/* This routine makes use of the shift algorithm to update data in the
1889
* ghost cells bounding the local block of visible data. The shift
1890
* algorithm starts by updating the blocks of data along the first
1891
* dimension by grabbing a block of data that is width[0] deep but
1892
* otherwise matches the dimensions of the data residing on the
1893
* calling processor. The update of the second dimension, however,
1894
* grabs a block that is width[1] deep in the second dimension but is
1895
* ldim0 + 2*width[0] in the first dimensions where ldim0 is the
1896
* size of the visible data along the first dimension. The remaining
1897
* dimensions are left the same. For the next update, the width of the
1898
* second dimension is also increased by 2*width[1] and so on. This
1899
* algorith makes use of the fact that data for the dimensions that
1900
* have already been updated is available on each processor and can be
1901
* used in the updates of subsequent dimensions. The total number of
1902
* separate updates is 2*ndim, an update in the negative and positive
1903
* directions for each dimension.
1905
* This implementation make use of explicit message passing to perform
1906
* the update. Separate message types for the updates in each coordinate
1907
* direction are used to maintain synchronization locally and to
1908
* guarantee that the data is present before the updates in a new
1909
* coordinate direction take place.
1911
* To perform the update, this routine makes use of several copies of
1912
* indices marking the upper and lower limits of data. Indices
1913
* beginning with the character "p" are relative indices marking the
1914
* location of the data set relative to the origin the local patch of
1915
* the global array, all other indices are in absolute coordinates and
1916
* mark locations in the total global array. The indices used by this
1917
* routine are described below.
1919
* lo_loc[], hi_loc[]: The lower and upper indices of the visible
1920
* block of data held by the calling processor.
1922
* lo_rcv[], hi_rcv[]: The lower and upper indices of the blocks
1923
* of data that will be either sent to or received from a remote
1924
* processor. These indices are NOT corrected for wrap-around
1925
* (periodic) boundary conditions so they can be negative or greater
1926
* than the array dimension values held in dims[].
1928
* slo_rcv[], shi_rcv[]: Similar to lo_rcv[] and hi_rcv[], except
1929
* that these indices have been corrected for wrap-around
1930
* boundary conditions.
1932
* plo_rcv[], phi_rcv[]: The local indices of the local data patch
1933
* that receive that message from the remote processor.
1935
* plo_snd[], phi_snd[]: The local indices of the data patch
1936
* that will be sent to the remote processor. Note that the
1937
* dimensions of the patches represented by plo_rec[], plo_rec[] and
1938
* plo_snd[], phi_snd[] must be the same.
1941
/* if global array has no ghost cells, just return */
1942
if (!ga_has_ghosts_(g_a)) return TRUE;
1944
size = GA[handle].elemsize;
1945
ndim = GA[handle].ndim;
1946
p_handle = GA[handle].p_handle;
1948
/* initialize range increments and get array dimensions */
1949
for (idx=0; idx < ndim; idx++) {
1951
width[idx] = (Integer)GA[handle].width[idx];
1952
dims[idx] = (Integer)GA[handle].dims[idx];
1955
/* Check to make sure that global array is well-behaved (all processors
1956
have data and the width of the data in each dimension is greater
1957
than the corresponding value in width[]. */
1958
if (!gai_check_ghost_distr(g_a)) return FALSE;
1960
GA_PUSH_NAME("ga_update4_ghosts");
1963
/* obtain range of data that is held by local processor */
1964
nga_distribution_(g_a,&me,lo_loc,hi_loc);
1965
/* Get indices of processor in virtual grid */
1966
nga_proc_topology_(g_a, &me, index);
1968
/* Try to find maximum size of message that will be sent during
1969
* update operations and use this to allocate memory for message
1970
* passing buffers. */
1972
for (i=0; i<ndim; i++) {
1973
buftot *= (hi_loc[i]-lo_loc[i] + 1 + 2*width[i]);
1976
for (i = 0; i < ndim; i++) {
1977
idir = hi_loc[i] - lo_loc[i] + 1;
1978
if (buflen < (buftot/(idir + 2*width[i]))*width[i]) {
1979
buflen = (buftot/(idir + 2*width[i]))*width[i];
1982
bufsize = size*buflen;
1983
strcpy(send_name,"send_buffer");
1984
strcpy(rcv_name,"receive_buffer");
1985
snd_ptr_orig = snd_ptr = ga_malloc(buflen, GA[handle].type, send_name);
1986
rcv_ptr_orig = rcv_ptr = ga_malloc(buflen, GA[handle].type, rcv_name);
1988
/* loop over dimensions for sequential update using shift algorithm */
1989
for (idx=0; idx < ndim; idx++) {
1991
/* Do not bother with update if nwidth is zero */
1992
if (width[idx] != 0) {
1994
/* Perform update in negative direction. */
1995
get_remote_block_neg(idx, ndim, lo_loc, hi_loc, slo_rcv, shi_rcv,
1997
/* locate processor with this data */
1998
if (!nga_locate_region_(g_a, slo_rcv, shi_rcv, _ga_map,
1999
GA_proclist, &np)) ga_RegionError(ga_ndim_(g_a),
2000
slo_rcv, shi_rcv, *g_a);
2001
proc_rem_snd = GA_proclist[0];
2002
if (p_handle >= 0) {
2003
proc_rem_snd = PGRP_LIST[p_handle].inv_map_proc_list[proc_rem_snd];
2006
/* Find processor from which data will be recieved */
2007
for (i = 0; i < ndim; i++) {
2009
lo_rcv[i] = hi_loc[i] + 1;
2010
hi_rcv[i] = hi_loc[i] + width[i];
2012
lo_rcv[i] = lo_loc[i];
2013
hi_rcv[i] = hi_loc[i];
2017
/* Account for boundaries, if necessary. */
2018
for (i=0; i<ndim; i++) {
2020
if (hi_rcv[i] > dims[i]) {
2022
shi_rcv[i] = width[i];
2024
slo_rcv[i] = lo_rcv[i];
2025
shi_rcv[i] = hi_rcv[i];
2028
slo_rcv[i] = lo_rcv[i];
2029
shi_rcv[i] = hi_rcv[i];
2032
/* locate processor with this data */
2033
if (!nga_locate_region_(g_a, slo_rcv, shi_rcv, _ga_map,
2034
GA_proclist, &np)) ga_RegionError(ga_ndim_(g_a),
2035
slo_rcv, shi_rcv, *g_a);
2036
proc_rem_rcv = GA_proclist[0];
2037
if (p_handle >= 0) {
2038
proc_rem_rcv = PGRP_LIST[p_handle].inv_map_proc_list[proc_rem_rcv];
2041
/* Get actual coordinates of chunk of data that will be sent to
2042
* remote processor as well as coordinates of the array space that
2043
* will receive data from remote processor. */
2044
for (i = 0; i < ndim; i++) {
2045
if (increment[i] == 0) {
2047
plo_snd[i] = width[i];
2048
phi_snd[i] = 2*width[i] - 1;
2049
plo_rcv[i] = hi_loc[i] - lo_loc[i] + width[i] + 1;
2050
phi_rcv[i] = hi_loc[i] - lo_loc[i] + 2*width[i];
2052
plo_snd[i] = width[i];
2053
phi_snd[i] = hi_loc[i] - lo_loc[i] + width[i];
2054
plo_rcv[i] = width[i];
2055
phi_rcv[i] = hi_loc[i] - lo_loc[i] + width[i];
2059
phi_rcv[i] = hi_loc[i] - lo_loc[i] + increment[i];
2061
phi_snd[i] = hi_loc[i] - lo_loc[i] + increment[i];
2065
/* Get pointer to local data buffer and remote data
2066
buffer as well as lists of leading dimenstions */
2067
gam_LocationWithGhosts(me, handle, plo_snd, &ptr_snd, ld_loc);
2069
printf("p[%d] 1: plo_snd[0]: %d plo_snd[1]: %d ptr_snd: %d\n",
2070
GAme, plo_snd[0], plo_snd[1], (Integer)ptr_snd);
2073
gam_LocationWithGhosts(me, handle, plo_rcv, &ptr_rcv, ld_loc);
2075
printf("p[%d] 1: plo_rcv[0]: %d plo_rcv[1]: %d ptr_rcv: %d\n",
2076
GAme, plo_rcv[0], plo_rcv[1], (Integer)ptr_rcv);
2080
/* Evaluate strides for send and recieve */
2081
gam_setstride(ndim, size, ld_loc, ld_loc, stride_rcv,
2084
/* Compute the number of elements in each dimension and store
2085
result in count. Scale the first element in count by the
2087
gam_ComputeCount(ndim, plo_rcv, phi_rcv, count);
2088
gam_CountElems(ndim, plo_snd, phi_snd, &length);
2092
/* Fill send buffer with data. */
2094
printf("p[%d]b 1: ptr_snd: %d ptr_rcv: %d\n", GAme, (Integer)ptr_snd,
2096
printf("p[%d]b 1: length: %d proc_rem_snd: %d proc_rem_rcv: %d\n",
2097
GAme, (int)length, (int)proc_rem_snd, (int)proc_rem_rcv);
2098
printf("p[%d]b 1: count[0]: %d stride_rcv[0]: %d stride_rcv[1]: %d\n",
2099
GAme, count[0], stride_rcv[0],stride_rcv[1]);
2100
printf("p[%d]b 1: count[1]: %d stride_rcv[2]: %d stride_rcv[3]: %d\n",
2101
GAme, count[1], stride_rcv[2],stride_rcv[3]);
2102
printf("p[%d]b 1: count[2]: %d stride_snd[0]: %d stride_snd[1]: %d\n",
2103
GAme, count[2], stride_snd[0],stride_snd[1]);
2104
printf("p[%d]b 1: count[3]: %d stride_snd[2]: %d stride_snd[3]: %d\n",
2105
GAme, count[3], stride_snd[2],stride_snd[3]);
2106
printf("p[%d]b 1: snd_ptr: %d rcv_ptr: %d\n", GAme, (Integer)snd_ptr,
2110
armci_write_strided(ptr_snd, (int)ndim-1, stride_snd, count, snd_ptr);
2112
/* Send Messages. If processor has odd index in direction idx, it
2113
* sends message first, if processor has even index it receives
2114
* message first. Then process is reversed. Also need to account
2115
* for whether or not there are an odd number of processors along
2116
* update direction. */
2119
printf("p[%d] 1: msgcnt: %d length: %d bufsize: %d proc_rem_snd: %d proc_rem_rcv: %d\n",
2120
GAme, msgcnt, length, bufsize, (int)proc_rem_snd, (int)proc_rem_rcv);
2123
snd_ptr = snd_ptr_orig;
2124
rcv_ptr = rcv_ptr_orig;
2125
if (GAme != proc_rem_snd) {
2126
if (GA[handle].nblock[idx]%2 == 0) {
2127
if (index[idx]%2 != 0) {
2128
armci_msg_snd(msgcnt, snd_ptr, length, proc_rem_snd);
2130
armci_msg_rcv(msgcnt, rcv_ptr, bufsize, &msglen, proc_rem_rcv);
2132
if (index[idx]%2 != 0) {
2133
armci_msg_rcv(msgcnt, rcv_ptr, bufsize, &msglen, proc_rem_rcv);
2135
armci_msg_snd(msgcnt, snd_ptr, length, proc_rem_snd);
2138
pmax = GA[handle].nblock[idx] - 1;
2139
if (index[idx]%2 != 0) {
2140
armci_msg_snd(msgcnt, snd_ptr, length, proc_rem_snd);
2141
} else if (index[idx] != pmax) {
2142
armci_msg_rcv(msgcnt, rcv_ptr, bufsize, &msglen, proc_rem_rcv);
2144
if (index[idx]%2 != 0) {
2145
armci_msg_rcv(msgcnt, rcv_ptr, bufsize, &msglen, proc_rem_rcv);
2146
} else if (index[idx] != 0) {
2147
armci_msg_snd(msgcnt, snd_ptr, length, proc_rem_snd);
2149
/* make up for odd processor at end of string */
2150
if (index[idx] == 0) {
2151
armci_msg_snd(msgcnt, snd_ptr, length, proc_rem_snd);
2153
if (index[idx] == pmax) {
2154
armci_msg_rcv(msgcnt, rcv_ptr, bufsize, &msglen, proc_rem_rcv);
2161
/* copy data back into global array */
2162
armci_read_strided(ptr_rcv, (int)ndim-1, stride_rcv, count, rcv_ptr);
2164
/* Find parameters for message in positive direction. */
2165
get_remote_block_pos(idx, ndim, lo_loc, hi_loc, slo_rcv, shi_rcv,
2167
/* locate processor with this data */
2168
if (!nga_locate_region_(g_a, slo_rcv, shi_rcv, _ga_map,
2169
GA_proclist, &np)) ga_RegionError(ga_ndim_(g_a),
2170
slo_rcv, shi_rcv, *g_a);
2171
proc_rem_snd = GA_proclist[0];
2172
if (p_handle >= 0) {
2173
proc_rem_snd = PGRP_LIST[p_handle].inv_map_proc_list[proc_rem_snd];
2176
/* Find processor from which data will be recieved */
2177
for (i = 0; i < ndim; i++) {
2179
lo_rcv[i] = lo_loc[i] - width[i];
2180
hi_rcv[i] = lo_loc[i] - 1;
2182
lo_rcv[i] = lo_loc[i];
2183
hi_rcv[i] = hi_loc[i];
2187
/* Account for boundaries, if necessary. */
2188
for (i=0; i<ndim; i++) {
2190
if (hi_rcv[i] < 1) {
2191
slo_rcv[i] = dims[i] - width[i] + 1;
2192
shi_rcv[i] = dims[i];
2194
slo_rcv[i] = lo_rcv[i];
2195
shi_rcv[i] = hi_rcv[i];
2198
slo_rcv[i] = lo_rcv[i];
2199
shi_rcv[i] = hi_rcv[i];
2202
/* locate processor with this data */
2203
if (!nga_locate_region_(g_a, slo_rcv, shi_rcv, _ga_map,
2204
GA_proclist, &np)) ga_RegionError(ga_ndim_(g_a),
2205
slo_rcv, shi_rcv, *g_a);
2206
proc_rem_rcv = GA_proclist[0];
2207
if (p_handle >= 0) {
2208
proc_rem_rcv = PGRP_LIST[p_handle].inv_map_proc_list[proc_rem_rcv];
2210
/* Get actual coordinates of chunk of data that will be sent to
2211
* remote processor as well as coordinates of the array space that
2212
* will receive data from remote processor. */
2213
for (i = 0; i < ndim; i++) {
2214
if (increment[i] == 0) {
2216
plo_snd[i] = hi_loc[i] - lo_loc[i] + 1;
2217
phi_snd[i] = hi_loc[i] - lo_loc[i] + width[i];
2219
phi_rcv[i] = width[i] - 1;
2221
plo_snd[i] = width[i];
2222
phi_snd[i] = hi_loc[i] - lo_loc[i] + width[i];
2223
plo_rcv[i] = width[i];
2224
phi_rcv[i] = hi_loc[i] - lo_loc[i] + width[i];
2228
phi_rcv[i] = hi_loc[i] - lo_loc[i] + increment[i];
2230
phi_snd[i] = hi_loc[i] - lo_loc[i] + increment[i];
2234
/* Get pointer to local data buffer and remote data
2235
buffer as well as lists of leading dimenstions */
2236
gam_LocationWithGhosts(me, handle, plo_snd, &ptr_snd, ld_loc);
2238
printf("p[%d] 2: plo_snd[0]: %d plo_snd[1]: %d ptr_snd: %d\n",
2239
GAme, plo_snd[0], plo_snd[1], (Integer)ptr_snd);
2242
gam_LocationWithGhosts(me, handle, plo_rcv, &ptr_rcv, ld_loc);
2244
printf("p[%d] 2: plo_rcv[0]: %d plo_rcv[1]: %d ptr_rcv: %d\n",
2245
GAme, plo_rcv[0], plo_rcv[1], (Integer)ptr_rcv);
2249
/* Evaluate strides for send and recieve */
2250
gam_setstride(ndim, size, ld_loc, ld_loc, stride_rcv,
2253
/* Compute the number of elements in each dimension and store
2254
result in count. Scale the first element in count by the
2256
gam_ComputeCount(ndim, plo_rcv, phi_rcv, count);
2257
gam_CountElems(ndim, plo_snd, phi_snd, &length);
2261
/* Need to reallocate memory if length > buflen */
2264
/* Fill send buffer with data. */
2266
printf("p[%d]b 2: ptr_snd: %d ptr_rcv: %d\n", GAme, (Integer)ptr_snd,
2268
printf("p[%d]b 2: length: %d proc_rem_snd: %d proc_rem_rcv: %d\n",
2269
GAme, (int)length, (int)proc_rem_snd, (int)proc_rem_rcv);
2270
printf("p[%d]b 2: count[0]: %d stride_rcv[0]: %d stride_rcv[1]: %d\n",
2271
GAme, count[0], stride_rcv[0],stride_rcv[1]);
2272
printf("p[%d]b 2: count[1]: %d stride_rcv[2]: %d stride_rcv[3]: %d\n",
2273
GAme, count[1], stride_rcv[2],stride_rcv[3]);
2274
printf("p[%d]b 2: count[2]: %d stride_snd[0]: %d stride_snd[1]: %d\n",
2275
GAme, count[2], stride_snd[0],stride_snd[1]);
2276
printf("p[%d]b 2: count[3]: %d stride_snd[2]: %d stride_snd[3]: %d\n",
2277
GAme, count[3], stride_snd[2],stride_snd[3]);
2278
printf("p[%d]b 2: snd_ptr: %d rcv_ptr: %d\n", GAme, (Integer)snd_ptr,
2282
armci_write_strided(ptr_snd, (int)ndim-1, stride_snd, count, snd_ptr);
2284
/* Send Messages. If processor has odd index in direction idx, it
2285
* sends message first, if processor has even index it receives
2286
* message first. Then process is reversed. Also need to account
2287
* for whether or not there are an odd number of processors along
2288
* update direction. */
2291
printf("p[%d] 2: msgcnt: %d length: %d bufsize: %d proc_rem_snd: %d proc_rem_rcv: %d\n",
2292
GAme, msgcnt, length, bufsize, (int)proc_rem_snd, (int)proc_rem_rcv);
2295
snd_ptr = snd_ptr_orig;
2296
rcv_ptr = rcv_ptr_orig;
2297
if (GAme != proc_rem_rcv) {
2298
if (GA[handle].nblock[idx]%2 == 0) {
2299
if (index[idx]%2 != 0) {
2300
armci_msg_snd(msgcnt, snd_ptr, length, proc_rem_snd);
2302
armci_msg_rcv(msgcnt, rcv_ptr, bufsize, &msglen, proc_rem_rcv);
2304
if (index[idx]%2 != 0) {
2305
armci_msg_rcv(msgcnt, rcv_ptr, bufsize, &msglen, proc_rem_rcv);
2307
armci_msg_snd(msgcnt, snd_ptr, length, proc_rem_snd);
2310
pmax = GA[handle].nblock[idx] - 1;
2311
if (index[idx]%2 != 0) {
2312
armci_msg_snd(msgcnt, snd_ptr, length, proc_rem_snd);
2313
} else if (index[idx] != 0) {
2314
armci_msg_rcv(msgcnt, rcv_ptr, bufsize, &msglen, proc_rem_rcv);
2316
if (index[idx]%2 != 0) {
2317
armci_msg_rcv(msgcnt, rcv_ptr, bufsize, &msglen, proc_rem_rcv);
2318
} else if (index[idx] != pmax) {
2319
armci_msg_snd(msgcnt, snd_ptr, length, proc_rem_snd);
2321
/* make up for odd processor at end of string */
2322
if (index[idx] == pmax) {
2323
armci_msg_snd(msgcnt, snd_ptr, length, proc_rem_snd);
2325
if (index[idx] == 0) {
2326
armci_msg_rcv(msgcnt, rcv_ptr, bufsize, &msglen, proc_rem_rcv);
2332
/* copy data back into global array */
2333
armci_read_strided(ptr_rcv, (int)ndim-1, stride_rcv, count, rcv_ptr);
2336
if (GA[handle].corner_flag)
2337
increment[idx] = 2*width[idx];
2340
ga_free(rcv_ptr_orig);
2341
ga_free(snd_ptr_orig);
2346
/*
 * waitforflags: busy-wait helper for ga_update5_ghosts — spins until both
 * flag words *ptr1 and *ptr2 (read via gai_getval) become nonzero, i.e.
 * until both expected one-sided transfers have signaled completion; the
 * exp() call inside the loop produces a value to keep the spin loop from
 * being optimized away. Prints the flag addresses once set.
 *
 * NOTE(review): this extraction is missing lines (gaps in the interleaved
 * original numbering 2347→2350→2351→2354): the declarations of `val` and
 * `i`, the loop's closing brace, and the `return` of the declared double
 * result are not visible — restore from the pristine ghosts.c.
 */
/* Utility function for ga_update5_ghosts routine */
2347
double waitforflags (int *ptr1, int *ptr2) {
2350
while (gai_getval(ptr1) == 0 || gai_getval(ptr2) == 0) {
2351
val = exp(-(double)i++);
2354
printf("%d: flags set at %p and %p\n",GAme,ptr1,ptr2); fflush(stdout);
2359
/*
 * ARMCI_PutS_flag__: stub combining a strided put with a completion signal.
 * Performs ARMCI_PutS of the strided data described by
 * (src_ptr, src_stride_arr) -> (dst_ptr, dst_stride_arr, count,
 * stride_levels) on processor `proc`, then writes the int pointed to by
 * `val` into the remote location `flag` with a single ARMCI_Put so the
 * target can detect that the transfer finished.
 *
 * NOTE(review): extraction gaps (original numbering 2370→2372, 2372→2376,
 * 2383→…) — the declaration of `bytes`, the function's opening brace, and
 * the closing `return`/brace are not visible here; restore from the
 * pristine ghosts.c. Also note the data put and the flag put are two
 * separate operations; ordering presumably relies on ARMCI's in-order
 * delivery per target — confirm against the real ARMCI_PutS_flag semantics.
 */
/* Stub in new ARMCI_PutS_flag call until actual implementation is
2361
int ARMCI_PutS_flag__(
2362
void* src_ptr, /* pointer to 1st segment at source */
2363
int src_stride_arr[], /* array of strides at source */
2364
void* dst_ptr, /* pointer to 1st segment at destination */
2365
int dst_stride_arr[], /* array of strides at destination */
2366
int count[], /* number of units at each stride level,
2367
count[0] = #bytes */
2368
int stride_levels, /* number of stride levels */
2369
int *flag, /* pointer to remote flag */
2370
int *val, /* pointer to value to set flag upon completion of
2372
int proc /* remote process(or) ID */
2376
/* Put local data on remote processor */
2377
ARMCI_PutS(src_ptr, src_stride_arr, dst_ptr, dst_stride_arr,
2378
count, stride_levels, proc);
2380
/* Send signal to remote processor that data transfer has
2381
* been completed. */
2382
bytes = sizeof(int);
2383
ARMCI_Put(val, flag, bytes, proc);
2387
/*
 * ga_update55_ghosts_: collective ghost-cell update of *g_a using the shift
 * algorithm implemented with one-sided ARMCI put calls and flag-based
 * completion signaling (no barriers). For each dimension it pushes the local
 * boundary patch into the neighbors' ghost regions (negative then positive
 * direction) via ARMCI_PutS / ARMCI_PutS_flag, signals completion by putting
 * *GA_Update_Signal into the target's GA_Update_Flags slot, and spins in
 * waitforflags() until its own two flags for the dimension are set before
 * moving to the next dimension. Flags are zeroed at the end for the next
 * update. Returns TRUE on success, FALSE for a bad distribution.
 *
 * NOTE(review): damaged extraction — interleaved original line numbers with
 * gaps (e.g. 2391, 2403, 2500-2503, 2559-2560, 2647-2648) show dropped
 * lines: `msgcnt`/`bytes`/`p_handle` declarations and increments, `} else {`
 * lines, closing braces, GA_POP_NAME and `return TRUE;` are all missing.
 * NOTE(review): as extracted, lo_loc[]/hi_loc[] are tested in the width loop
 * (orig line 2480) BEFORE nga_distribution_ fills them (orig line 2493) —
 * presumably the distribution call appeared earlier in the original;
 * verify against the pristine ghosts.c.
 * NOTE(review): the choice between the raw ARMCI_PutS+ARMCI_Put pair and
 * the single ARMCI_PutS_flag call was presumably an #ifdef in the original
 * (both variants appear back to back here) — confirm.
 */
/*\ UPDATE GHOST CELLS OF GLOBAL ARRAY USING SHIFT ALGORITHM AND PUT CALLS
2388
* WITHOUT ANY BARRIERS
2390
logical FATR ga_update55_ghosts_(Integer *g_a)
2392
Integer idx, i, np, handle=GA_OFFSET + *g_a, proc_rem;
2393
Integer size, ndim, nwidth, increment[MAXDIM];
2394
Integer width[MAXDIM];
2395
Integer dims[MAXDIM];
2396
Integer lo_loc[MAXDIM], hi_loc[MAXDIM];
2397
Integer plo_loc[MAXDIM], phi_loc[MAXDIM];
2398
Integer tlo_rem[MAXDIM], thi_rem[MAXDIM];
2399
Integer slo_rem[MAXDIM], shi_rem[MAXDIM];
2400
Integer plo_rem[MAXDIM], phi_rem[MAXDIM];
2401
Integer ld_loc[MAXDIM], ld_rem[MAXDIM];
2402
int stride_loc[MAXDIM], stride_rem[MAXDIM],count[MAXDIM];
2404
char *ptr_loc, *ptr_rem;
2405
Integer me = ga_nodeid_();
2408
/* This routine makes use of the shift algorithm to update data in the
2409
* ghost cells bounding the local block of visible data. The shift
2410
* algorithm starts by updating the blocks of data along the first
2411
* dimension by grabbing a block of data that is width[0] deep but
2412
* otherwise matches the dimensions of the data residing on the
2413
* calling processor. The update of the second dimension, however,
2414
* grabs a block that is width[1] deep in the second dimension but is
2415
* ldim0 + 2*width[0] in the first dimensions where ldim0 is the
2416
* size of the visible data along the first dimension. The remaining
2417
* dimensions are left the same. For the next update, the width of the
2418
* second dimension is also increased by 2*width[1] and so on. This
2419
* algorith makes use of the fact that data for the dimensions that
2420
* have already been updated is available on each processor and can be
2421
* used in the updates of subsequent dimensions. The total number of
2422
* separate updates is 2*ndim, an update in the negative and positive
2423
* directions for each dimension.
2425
* This operation is implemented using put calls to place the
2426
* appropriate data on remote processors. To signal the remote
2427
* processor that it has received the data, a second put call
2428
* consisting of a single integer is sent after the first put call and
2429
* used to update a signal buffer on the remote processor. Each
2430
* processor can determine how much data it has received by checking
2431
* its signal buffer.
2433
* To perform the update, this routine makes use of several copies of
2434
* indices marking the upper and lower limits of data. Indices
2435
* beginning with the character "p" are relative indices marking the
2436
* location of the data set relative to the origin the local patch of
2437
* the global array, all other indices are in absolute coordinates and
2438
* mark locations in the total global array. The indices used by this
2439
* routine are described below.
2441
* lo_loc[], hi_loc[]: The lower and upper indices of the visible
2442
* block of data held by the calling processor.
2444
* lo_rem[], hi_rem[]: The lower and upper indices of the block
2445
* of data on a remote processor or processors that is needed to
2446
* fill in the calling processors ghost cells. These indices are
2447
* NOT corrected for wrap-around (periodic) boundary conditions
2448
* so they can be negative or greater than the array dimension
2449
* values held in dims[].
2451
* slo_rem[], shi_rem[]: Similar to lo_rem[] and hi_rem[], except
2452
* that these indices have been corrected for wrap-around
2453
* boundary conditions.
2455
* thi_rem[], thi_rem[]: The lower and upper indices of the visible
2456
* data on a remote processor.
2458
* plo_loc[], phi_loc[]: The indices of the local data patch that
2459
* is going to be updated.
2461
* plo_rem[], phi_rem[]: The indices of the data patch on the
2462
* remote processor that will be used to update the data on the
2463
* calling processor. Note that the dimensions of the patches
2464
* represented by plo_loc[], plo_rem[] and plo_loc[], phi_loc[]
2468
/* if global array has no ghost cells, just return */
2469
if (!ga_has_ghosts_(g_a)) return TRUE;
2471
size = GA[handle].elemsize;
2472
ndim = GA[handle].ndim;
2473
p_handle = GA[handle].p_handle;
2475
/* initialize range increments and get array dimensions */
2476
for (idx=0; idx < ndim; idx++) {
2478
width[idx] = (Integer)GA[handle].width[idx];
2479
dims[idx] = (Integer)GA[handle].dims[idx];
2480
if (lo_loc[idx] == 0 && hi_loc[idx] == -1) return FALSE;
2483
/* Check to make sure that global array is well-behaved (all processors
2484
have data and the width of the data in each dimension is greater
2485
than the corresponding value in width[]. */
2486
if (!gai_check_ghost_distr(g_a)) return FALSE;
2488
GA_PUSH_NAME("ga_update55_ghosts");
2490
/* Get pointer to local memory */
2491
ptr_loc = GA[handle].ptr[GAme];
2492
/* obtain range of data that is held by local processor */
2493
nga_distribution_(g_a,&me,lo_loc,hi_loc);
2495
/* loop over dimensions for sequential update using shift algorithm */
2497
(*GA_Update_Signal) = 1;
2498
for (idx=0; idx < ndim; idx++) {
2499
nwidth = width[idx];
2501
/* Do not bother with update if nwidth is zero */
2504
/* Perform update in negative direction. */
2505
get_remote_block_neg(idx, ndim, lo_loc, hi_loc, slo_rem, shi_rem,
2507
/* locate processor with this data */
2508
if (!nga_locate_region_(g_a, slo_rem, shi_rem, _ga_map,
2509
GA_proclist, &np)) ga_RegionError(ga_ndim_(g_a),
2510
slo_rem, shi_rem, *g_a);
2512
/* Get actual coordinates of desired location of remote
2513
data as well as the actual coordinates of the local chunk
2514
of data that will be sent to remote processor (these
2515
coordinates take into account the presence of ghost
2516
cells). Start by finding out what data is actually held by
2517
remote processor. */
2518
proc_rem = GA_proclist[0];
2519
nga_distribution_(g_a, &proc_rem, tlo_rem, thi_rem);
2520
for (i = 0; i < ndim; i++) {
2521
if (increment[i] == 0) {
2523
plo_rem[i] = thi_rem[i] - tlo_rem[i] + width[i] + 1;
2524
phi_rem[i] = thi_rem[i] - tlo_rem[i] + 2*width[i];
2525
plo_loc[i] = width[i];
2526
phi_loc[i] = 2*width[i] - 1;
2528
plo_rem[i] = width[i];
2529
phi_rem[i] = thi_rem[i] - tlo_rem[i] + width[i];
2530
plo_loc[i] = width[i];
2531
phi_loc[i] = hi_loc[i] - lo_loc[i] + width[i];
2535
phi_rem[i] = thi_rem[i] - tlo_rem[i] + increment[i];
2537
phi_loc[i] = hi_loc[i] - lo_loc[i] + increment[i];
2541
/* Get pointer to local data buffer and remote data
2542
buffer as well as lists of leading dimenstions */
2543
gam_LocationWithGhosts(me, handle, plo_loc, &ptr_loc, ld_loc);
2544
gam_LocationWithGhosts(proc_rem, handle, plo_rem, &ptr_rem, ld_rem);
2546
/* Evaluate strides on local and remote processors */
2547
gam_setstride(ndim, size, ld_loc, ld_rem, stride_rem,
2550
/* Compute the number of elements in each dimension and store
2551
result in count. Scale the first element in count by the
2553
gam_ComputeCount(ndim, plo_rem, phi_rem, count);
2556
/* Put local data on remote processor */
2557
if (p_handle >= 0) {
2558
proc_rem = PGRP_LIST[p_handle].inv_map_proc_list[proc_rem];
2561
ARMCI_PutS(ptr_loc, stride_loc, ptr_rem, stride_rem, count, ndim- 1, proc_rem);
2562
/* Send signal to remote processor that data transfer has been completed. */
2563
bytes = sizeof(int);
2564
ARMCI_Put(GA_Update_Signal, GA_Update_Flags[proc_rem]+msgcnt, bytes, proc_rem);
2566
ARMCI_PutS_flag(ptr_loc, stride_loc, ptr_rem, stride_rem, count,
2567
(int)(ndim - 1), GA_Update_Flags[proc_rem]+msgcnt,
2568
*GA_Update_Signal, (int)proc_rem);
2572
/* Perform update in positive direction. */
2573
get_remote_block_pos(idx, ndim, lo_loc, hi_loc, slo_rem, shi_rem,
2575
/* locate processor with this data */
2576
if (!nga_locate_region_(g_a, slo_rem, shi_rem, _ga_map,
2577
GA_proclist, &np)) ga_RegionError(ga_ndim_(g_a),
2578
slo_rem, shi_rem, *g_a);
2580
/* Get actual coordinates of desired chunk of remote
2581
data as well as the actual coordinates of the local chunk
2582
of data that will receive the remote data (these
2583
coordinates take into account the presence of ghost
2584
cells). Start by finding out what data is actually held by
2585
remote processor. */
2586
proc_rem = GA_proclist[0];
2587
nga_distribution_(g_a, &proc_rem, tlo_rem, thi_rem);
2588
for (i = 0; i < ndim; i++) {
2589
if (increment[i] == 0) {
2592
phi_rem[i] = width[i] - 1;
2593
plo_loc[i] = hi_loc[i] - lo_loc[i] + width[i] - 1;
2594
phi_loc[i] = hi_loc[i] - lo_loc[i] + 2*width[i] - 1;
2596
plo_rem[i] = width[i];
2597
phi_rem[i] = thi_rem[i] - tlo_rem[i] + width[i];
2598
plo_loc[i] = width[i];
2599
phi_loc[i] = hi_loc[i] - lo_loc[i] + width[i];
2603
phi_rem[i] = thi_rem[i] - tlo_rem[i] + increment[i];
2605
phi_loc[i] = hi_loc[i] - lo_loc[i] + increment[i];
2609
/* Get pointer to local data buffer and remote data
2610
buffer as well as lists of leading dimenstions */
2611
gam_LocationWithGhosts(GAme, handle, plo_loc, &ptr_loc, ld_loc);
2612
gam_LocationWithGhosts(proc_rem, handle, plo_rem, &ptr_rem, ld_rem);
2614
/* Evaluate strides on local and remote processors */
2615
gam_setstride(ndim, size, ld_loc, ld_rem, stride_rem,
2618
/* Compute the number of elements in each dimension and store
2619
result in count. Scale the first element in count by the
2621
gam_ComputeCount(ndim, plo_rem, phi_rem, count);
2624
/* Put local data on remote processor */
2625
if (p_handle >= 0) {
2626
proc_rem = PGRP_LIST[p_handle].inv_map_proc_list[proc_rem];
2629
ARMCI_PutS(ptr_loc, stride_loc, ptr_rem, stride_rem, count, ndim- 1, proc_rem);
2630
/* Send signal to remote processor that data transfer has been completed. */
2631
bytes = sizeof(int);
2632
ARMCI_Put(GA_Update_Signal, GA_Update_Flags[proc_rem]+msgcnt, bytes, proc_rem);
2635
ARMCI_PutS_flag(ptr_loc, stride_loc, ptr_rem, stride_rem, count,
2636
(int)(ndim - 1), GA_Update_Flags[proc_rem]+msgcnt,
2637
*GA_Update_Signal, (int)proc_rem);
2641
/* check to make sure that all messages have been recieved before
2642
starting update along new dimension */
2643
waitforflags((GA_Update_Flags[GAme]+msgcnt-2),
2644
(GA_Update_Flags[GAme]+msgcnt-1));
2645
/* update increment array */
2646
increment[idx] = 2*nwidth;
2649
/* set GA_Update_Flags array to zero for next update operation. */
2650
for (idx=0; idx < 2*ndim; idx++) {
2651
GA_Update_Flags[GAme][idx] = 0;
2658
/*
 * nga_update_ghost_dir_: collective update of the ghost cells of *g_a along
 * a single side: dimension *pdim, direction *pdir (+/-1), optionally
 * including corner cells (*pflag). Builds a mask[] describing which ghost
 * block to fill (0 = interior, -1/+1 = low/high side, with periodic
 * wrap-around at array boundaries), locates the owning remote processor via
 * nga_locate_region_, and pulls the data with a strided one-sided ARMCI_GetS.
 * Honors the _ga_sync_begin/_ga_sync_end masking convention (ga_sync_ at
 * entry/exit when enabled).
 *
 * NOTE(review): damaged extraction — interleaved original line numbers with
 * gaps (e.g. 2674, 2678-2679, 2687-2688, 2711-2713, 2729-2730, 2737-2744)
 * show dropped lines: the initializations/increments of `ipx` in the
 * well-behaved check, `ntot`'s starting value, the mask-decoding arithmetic
 * (itmp/idim/idir use), several `} else {` lines, closing braces,
 * GA_POP_NAME and the final return are all missing; restore from the
 * pristine ghosts.c before compiling.
 * NOTE(review): `p_handle` is assigned but its declaration is not visible in
 * this extraction — verify.
 */
/*\ UPDATE GHOST CELLS OF GLOBAL ARRAY ALONG ONE SIDE OF ARRAY
2660
logical nga_update_ghost_dir_(Integer *g_a, /* GA handle */
2661
Integer *pdim, /* Dimension of update */
2662
Integer *pdir, /* Direction of update (+/-1) */
2663
logical *pflag) /* include corner cells */
2665
Integer idx, ipx, inx, np, handle=GA_OFFSET + *g_a, proc_rem;
2666
Integer ntot, mask[MAXDIM],lmask[MAXDIM];
2667
Integer size, ndim, i, itmp, idim, idir;
2668
Integer width[MAXDIM], dims[MAXDIM];
2669
Integer lo_loc[MAXDIM], hi_loc[MAXDIM];
2670
Integer plo_loc[MAXDIM], phi_loc[MAXDIM];
2671
Integer tlo_rem[MAXDIM], thi_rem[MAXDIM];
2672
Integer plo_rem[MAXDIM], phi_rem[MAXDIM];
2673
Integer ld_loc[MAXDIM], ld_rem[MAXDIM];
2675
int stride_loc[MAXDIM], stride_rem[MAXDIM],count[MAXDIM];
2676
char *ptr_loc, *ptr_rem;
2677
Integer me = ga_nodeid_();
2680
int local_sync_begin,local_sync_end;
2682
local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
2683
_ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
2685
/* if global array has no ghost cells, just return */
2686
if (!ga_has_ghosts_(g_a))
2689
if(local_sync_begin)ga_sync_();
2694
size = GA[handle].elemsize;
2695
ndim = GA[handle].ndim;
2696
p_handle = GA[handle].p_handle;
2697
/* initialize ghost cell widths and get array dimensions */
2698
for (idx=0; idx < ndim; idx++) {
2699
width[idx] = (Integer)GA[handle].width[idx];
2700
dims[idx] = (Integer)GA[handle].dims[idx];
2703
/* Check to make sure that global array is well-behaved (all processors
2704
have data and the width of the data in each dimension is greater than
2705
the corresponding value in width[]). */
2707
for (idx = 0; idx < ndim; idx++) {
2708
for (np = 0; np < GA[handle].nblock[idx]; np++) {
2709
if (np < GA[handle].nblock[idx] - 1) {
2710
if (GA[handle].mapc[ipx+1]-GA[handle].mapc[ipx]+1<width[idx]) {
2714
if (GA[handle].dims[idx]-GA[handle].mapc[ipx]+1<width[idx]) {
2722
GA_PUSH_NAME("nga_update_ghost_dir");
2723
/* Get pointer to local memory */
2724
ptr_loc = GA[handle].ptr[GAme];
2725
/* obtain range of data that is held by local processor */
2726
nga_distribution_(g_a,&me,lo_loc,hi_loc);
2728
/* evaluate total number of GET operations */
2731
for (idx=0; idx < ndim-1; idx++) ntot *= 3;
2734
/* Loop over all GET operations. */
2735
for (ipx=0; ipx < ntot; ipx++) {
2736
/* Convert ipx to corresponding mask values */
2739
for (idx = 0; idx < ndim-1; idx++) {
2745
for (idx = 0; idx < ndim-1; idx++) lmask[idx] = 0;
2748
for (idx = 0; idx < ndim; idx++) {
2749
if (idx == idim-1) {
2752
mask[idx] = lmask[inx];
2756
/* Now that mask has been determined, find processor that contains
2757
* data needed by the corresponding block of ghost cells */
2758
for (idx = 0; idx < ndim; idx++) {
2759
if (mask[idx] == 0) {
2760
tlo_rem[idx] = lo_loc[idx];
2761
thi_rem[idx] = hi_loc[idx];
2762
} else if (mask[idx] == -1) {
2763
if (lo_loc[idx] > 1) {
2764
tlo_rem[idx] = lo_loc[idx]-width[idx];
2765
thi_rem[idx] = lo_loc[idx]-1;
2767
tlo_rem[idx] = dims[idx]-width[idx]+1;
2768
thi_rem[idx] = dims[idx];
2770
} else if (mask[idx] == 1) {
2771
if (hi_loc[idx] < dims[idx]) {
2772
tlo_rem[idx] = hi_loc[idx] + 1;
2773
thi_rem[idx] = hi_loc[idx] + width[idx];
2776
thi_rem[idx] = width[idx];
2779
fprintf(stderr,"Illegal mask value found\n");
2782
/* Locate remote processor to which data must be sent */
2783
if (!nga_locate_region_(g_a, tlo_rem, thi_rem, _ga_map,
2784
GA_proclist, &np)) ga_RegionError(ga_ndim_(g_a),
2785
tlo_rem, thi_rem, *g_a);
2787
fprintf(stderr,"More than one remote processor found\n");
2789
/* Remote processor has been identified, now get ready to get
2790
data from it. Start by getting distribution on remote
2792
proc_rem = GA_proclist[0];
2793
nga_distribution_(g_a, &proc_rem, tlo_rem, thi_rem);
2794
for (idx = 0; idx < ndim; idx++) {
2795
if (mask[idx] == 0) {
2796
plo_loc[idx] = width[idx];
2797
phi_loc[idx] = hi_loc[idx]-lo_loc[idx]+width[idx];
2798
plo_rem[idx] = plo_loc[idx];
2799
phi_rem[idx] = phi_loc[idx];
2800
} else if (mask[idx] == -1) {
2802
phi_loc[idx] = width[idx]-1;
2803
plo_rem[idx] = thi_rem[idx]-tlo_rem[idx]+1;
2804
phi_rem[idx] = thi_rem[idx]-tlo_rem[idx]+width[idx];
2805
} else if (mask[idx] == 1) {
2806
plo_loc[idx] = hi_loc[idx]-lo_loc[idx]+width[idx]+1;
2807
phi_loc[idx] = hi_loc[idx]-lo_loc[idx]+2*width[idx];
2808
plo_rem[idx] = width[idx];
2809
phi_rem[idx] = 2*width[idx]-1;
2812
/* Get pointer to local data buffer and remote data
2813
buffer as well as lists of leading dimenstions */
2814
gam_LocationWithGhosts(me, handle, plo_loc, &ptr_loc, ld_loc);
2815
gam_LocationWithGhosts(proc_rem, handle, plo_rem, &ptr_rem, ld_rem);
2817
/* Evaluate strides on local and remote processors */
2818
gam_setstride(ndim, size, ld_loc, ld_rem, stride_rem,
2821
/* Compute the number of elements in each dimension and store
2822
result in count. Scale the first element in count by the
2824
gam_ComputeCount(ndim, plo_loc, phi_loc, count);
2827
/* get data from remote processor */
2828
if (p_handle >= 0) {
2829
proc_rem = PGRP_LIST[p_handle].inv_map_proc_list[proc_rem];
2831
ARMCI_GetS(ptr_rem, stride_rem, ptr_loc, stride_loc, count,
2832
(int)(ndim - 1), (int)proc_rem);
2836
if(local_sync_end)ga_sync_();
2840
/*uncomment for using message passing sendrecv in north south direction */
2841
/*#define USE_MP_NORTHSOUTH */
2844
/*\ UPDATE GHOST CELLS OF GLOBAL ARRAY USING PUT CALLS WITHOUT CORNERS AND
2845
* WITHOUT ANY BARRIERS
2847
/* Update ghost cells of global array *g_a using one-sided put calls, without
 * corner data and without any barriers.  Per-dimension transfer parameters
 * (local/remote pointers, strides, counts, remote process id) are read from
 * GA[handle].cache, which is filled in by ga_set_update5_info_().  Returns
 * TRUE on success; FALSE if the array has no cached parameters or fails the
 * ghost-distribution check.
 * NOTE(review): this chunk is a lossy extraction -- the interleaved sequence
 * numbers have gaps, so lines (closing braces, msgcnt bookkeeping, #endif's)
 * are missing.  Verify against the upstream ghosts.c before relying on it. */
logical ga_update5_ghosts_(Integer *g_a)
2849
Integer idx, ipx, inx, i, np, handle=GA_OFFSET + *g_a;
2850
Integer size, ndim, nwidth;
2851
Integer width[MAXDIM];
2852
Integer dims[MAXDIM];
2853
Integer* proc_rem_ptr;
2854
int *stride_loc, *stride_rem,*count;
2855
int msgcnt, bytes, corner_flag, proc_rem;
2856
char *ptr_loc, *ptr_rem,*cache;
2857
int local_sync_begin,local_sync_end;
2858
Integer me = ga_nodeid_();
2860
#ifdef USE_MP_NORTHSOUTH
2861
char send_name[32], rcv_name[32];
2862
void *snd_ptr, *rcv_ptr;
2864
/* This routine makes use of the shift algorithm to update data in the
2865
* ghost cells bounding the local block of visible data. The shift
2866
* algorithm starts by updating the blocks of data along the first
2867
* dimension by grabbing a block of data that is width[0] deep but
2868
* otherwise matches the dimensions of the data residing on the
2869
* calling processor. The update of the second dimension, however,
2870
* grabs a block that is width[1] deep in the second dimension but is
2871
* ldim0 + 2*width[0] in the first dimensions where ldim0 is the
2872
* size of the visible data along the first dimension. The remaining
2873
* dimensions are left the same. For the next update, the width of the
2874
* second dimension is also increased by 2*width[1] and so on. This
2875
* algorithm makes use of the fact that data for the dimensions that
2876
* have already been updated is available on each processor and can be
2877
* used in the updates of subsequent dimensions. The total number of
2878
* separate updates is 2*ndim, an update in the negative and positive
2879
* directions for each dimension.
2881
* This operation is implemented using put calls to place the
2882
* appropriate data on remote processors. To signal the remote
2883
* processor that it has received the data, a second put call
2884
* consisting of a single integer is sent after the first put call and
2885
* used to update a signal buffer on the remote processor. Each
2886
* processor can determine how much data it has received by checking
2887
* its signal buffer.
2890
local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
2891
_ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
2892
if(local_sync_begin)ga_sync_();
2894
#ifdef USE_MP_NORTHSOUTH
2895
strcpy(send_name,"send_buffer");
2896
strcpy(rcv_name,"receive_buffer");
2898
snd_ptr = ga_malloc(buflen, GA[handle].type, send_name);
2899
rcv_ptr = ga_malloc(buflen, GA[handle].type, rcv_name);
2902
/* transfer parameters for every dimension/direction were precomputed
 * and packed into this cache by ga_set_update5_info_() */
cache = (char *)GA[handle].cache;
2903
/* if global array has no ghost cells, just return */
2904
if (!ga_has_ghosts_(g_a)) return TRUE;
2906
size = GA[handle].elemsize;
2907
ndim = GA[handle].ndim;
2908
p_handle = GA[handle].p_handle;
2909
for (i=0; i<ndim; i++) {
2910
width[i] = (Integer)GA[handle].width[i];
2913
if (!gai_check_ghost_distr(g_a)) return FALSE;
2915
GA_PUSH_NAME("ga_update5_ghosts_");
2917
/* loop over dimensions for sequential update using shift algorithm */
2919
corner_flag = GA[handle].corner_flag;
2920
(*GA_Update_Signal) = 1;
2921
for (idx=0; idx < ndim; idx++) {
2922
nwidth = width[idx];
2925
/* Perform update in negative direction. */
2926
/* unpack the cached transfer parameters for this direction; the cache
 * layout (ptr_rem, ptr_loc, strides, count, proc id) mirrors the
 * packing order used in ga_set_update5_info_() */
ptr_rem = *(char **)(cache);
2927
if(ptr_rem==NULL) return FALSE;
2928
ptr_loc = *(char **)(cache+sizeof(char *));
2929
stride_loc = (int *)(cache+2*sizeof(char *));
2930
stride_rem = (int *)(stride_loc+ndim);
2931
count = (int *)(stride_rem+ndim);
2932
proc_rem_ptr = (Integer *)(count+ndim);
2933
proc_rem = (int)(*proc_rem_ptr);
2934
cache = (char *)(proc_rem_ptr+1);
2936
if (p_handle >= 0) {
2937
proc_rem = PGRP_LIST[p_handle].inv_map_proc_list[proc_rem];
2939
if(count[0]>1000000){
2940
/*tries to use armci direct put when possible */
2941
ARMCI_PutS_flag(ptr_loc, stride_loc, ptr_rem, stride_rem, count,
2942
(int)(ndim - 1), GA_Update_Flags[proc_rem]+msgcnt,
2943
*GA_Update_Signal, proc_rem);
2946
#ifndef USE_MP_NORTHSOUTH
2947
ARMCI_PutS_flag(ptr_loc, stride_loc, ptr_rem, stride_rem, count,
2948
(int)(ndim - 1), GA_Update_Flags[proc_rem]+msgcnt,
2949
*GA_Update_Signal, proc_rem);
2956
/* Perform update in positive direction. */
2957
ptr_rem = *(char **)(cache);
2958
ptr_loc = *(char **)(cache+sizeof(char *));
2959
stride_loc = (int *)(cache+2*sizeof(char *));
2960
stride_rem = (int *)(stride_loc+ndim);
2961
count = (int *)(stride_rem+ndim);
2962
proc_rem_ptr = (Integer *)(count+ndim);
2963
proc_rem = (int)(*proc_rem_ptr);
2964
cache = (char *)(proc_rem_ptr+1);
2966
if (p_handle >= 0) {
2967
proc_rem = PGRP_LIST[p_handle].inv_map_proc_list[proc_rem];
2969
if(count[0]>1000000){
2970
/*tries to use armci direct put when possible */
2971
ARMCI_PutS_flag(ptr_loc, stride_loc, ptr_rem, stride_rem, count,
2972
(int)(ndim - 1), GA_Update_Flags[proc_rem]+msgcnt,
2973
*GA_Update_Signal, proc_rem);
2976
#ifndef USE_MP_NORTHSOUTH
2977
ARMCI_PutS_flag(ptr_loc, stride_loc, ptr_rem, stride_rem, count,
2978
(int)(ndim - 1), GA_Update_Flags[proc_rem]+msgcnt,
2979
*GA_Update_Signal, proc_rem);
2988
/* check to make sure that last two messages have been received
2989
before starting update along a new dimension */
2990
waitforflags((GA_Update_Flags[GAme]+msgcnt-2),
2991
(GA_Update_Flags[GAme]+msgcnt-1));
2992
GA_Update_Flags[GAme][msgcnt-1]=0;
2993
GA_Update_Flags[GAme][msgcnt-2]=0;
2999
/* check to make sure that all messages have been received */
3001
waitforflags((GA_Update_Flags[GAme]+msgcnt-1),
3002
(GA_Update_Flags[GAme]+msgcnt-2));
3003
GA_Update_Flags[GAme][msgcnt-1]=0;
3004
GA_Update_Flags[GAme][msgcnt-2]=0;
3013
/*#define UPDATE_SAMENODE_GHOSTS_FIRST*/
3015
/* Precompute and cache the transfer parameters used by ga_update5_ghosts_():
 * for each dimension, and for both the negative and positive shift
 * directions, pack the local/remote buffer pointers, strides, element counts
 * and remote process id into GA[handle].cache.  Returns TRUE on success (or
 * trivially if the array has no ghost cells); FALSE if the ghost
 * distribution check fails or a participating processor holds no data.
 * NOTE(review): lossy extraction -- gaps in the interleaved sequence numbers
 * show missing lines (e.g. the declarations of i/p_handle/proc_rem, several
 * braces and #endif's).  Verify against upstream ghosts.c. */
logical ga_set_update5_info_(Integer *g_a)
3019
Integer size, ndim, nwidth, increment[MAXDIM],np;
3020
Integer width[MAXDIM];
3021
Integer dims[MAXDIM];
3022
Integer lo_loc[MAXDIM], hi_loc[MAXDIM];
3023
Integer plo_loc[MAXDIM], phi_loc[MAXDIM];
3024
Integer tlo_rem[MAXDIM], thi_rem[MAXDIM];
3025
Integer slo_rem[MAXDIM], shi_rem[MAXDIM];
3026
Integer plo_rem[MAXDIM], phi_rem[MAXDIM];
3027
Integer ld_loc[MAXDIM], ld_rem[MAXDIM];
3028
int *stride_loc, *stride_rem,*count;
3029
int idx, corner_flag;
3030
char **ptr_loc, **ptr_rem,*cache;
3031
Integer handle = GA_OFFSET + *g_a;
3032
int scope,cache_size;
3033
Integer me = ga_nodeid_();
3036
/* This routine sets up the arrays that are used to transfer data
3037
* using the update5 algorithm. The arrays beginning with the character
3038
* "p" represent relative indices marking the location of the data set
3039
* relative to the origin the local patch of the global array, all
3040
* other indices are in absolute coordinates and mark locations in the
3041
* total global array. The indices used by this routine are described
3044
* lo_loc[], hi_loc[]: The lower and upper indices of the visible
3045
* block of data held by the calling processor.
3047
* lo_rem[], hi_rem[]: The lower and upper indices of the block
3048
* of data on a remote processor or processors that is needed to
3049
* fill in the calling processors ghost cells. These indices are
3050
* NOT corrected for wrap-around (periodic) boundary conditions
3051
* so they can be negative or greater than the array dimension
3052
* values held in dims[].
3054
* slo_rem[], shi_rem[]: Similar to lo_rem[] and hi_rem[], except
3055
* that these indices have been corrected for wrap-around
3056
* boundary conditions.
3058
* tlo_rem[], thi_rem[]: The lower and upper indices of the visible
3059
* data on a remote processor.
3061
* plo_loc[], phi_loc[]: The indices of the local data patch that
3062
* is going to be updated.
3064
* plo_rem[], phi_rem[]: The indices of the data patch on the
3065
* remote processor that will be used to update the data on the
3066
* calling processor. Note that the dimensions of the patches
3067
* represented by plo_rem[], phi_rem[] and plo_loc[], phi_loc[]
3071
/* if global array has no ghost cells, just return */
3072
if (!ga_has_ghosts_(g_a)) return TRUE;
3074
/* Check to make sure that global array is well-behaved (all processors
3075
have data and the width of the data in each dimension is greater
3076
than the corresponding value in width[]. */
3077
if (!gai_check_ghost_distr(g_a)) return FALSE;
3079
ndim = GA[handle].ndim;
3080
p_handle = GA[handle].p_handle;
3081
size = GA[handle].elemsize;
3082
/* one cache record holds two pointers, three int arrays and a process
 * id; round up to whole doubles and allocate two records (negative and
 * positive direction) per dimension */
cache_size = 2*sizeof(char *)+3*sizeof(int)+sizeof(Integer);
3083
cache_size = 2*ndim*((cache_size/sizeof(double)) + 1);
3084
GA[handle].cache = (double *)malloc(sizeof(double)*cache_size);
3085
cache = (char *)GA[handle].cache;
3086
corner_flag = GA[handle].corner_flag;
3088
nga_distribution_(g_a,&me,lo_loc,hi_loc);
3089
for (idx=0; idx < ndim; idx++) {
3091
width[idx] = (Integer)GA[handle].width[idx];
3092
dims[idx] = (Integer)GA[handle].dims[idx];
3093
/* lo==0 && hi==-1 is the "no data on this processor" convention */
if (lo_loc[idx] == 0 && hi_loc[idx] == -1){
3094
*(char **)cache = NULL;
3098
#ifdef UPDATE_SAMENODE_GHOSTS_FIRST
3099
for(scope=0;scope < 2; scope ++)
3101
for (idx=0; idx < ndim; idx++) {
3102
nwidth = width[idx];
3105
/* pack parameters for the negative-direction update */
ptr_rem = (char **)cache;
3106
ptr_loc = (char **)(cache+sizeof(char *));
3107
stride_loc = (int *)(cache+2*sizeof(char *));
3108
stride_rem = (int *)(stride_loc+ndim);
3109
count = (int *)(stride_rem+ndim);
3110
proc_rem = (Integer *)(count+ndim);
3112
get_remote_block_neg(idx, ndim, lo_loc, hi_loc, slo_rem, shi_rem,
3114
if (!nga_locate_region_(g_a, slo_rem, shi_rem, _ga_map,
3115
GA_proclist, &np)) ga_RegionError(ga_ndim_(g_a),
3116
slo_rem, shi_rem, *g_a);
3118
*proc_rem = (Integer)GA_proclist[0];
3119
if (p_handle >= 0) {
3120
*proc_rem = PGRP_LIST[p_handle].inv_map_proc_list[*proc_rem];
3123
#ifdef UPDATE_SAMENODE_GHOSTS_FIRST
3124
if(scope == 0 && ARMCI_Same_node(*proc_rem))
3128
cache = (char *)(proc_rem+1);
3130
nga_distribution_(g_a, proc_rem, tlo_rem, thi_rem);
3133
for (i = 0; i < ndim; i++) {
3134
if (increment[i] == 0) {
3136
plo_rem[i] = thi_rem[i] - tlo_rem[i] + width[i] + 1;
3137
phi_rem[i] = thi_rem[i] - tlo_rem[i] + 2*width[i];
3138
plo_loc[i] = width[i];
3139
phi_loc[i] = 2*width[i] - 1;
3141
plo_rem[i] = width[i];
3142
phi_rem[i] = thi_rem[i] - tlo_rem[i] + width[i];
3143
plo_loc[i] = width[i];
3144
phi_loc[i] = hi_loc[i] - lo_loc[i] + width[i];
3148
phi_rem[i] = thi_rem[i] - tlo_rem[i] + increment[i];
3150
phi_loc[i] = hi_loc[i] - lo_loc[i] + increment[i];
3153
gam_LocationWithGhosts(me, handle, plo_loc, ptr_loc, ld_loc);
3154
gam_LocationWithGhosts(*proc_rem, handle, plo_rem, ptr_rem, ld_rem);
3156
/* Evaluate strides on local and remote processors */
3157
gam_setstride(ndim, size, ld_loc, ld_rem, stride_rem,
3159
gam_ComputeCount(ndim, plo_rem, phi_rem, count);
3161
if (p_handle >= 0) {
3162
*proc_rem = PGRP_LIST[p_handle].inv_map_proc_list[*proc_rem];
3167
/*BJP proc_rem++; */
3168
/* pack parameters for the positive-direction update */
ptr_rem = (char **)cache;
3169
ptr_loc = (char **)(cache+sizeof(char *));
3170
stride_loc = (int *)(cache+2*sizeof(char *));
3171
stride_rem = (int *)(stride_loc+ndim);
3172
count = (int *)(stride_rem+ndim);
3173
proc_rem = (Integer *)(count+ndim);
3175
get_remote_block_pos(idx, ndim, lo_loc, hi_loc, slo_rem, shi_rem,
3178
if (!nga_locate_region_(g_a, slo_rem, shi_rem, _ga_map,
3179
GA_proclist, &np)) ga_RegionError(ga_ndim_(g_a),
3180
slo_rem, shi_rem, *g_a);
3182
*proc_rem = (Integer)GA_proclist[0];
3183
if (p_handle >= 0) {
3184
*proc_rem = PGRP_LIST[p_handle].inv_map_proc_list[*proc_rem];
3187
#ifdef UPDATE_SAMENODE_GHOSTS_FIRST
3188
if(scope == 0 && ARMCI_Same_node(*proc_rem))
3192
cache = (char *)(proc_rem+1);
3194
nga_distribution_(g_a, proc_rem, tlo_rem, thi_rem);
3198
for (i = 0; i < ndim; i++) {
3199
if (increment[i] == 0) {
3202
phi_rem[i] = width[i] - 1;
3203
plo_loc[i] = hi_loc[i] - lo_loc[i] + width[i] - 1;
3204
phi_loc[i] = hi_loc[i] - lo_loc[i] + 2*width[i] - 1;
3206
plo_rem[i] = width[i];
3207
phi_rem[i] = thi_rem[i] - tlo_rem[i] + width[i];
3208
plo_loc[i] = width[i];
3209
phi_loc[i] = hi_loc[i] - lo_loc[i] + width[i];
3213
phi_rem[i] = thi_rem[i] - tlo_rem[i] + increment[i];
3215
phi_loc[i] = hi_loc[i] - lo_loc[i] + increment[i];
3220
gam_LocationWithGhosts(GAme, handle, plo_loc, ptr_loc, ld_loc);
3221
gam_LocationWithGhosts(*proc_rem, handle, plo_rem, ptr_rem, ld_rem);
3223
gam_setstride(ndim, size, ld_loc, ld_rem, stride_rem,
3226
gam_ComputeCount(ndim, plo_rem, phi_rem, count);
3228
if (p_handle >= 0) {
3229
*proc_rem = PGRP_LIST[p_handle].inv_map_proc_list[*proc_rem];
3233
/* widen subsequent transfers along this dimension to include the
 * ghost regions just set up */
increment[idx] = 2*nwidth;
3239
/*\ UPDATE GHOST CELLS OF GLOBAL ARRAY USING SHIFT ALGORITHM
3241
/* Top-level ghost-cell update: try the optimized put-based update5
 * algorithm first; if it reports failure fall back to update4, and finally
 * to the slow but robust update1.  Honors the _ga_sync_begin/_ga_sync_end
 * masking flags around the operation.
 * NOTE(review): lossy extraction -- braces and the tail of the wrapper
 * comment are missing here; verify against upstream ghosts.c. */
void ga_update_ghosts_(Integer *g_a)
3243
/* Wrapper program for ghost cell update operations. If optimized
3244
update operation fails then use slow but robust version of
3246
int local_sync_begin,local_sync_end;
3248
local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
3249
_ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
3250
if(local_sync_begin)ga_sync_();
3253
if (!ga_update5_ghosts_(g_a)) {
3255
if (!ga_update4_ghosts_(g_a)) {
3257
ga_update1_ghosts_(g_a);
3260
if(local_sync_end)ga_sync_();
3263
/* Utility function for ga_update6_ghosts routine */
3264
/* Utility for ga_update6_ghosts: busy-wait until, for each enabled flag
 * (flag1/flag2 nonzero), the corresponding memory location (ptr1/ptr2)
 * becomes nonzero -- i.e. until the expected remote put(s) have landed.
 * The exp() call gives the spin loop a side effect the compiler cannot
 * optimize away.
 * NOTE(review): lossy extraction -- the declarations of val/i and the
 * return statement are missing from this view; verify against upstream. */
double waitformixedflags (int flag1, int flag2, int *ptr1, int *ptr2) {
3267
while ((flag1 && gai_getval(ptr1) == 0) ||
3268
(flag2 && gai_getval(ptr2) == 0)) {
3269
val = exp(-(double)i++);
3274
/*\ UPDATE GHOST CELLS OF GLOBAL ARRAY USING SHIFT ALGORITHM AND
3277
/* Update ghost cells of *g_a using the shift algorithm with a mix of
 * explicit message passing (between processors on different nodes) and
 * flagged one-sided puts (between processors on the same node).  Allocates
 * temporary send/receive buffers sized to the largest per-dimension
 * transfer, performs the 2*ndim directional updates, then frees the buffers
 * and clears the update flags.  Returns TRUE on success, FALSE if the
 * ghost-distribution check fails.
 * NOTE(review): lossy extraction -- gaps in the interleaved sequence numbers
 * show missing lines throughout (braces, else branches, msgcnt updates,
 * GA_POP_NAME/return).  Verify against upstream ghosts.c before use. */
logical FATR ga_update6_ghosts_(Integer *g_a)
3279
Integer idx, idir, i, np, handle=GA_OFFSET + *g_a;
3280
Integer size, buflen, buftot, bufsize, ndim, increment[MAXDIM];
3281
Integer proc_rem_snd, proc_rem_rcv, pmax;
3282
Integer msgcnt, length, msglen;
3283
Integer width[MAXDIM], dims[MAXDIM], index[MAXDIM];
3284
Integer lo_loc[MAXDIM], hi_loc[MAXDIM];
3285
Integer plo_rem[MAXDIM], phi_rem[MAXDIM];
3286
Integer tlo_rem[MAXDIM], thi_rem[MAXDIM];
3287
Integer plo_snd[MAXDIM], phi_snd[MAXDIM];
3288
Integer lo_rcv[MAXDIM], hi_rcv[MAXDIM];
3289
Integer slo_rcv[MAXDIM], shi_rcv[MAXDIM];
3290
Integer plo_rcv[MAXDIM], phi_rcv[MAXDIM];
3291
Integer ld_loc[MAXDIM], ld_rem[MAXDIM];
3292
int stride_snd[MAXDIM], stride_rcv[MAXDIM],count[MAXDIM];
3293
int stride_rem[MAXDIM];
3294
int flag1, flag2, sprocflag, rprocflag;
3295
char *ptr_snd, *ptr_rcv;
3296
char *ptr_loc, *ptr_rem;
3297
char send_name[32], rcv_name[32];
3298
void *snd_ptr, *rcv_ptr, *snd_ptr_orig, *rcv_ptr_orig;
3299
Integer me = ga_nodeid_();
3300
Integer p_handle, wproc;
3302
/* This routine makes use of the shift algorithm to update data in the
3303
* ghost cells bounding the local block of visible data. The shift
3304
* algorithm starts by updating the blocks of data along the first
3305
* dimension by grabbing a block of data that is width[0] deep but
3306
* otherwise matches the dimensions of the data residing on the
3307
* calling processor. The update of the second dimension, however,
3308
* grabs a block that is width[1] deep in the second dimension but is
3309
* ldim0 + 2*width[0] in the first dimensions where ldim0 is the
3310
* size of the visible data along the first dimension. The remaining
3311
* dimensions are left the same. For the next update, the width of the
3312
* second dimension is also increased by 2*width[1] and so on. This
3313
* algorithm makes use of the fact that data for the dimensions that
3314
* have already been updated is available on each processor and can be
3315
* used in the updates of subsequent dimensions. The total number of
3316
* separate updates is 2*ndim, an update in the negative and positive
3317
* directions for each dimension.
3319
* This implementation makes use of a combination of explicit message
3320
* passing between processors on different nodes and shared memory
3321
* copies with an additional flag between processors on the same node
3322
* to perform the update. Separate message types for the messages and
3323
* the use of the additional flag are for the updates in each
3324
* coordinate direction are used to maintain synchronization locally
3325
* and to guarantee that the data is present before the updates in a
3326
* new coordinate direction take place.
3328
* To perform the update, this routine makes use of several copies of
3329
* indices marking the upper and lower limits of data. Indices
3330
* beginning with the character "p" are relative indices marking the
3331
* location of the data set relative to the origin the local patch of
3332
* the global array, all other indices are in absolute coordinates and
3333
* mark locations in the total global array. The indices used by this
3334
* routine are described below.
3336
* lo_loc[], hi_loc[]: The lower and upper indices of the visible
3337
* block of data held by the calling processor.
3339
* lo_rcv[], hi_rcv[]: The lower and upper indices of the blocks
3340
* of data that will be either sent to or received from a remote
3341
* processor. These indices are NOT corrected for wrap-around
3342
* (periodic) boundary conditions so they can be negative or greater
3343
* than the array dimension values held in dims[].
3345
* slo_rcv[], shi_rcv[]: Similar to lo_rcv[] and hi_rcv[], except
3346
* that these indices have been corrected for wrap-around
3347
* boundary conditions.
3349
* plo_rcv[], phi_rcv[]: The local indices of the local data patch
3350
* that receive that message from the remote processor.
3352
* plo_snd[], phi_snd[]: The local indices of the data patch
3353
* that will be sent to the remote processor. Note that the
3354
* dimensions of the patches represented by plo_rcv[], phi_rcv[] and
3355
* plo_snd[], phi_snd[] must be the same.
3357
* tlo_rem[], thi_rem[]: The indices of the locally held visible
3358
* portion of the global array on the remote processor that will be
3359
* receiving the data using a shared memory copy.
3361
* plo_rem[], phi_rem[]: The local indices of the coordinate patch
3362
* that will be put on the remote processor using a shared memory
3366
/* if global array has no ghost cells, just return */
3367
if (!ga_has_ghosts_(g_a)) return TRUE;
3369
size = GA[handle].elemsize;
3370
ndim = GA[handle].ndim;
3371
p_handle = GA[handle].p_handle;
3373
/* initialize range increments and get array dimensions */
3374
for (idx=0; idx < ndim; idx++) {
3376
width[idx] = (Integer)GA[handle].width[idx];
3377
dims[idx] = (Integer)GA[handle].dims[idx];
3380
/* Check to make sure that global array is well-behaved (all processors
3381
have data and the width of the data in each dimension is greater
3382
than the corresponding value in width[]. */
3383
if (!gai_check_ghost_distr(g_a)) return FALSE;
3385
GA_PUSH_NAME("ga_update6_ghosts");
3388
/* Get pointer to local memory */
3389
ptr_loc = GA[handle].ptr[me];
3390
/* obtain range of data that is held by local processor */
3391
nga_distribution_(g_a,&me,lo_loc,hi_loc);
3392
/* Get indices of processor in virtual grid */
3393
nga_proc_topology_(g_a, &me, index);
3395
/* Try to find maximum size of message that will be sent during
3396
* update operations and use this to allocate memory for message
3397
* passing buffers. */
3399
for (i=0; i<ndim; i++) {
3400
buftot *= (hi_loc[i]-lo_loc[i] + 1 + 2*width[i]);
3403
for (i = 0; i < ndim; i++) {
3404
idir = hi_loc[i] - lo_loc[i] + 1;
3405
if (buflen < (buftot/(idir + 2*width[i]))*width[i]) {
3406
buflen = (buftot/(idir + 2*width[i]))*width[i];
3409
bufsize = size*buflen;
3410
strcpy(send_name,"send_buffer");
3411
strcpy(rcv_name,"receive_buffer");
3412
/* keep the original pointers so the buffers can be freed at the end */
snd_ptr_orig = snd_ptr = ga_malloc(buflen, GA[handle].type, send_name);
3413
rcv_ptr_orig = rcv_ptr = ga_malloc(buflen, GA[handle].type, rcv_name);
3415
/* loop over dimensions for sequential update using shift algorithm */
3417
(*GA_Update_Signal) = 1;
3418
for (idx=0; idx < ndim; idx++) {
3420
/* Do not bother with update if nwidth is zero */
3421
if (width[idx] != 0) {
3423
/* Perform update in negative direction. */
3424
get_remote_block_neg(idx, ndim, lo_loc, hi_loc, slo_rcv, shi_rcv,
3426
/* locate processor with this data */
3427
if (!nga_locate_region_(g_a, slo_rcv, shi_rcv, _ga_map,
3428
GA_proclist, &np)) ga_RegionError(ga_ndim_(g_a),
3429
slo_rcv, shi_rcv, *g_a);
3430
/* find out if this processor is on the same node */
3431
wproc = GA_proclist[0];
3432
if (p_handle >= 0) {
3433
wproc = PGRP_LIST[p_handle].inv_map_proc_list[wproc];
3435
rprocflag = ARMCI_Same_node(wproc);
3436
proc_rem_snd = GA_proclist[0];
3438
/* Find processor from which data will be received */
3439
for (i = 0; i < ndim; i++) {
3441
lo_rcv[i] = hi_loc[i] + 1;
3442
hi_rcv[i] = hi_loc[i] + width[i];
3444
lo_rcv[i] = lo_loc[i];
3445
hi_rcv[i] = hi_loc[i];
3449
/* Account for boundaries, if necessary. */
3450
for (i=0; i<ndim; i++) {
3452
if (hi_rcv[i] > dims[i]) {
3454
shi_rcv[i] = width[i];
3456
slo_rcv[i] = lo_rcv[i];
3457
shi_rcv[i] = hi_rcv[i];
3460
slo_rcv[i] = lo_rcv[i];
3461
shi_rcv[i] = hi_rcv[i];
3464
/* locate processor with this data */
3465
if (!nga_locate_region_(g_a, slo_rcv, shi_rcv, _ga_map,
3466
GA_proclist, &np)) ga_RegionError(ga_ndim_(g_a),
3467
slo_rcv, shi_rcv, *g_a);
3468
wproc = GA_proclist[0];
3469
if (p_handle >= 0) {
3470
wproc = PGRP_LIST[p_handle].inv_map_proc_list[wproc];
3472
sprocflag = ARMCI_Same_node(wproc);
3473
proc_rem_rcv = GA_proclist[0];
3474
nga_distribution_(g_a, &proc_rem_rcv, tlo_rem, thi_rem);
3476
/* Get actual coordinates of chunk of data that will be sent to
3477
* remote processor as well as coordinates of the array space that
3478
* will receive data from remote processor. */
3479
for (i = 0; i < ndim; i++) {
3480
if (increment[i] == 0) {
3482
plo_snd[i] = width[i];
3483
phi_snd[i] = 2*width[i] - 1;
3484
plo_rcv[i] = hi_loc[i] - lo_loc[i] + width[i] + 1;
3485
phi_rcv[i] = hi_loc[i] - lo_loc[i] + 2*width[i];
3486
plo_rem[i] = thi_rem[i] - tlo_rem[i] + width[i] + 1;
3487
phi_rem[i] = thi_rem[i] - tlo_rem[i] + 2*width[i];
3489
plo_snd[i] = width[i];
3490
phi_snd[i] = hi_loc[i] - lo_loc[i] + width[i];
3491
plo_rcv[i] = width[i];
3492
phi_rcv[i] = hi_loc[i] - lo_loc[i] + width[i];
3493
plo_rem[i] = width[i];
3494
phi_rem[i] = thi_rem[i] - tlo_rem[i] + width[i];
3498
phi_snd[i] = hi_loc[i] - lo_loc[i] + increment[i];
3500
phi_rcv[i] = hi_loc[i] - lo_loc[i] + increment[i];
3502
phi_rem[i] = thi_rem[i] - tlo_rem[i] + increment[i];
3506
/* Get pointer to local data buffer and remote data
3507
buffer as well as lists of leading dimensions */
3508
gam_LocationWithGhosts(me, handle, plo_snd, &ptr_snd, ld_loc);
3509
gam_LocationWithGhosts(me, handle, plo_rcv, &ptr_rcv, ld_loc);
3510
gam_LocationWithGhosts(proc_rem_snd, handle, plo_rem, &ptr_rem, ld_rem);
3512
/* Evaluate strides for send and receive */
3513
gam_setstride(ndim, size, ld_loc, ld_loc, stride_rcv,
3515
gam_setstride(ndim, size, ld_loc, ld_rem, stride_rem,
3518
/* Compute the number of elements in each dimension and store
3519
result in count. Scale the first element in count by the
3521
gam_ComputeCount(ndim, plo_rcv, phi_rcv, count);
3522
gam_CountElems(ndim, plo_snd, phi_snd, &length);
3526
/* If we are sending data to another node, then use message passing */
3528
/* Fill send buffer with data. */
3529
armci_write_strided(ptr_snd, (int)ndim-1, stride_snd, count, snd_ptr);
3532
/* Send Messages. If processor has odd index in direction idx, it
3533
* sends message first, if processor has even index it receives
3534
* message first. Then process is reversed. Also need to account
3535
* for whether or not there are an odd number of processors along
3536
* update direction. */
3538
if (p_handle >= 0) {
3539
proc_rem_snd = PGRP_LIST[p_handle].inv_map_proc_list[proc_rem_snd];
3540
proc_rem_rcv = PGRP_LIST[p_handle].inv_map_proc_list[proc_rem_rcv];
3542
if (GA[handle].nblock[idx]%2 == 0) {
3543
if (index[idx]%2 != 0 && !rprocflag) {
3544
armci_msg_snd(msgcnt, snd_ptr, length, proc_rem_snd);
3545
} else if (index[idx]%2 == 0 && !sprocflag) {
3546
armci_msg_rcv(msgcnt, rcv_ptr, bufsize, &msglen, proc_rem_rcv);
3550
ARMCI_PutS(ptr_snd, stride_snd, ptr_rem, stride_rem, count, ndim- 1,
3552
/* Send signal to remote processor that data transfer has been completed. */
3553
bytes = sizeof(int);
3554
ARMCI_Put(GA_Update_Signal, GA_Update_Flags[proc_rem_snd]+msgcnt, bytes,
3557
ARMCI_PutS_flag(ptr_snd, stride_snd, ptr_rem, stride_rem, count,
3558
(int)(ndim-1), GA_Update_Flags[proc_rem_snd]+msgcnt,
3559
*GA_Update_Signal, (int)proc_rem_snd);
3562
if (index[idx]%2 != 0 && !sprocflag) {
3563
armci_msg_rcv(msgcnt, rcv_ptr, bufsize, &msglen, proc_rem_rcv);
3564
} else if (index[idx]%2 == 0 && !rprocflag) {
3565
armci_msg_snd(msgcnt, snd_ptr, length, proc_rem_snd);
3568
/* account for wrap-around boundary condition, if necessary */
3569
pmax = GA[handle].nblock[idx] - 1;
3570
if (index[idx]%2 != 0 && !rprocflag) {
3571
armci_msg_snd(msgcnt, snd_ptr, length, proc_rem_snd);
3572
} else if (index[idx]%2 == 0 && index[idx] != pmax && !sprocflag) {
3573
armci_msg_rcv(msgcnt, rcv_ptr, bufsize, &msglen, proc_rem_rcv);
3577
ARMCI_PutS(ptr_snd, stride_snd, ptr_rem, stride_rem, count, ndim- 1,
3579
/* Send signal to remote processor that data transfer has been completed. */
3580
bytes = sizeof(int);
3581
ARMCI_Put(GA_Update_Signal, GA_Update_Flags[proc_rem_snd]+msgcnt, bytes,
3584
ARMCI_PutS_flag(ptr_snd, stride_snd, ptr_rem, stride_rem, count,
3585
(int)(ndim-1), GA_Update_Flags[proc_rem_snd]+msgcnt,
3586
*GA_Update_Signal, (int)proc_rem_snd);
3589
if (index[idx]%2 != 0 && !sprocflag) {
3590
armci_msg_rcv(msgcnt, rcv_ptr, bufsize, &msglen, proc_rem_rcv);
3591
} else if (index[idx]%2 == 0 && index[idx] != 0 && !rprocflag) {
3592
armci_msg_snd(msgcnt, snd_ptr, length, proc_rem_snd);
3594
/* make up for odd processor at end of string */
3595
if (index[idx] == 0 && !rprocflag) {
3596
armci_msg_snd(msgcnt, snd_ptr, length, proc_rem_snd);
3598
if (index[idx] == pmax && !sprocflag) {
3599
armci_msg_rcv(msgcnt, rcv_ptr, bufsize, &msglen, proc_rem_rcv);
3608
/* copy data back into global array */
3610
armci_read_strided(ptr_rcv, (int)ndim-1, stride_rcv, count, rcv_ptr);
3613
/* Find parameters for message in positive direction. */
3614
get_remote_block_pos(idx, ndim, lo_loc, hi_loc, slo_rcv, shi_rcv,
3616
/* locate processor with this data */
3617
if (!nga_locate_region_(g_a, slo_rcv, shi_rcv, _ga_map,
3618
GA_proclist, &np)) ga_RegionError(ga_ndim_(g_a),
3619
slo_rcv, shi_rcv, *g_a);
3620
wproc = GA_proclist[0];
3621
if (p_handle >= 0) {
3622
wproc = PGRP_LIST[p_handle].inv_map_proc_list[wproc];
3624
rprocflag = ARMCI_Same_node(wproc);
3625
proc_rem_snd = GA_proclist[0];
3627
/* Find processor from which data will be received */
3628
for (i = 0; i < ndim; i++) {
3630
lo_rcv[i] = lo_loc[i] - width[i];
3631
hi_rcv[i] = lo_loc[i] - 1;
3633
lo_rcv[i] = lo_loc[i];
3634
hi_rcv[i] = hi_loc[i];
3638
/* Account for boundaries, if necessary. */
3639
for (i=0; i<ndim; i++) {
3641
if (hi_rcv[i] < 1) {
3642
slo_rcv[i] = dims[i] - width[i] + 1;
3643
shi_rcv[i] = dims[i];
3645
slo_rcv[i] = lo_rcv[i];
3646
shi_rcv[i] = hi_rcv[i];
3649
slo_rcv[i] = lo_rcv[i];
3650
shi_rcv[i] = hi_rcv[i];
3653
/* locate processor with this data */
3654
if (!nga_locate_region_(g_a, slo_rcv, shi_rcv, _ga_map,
3655
GA_proclist, &np)) ga_RegionError(ga_ndim_(g_a),
3656
slo_rcv, shi_rcv, *g_a);
3657
wproc = GA_proclist[0];
3658
if (p_handle >= 0) {
3659
wproc = PGRP_LIST[p_handle].inv_map_proc_list[wproc];
3661
sprocflag = ARMCI_Same_node(wproc);
3662
proc_rem_rcv = GA_proclist[0];
3663
nga_distribution_(g_a, &proc_rem_rcv, tlo_rem, thi_rem);
3665
/* Get actual coordinates of chunk of data that will be sent to
3666
* remote processor as well as coordinates of the array space that
3667
* will receive data from remote processor. */
3668
for (i = 0; i < ndim; i++) {
3669
if (increment[i] == 0) {
3671
plo_snd[i] = hi_loc[i] - lo_loc[i] + 1;
3672
phi_snd[i] = hi_loc[i] - lo_loc[i] + width[i];
3674
phi_rcv[i] = width[i] - 1;
3676
phi_rem[i] = width[i] - 1;
3678
plo_snd[i] = width[i];
3679
phi_snd[i] = hi_loc[i] - lo_loc[i] + width[i];
3680
plo_rcv[i] = width[i];
3681
phi_rcv[i] = hi_loc[i] - lo_loc[i] + width[i];
3682
plo_rem[i] = width[i];
3683
phi_rem[i] = thi_rem[i] - tlo_rem[i] + width[i];
3687
phi_snd[i] = hi_loc[i] - lo_loc[i] + increment[i];
3689
phi_rcv[i] = hi_loc[i] - lo_loc[i] + increment[i];
3691
phi_rem[i] = thi_rem[i] - tlo_rem[i] + increment[i];
3695
/* Get pointer to local data buffer and remote data
3696
buffer as well as lists of leading dimensions */
3697
gam_LocationWithGhosts(me, handle, plo_snd, &ptr_snd, ld_loc);
3698
gam_LocationWithGhosts(me, handle, plo_rcv, &ptr_rcv, ld_loc);
3699
gam_LocationWithGhosts(proc_rem_snd, handle, plo_rem, &ptr_rem, ld_rem);
3701
/* Evaluate strides for send and receive */
3702
gam_setstride(ndim, size, ld_loc, ld_loc, stride_rcv,
3704
gam_setstride(ndim, size, ld_loc, ld_rem, stride_rem,
3707
/* Compute the number of elements in each dimension and store
3708
result in count. Scale the first element in count by the
3710
gam_ComputeCount(ndim, plo_rcv, phi_rcv, count);
3711
gam_CountElems(ndim, plo_snd, phi_snd, &length);
3715
/* if we are sending data to another node, use message passing */
3717
/* Fill send buffer with data. */
3718
armci_write_strided(ptr_snd, (int)ndim-1, stride_snd, count, snd_ptr);
3721
/* Send Messages. If processor has odd index in direction idx, it
3722
* sends message first, if processor has even index it receives
3723
* message first. Then process is reversed. Also need to account
3724
* for whether or not there are an odd number of processors along
3725
* update direction. */
3727
if (p_handle >= 0) {
3728
proc_rem_snd = PGRP_LIST[p_handle].inv_map_proc_list[proc_rem_snd];
3729
proc_rem_rcv = PGRP_LIST[p_handle].inv_map_proc_list[proc_rem_rcv];
3731
if (GA[handle].nblock[idx]%2 == 0) {
3732
if (index[idx]%2 != 0 && !rprocflag) {
3733
armci_msg_snd(msgcnt, snd_ptr, length, proc_rem_snd);
3734
} else if (index[idx]%2 == 0 && !sprocflag) {
3735
armci_msg_rcv(msgcnt, rcv_ptr, bufsize, &msglen, proc_rem_rcv);
3739
ARMCI_PutS(ptr_snd, stride_snd, ptr_rem, stride_rem, count, ndim- 1,
3741
/* Send signal to remote processor that data transfer has been completed. */
3742
bytes = sizeof(int);
3743
ARMCI_Put(GA_Update_Signal, GA_Update_Flags[proc_rem_snd]+msgcnt, bytes,
3746
ARMCI_PutS_flag(ptr_snd, stride_snd, ptr_rem, stride_rem, count,
3747
(int)(ndim-1), GA_Update_Flags[proc_rem_snd]+msgcnt,
3748
*GA_Update_Signal, (int)proc_rem_snd);
3751
if (index[idx]%2 != 0 && !sprocflag) {
3752
armci_msg_rcv(msgcnt, rcv_ptr, bufsize, &msglen, proc_rem_rcv);
3753
} else if (index[idx]%2 == 0 && !rprocflag) {
3754
armci_msg_snd(msgcnt, snd_ptr, length, proc_rem_snd);
3757
/* account for wrap-around boundary condition, if necessary */
3758
pmax = GA[handle].nblock[idx] - 1;
3759
if (index[idx]%2 != 0 && !rprocflag) {
3760
armci_msg_snd(msgcnt, snd_ptr, length, proc_rem_snd);
3761
} else if (index[idx]%2 == 0 && index[idx] != 0 && !sprocflag) {
3762
armci_msg_rcv(msgcnt, rcv_ptr, bufsize, &msglen, proc_rem_rcv);
3766
ARMCI_PutS(ptr_snd, stride_snd, ptr_rem, stride_rem, count, ndim- 1,
3768
/* Send signal to remote processor that data transfer has been completed. */
3769
bytes = sizeof(int);
3770
ARMCI_Put(GA_Update_Signal, GA_Update_Flags[proc_rem_snd]+msgcnt, bytes,
3773
ARMCI_PutS_flag(ptr_snd, stride_snd, ptr_rem, stride_rem, count,
3774
(int)(ndim-1), GA_Update_Flags[proc_rem_snd]+msgcnt,
3775
*GA_Update_Signal, (int)proc_rem_snd);
3778
if (index[idx]%2 != 0 && !sprocflag) {
3779
armci_msg_rcv(msgcnt, rcv_ptr, bufsize, &msglen, proc_rem_rcv);
3780
} else if (index[idx]%2 == 0 && index[idx] != pmax && !rprocflag) {
3781
armci_msg_snd(msgcnt, snd_ptr, length, proc_rem_snd);
3783
/* make up for odd processor at end of string */
3784
if (index[idx] == pmax && !rprocflag) {
3785
armci_msg_snd(msgcnt, snd_ptr, length, proc_rem_snd);
3787
if (index[idx] == 0 && !sprocflag) {
3788
armci_msg_rcv(msgcnt, rcv_ptr, bufsize, &msglen, proc_rem_rcv);
3791
/* copy data back into global array */
3793
armci_read_strided(ptr_rcv, (int)ndim-1, stride_rcv, count, rcv_ptr);
3802
/* check to make sure any outstanding puts have showed up */
3803
waitformixedflags(flag1, flag2, GA_Update_Flags[GAme]+msgcnt-2,
3804
GA_Update_Flags[GAme]+msgcnt-1);
3805
/* update increment array */
3806
increment[idx] = 2*width[idx];
3809
ga_free(rcv_ptr_orig);
3810
ga_free(snd_ptr_orig);
3811
/* set update flags to zero for next operation */
3812
for (idx=0; idx < 2*ndim; idx++) {
3813
GA_Update_Flags[GAme][idx] = 0;
3820
/*\ UPDATE GHOST CELLS OF GLOBAL ARRAY USING GET CALLS
3822
logical FATR ga_update7_ghosts_(Integer *g_a)
3824
Integer idx, ipx, np, handle=GA_OFFSET + *g_a, proc_rem;
3825
Integer ntot, mask[MAXDIM];
3826
Integer size, ndim, i, itmp;
3827
Integer width[MAXDIM], dims[MAXDIM];
3828
Integer lo_loc[MAXDIM], hi_loc[MAXDIM];
3829
Integer plo_loc[MAXDIM], phi_loc[MAXDIM];
3830
Integer tlo_rem[MAXDIM], thi_rem[MAXDIM];
3831
Integer plo_rem[MAXDIM];
3832
Integer ld_loc[MAXDIM], ld_rem[MAXDIM];
3834
int stride_loc[MAXDIM], stride_rem[MAXDIM],count[MAXDIM];
3835
char *ptr_loc, *ptr_rem;
3836
Integer me = ga_nodeid_();
3839
/* if global array has no ghost cells, just return */
3840
if (!ga_has_ghosts_(g_a)) {
3844
size = GA[handle].elemsize;
3845
ndim = GA[handle].ndim;
3846
p_handle = GA[handle].p_handle;
3847
/* initialize ghost cell widths and get array dimensions */
3848
for (idx=0; idx < ndim; idx++) {
3849
width[idx] = (Integer)GA[handle].width[idx];
3850
dims[idx] = (Integer)GA[handle].dims[idx];
3853
/* Check to make sure that global array is well-behaved (all processors
3854
have data and the width of the data in each dimension is greater than
3855
the corresponding value in width[]). */
3856
if (!gai_check_ghost_distr(g_a)) return FALSE;
3858
GA_PUSH_NAME("ga_update7_ghosts");
3859
/* Get pointer to local memory */
3860
ptr_loc = GA[handle].ptr[me];
3861
/* obtain range of data that is held by local processor */
3862
nga_distribution_(g_a,&me,lo_loc,hi_loc);
3864
/* evaluate total number of GET operations that will be required */
3866
for (idx=0; idx < ndim; idx++) ntot *= 3;
3868
/* Loop over all GET operations. The operation corresponding to the
3869
mask of all zeros is left out. */
3870
for (ipx=0; ipx < ntot; ipx++) {
3871
/* Convert ipx to corresponding mask values */
3874
for (idx = 0; idx < ndim; idx++) {
3877
if (mask[idx] != 0) mask0 = FALSE;
3880
if (mask0) continue;
3882
/* check to see if ghost cell block has zero elements*/
3885
for (idx = 0; idx < ndim; idx++) {
3886
if (mask[idx] != 0 && width[idx] == 0) mask0 = TRUE;
3887
if (mask[idx] != 0) itmp++;
3889
/*if (itmp>1) mask0 = TRUE; */
3890
if (mask0) continue;
3891
/* Now that mask has been determined, find data that is to be moved
3892
* and identify processor from which it is coming. Wrap boundaries
3893
* around, if necessary */
3894
for (idx = 0; idx < ndim; idx++) {
3895
if (mask[idx] == 0) {
3896
tlo_rem[idx] = lo_loc[idx];
3897
thi_rem[idx] = hi_loc[idx];
3898
} else if (mask[idx] == -1) {
3899
if (lo_loc[idx] > 1) {
3900
tlo_rem[idx] = lo_loc[idx]-width[idx];
3901
thi_rem[idx] = lo_loc[idx]-1;
3903
tlo_rem[idx] = dims[idx]-width[idx]+1;
3904
thi_rem[idx] = dims[idx];
3906
} else if (mask[idx] == 1) {
3907
if (hi_loc[idx] < dims[idx]) {
3908
tlo_rem[idx] = hi_loc[idx] + 1;
3909
thi_rem[idx] = hi_loc[idx] + width[idx];
3912
thi_rem[idx] = width[idx];
3915
fprintf(stderr,"Illegal mask value found\n");
3918
/* Locate remote processor to which data must be sent */
3919
if (!nga_locate_region_(g_a, tlo_rem, thi_rem, _ga_map,
3920
GA_proclist, &np)) ga_RegionError(ga_ndim_(g_a),
3921
tlo_rem, thi_rem, *g_a);
3923
fprintf(stderr,"More than one remote processor found\n");
3925
/* Remote processor has been identified, now get ready to send
3926
data to it. Start by getting distribution on remote
3928
proc_rem = GA_proclist[0];
3929
nga_distribution_(g_a, &proc_rem, tlo_rem, thi_rem);
3930
for (idx = 0; idx < ndim; idx++) {
3931
if (mask[idx] == 0) {
3932
plo_loc[idx] = width[idx];
3933
phi_loc[idx] = hi_loc[idx]-lo_loc[idx]+width[idx];
3934
plo_rem[idx] = plo_loc[idx];
3935
} else if (mask[idx] == -1) {
3937
phi_loc[idx] = width[idx]-1;
3938
plo_rem[idx] = thi_rem[idx]-tlo_rem[idx]+1;
3939
} else if (mask[idx] == 1) {
3940
plo_loc[idx] = hi_loc[idx]-lo_loc[idx]+width[idx]+1;
3941
phi_loc[idx] = hi_loc[idx]-lo_loc[idx]+2*width[idx];
3942
plo_rem[idx] = width[idx];
3945
/* Get pointer to local data buffer and remote data
3946
buffer as well as lists of leading dimenstions */
3947
gam_LocationWithGhosts(me, handle, plo_loc, &ptr_loc, ld_loc);
3948
gam_LocationWithGhosts(proc_rem, handle, plo_rem, &ptr_rem, ld_rem);
3950
/* Evaluate strides on local and remote processors */
3951
gam_setstride(ndim, size, ld_loc, ld_rem, stride_rem,
3954
/* Compute the number of elements in each dimension and store
3955
result in count. Scale the first element in count by the
3957
gam_ComputeCount(ndim, plo_loc, phi_loc, count);
3960
if (p_handle >= 0) {
3961
proc_rem = PGRP_LIST[p_handle].inv_map_proc_list[proc_rem];
3963
/* put data on remote processor */
3964
/* ARMCI_GetS(ptr_rem, stride_rem, ptr_loc, stride_loc, count,
3965
(int)(ndim - 1), (int)proc_rem); */
3966
ARMCI_NbGetS(ptr_rem, stride_rem, ptr_loc, stride_loc, count,
3967
(int)(ndim - 1), (int)proc_rem, NULL);
3975
void FATR ga_ghost_barrier_()
3978
int signal = 1, n = 1;
3981
armci_msg_igop(ptr,n,"+");
3983
armci_msg_barrier();
3986
/*\ UPDATE THE GHOST CELLS ON A PROCESSOR IN A SPECIFIC DIRECTION
3987
* USING NON-BLOCKING GET CALLS
3989
void FATR nga_nbget_ghost_dir_(Integer *g_a,
3993
Integer handle = GA_OFFSET + *g_a;
3994
Integer lo_loc[MAXDIM], hi_loc[MAXDIM], lo_rem[MAXDIM], hi_rem[MAXDIM];
3995
Integer subscript[MAXDIM], ld[MAXDIM];
3996
Integer i, ndim, dim, width;
3998
Integer me = ga_nodeid_();
4000
GA_PUSH_NAME("nga_nbget_ghost_dir");
4001
ndim = GA[handle].ndim;
4002
p_handle = GA[handle].p_handle;
4003
/* check mask to see that it corresponds to a valid direction */
4004
for (i=0; i<ndim; i++) {
4005
if (abs(mask[i]) != 0 && abs(mask[i]) != 1)
4006
ga_error("nga_nbget_ghost_dir: invalid mask entry", mask[i]);
4009
/* get range of data on local processor */
4010
nga_distribution_(g_a,&me,lo_loc,hi_loc);
4012
/* locate data on remote processor */
4013
for (i=0; i<ndim; i++) {
4014
dim = (Integer)GA[handle].dims[i];
4015
width = (Integer)GA[handle].width[i];
4017
if (hi_loc[i] == dim) {
4021
lo_rem[i] = hi_loc[i]+1;
4022
hi_rem[i] = hi_loc[i]+width;
4024
} else if (mask[i] == -1) {
4025
if (lo_loc[i] == 1) {
4026
lo_rem[i] = dim - width + 1;
4029
lo_rem[i] = lo_loc[i] - width;
4030
hi_rem[i] = lo_loc[i] - 1;
4033
lo_rem[i] = lo_loc[i];
4034
hi_rem[i] = hi_loc[i];
4038
/* Get pointer to data destination on local block. Start by
4039
by finding subscript to origin of destination block */
4040
for (i=0; i<ndim; i++) {
4042
subscript[i] = hi_loc[i]-lo_loc[i]+1+GA[handle].width[i];
4043
} else if (mask[i] == -1) {
4046
subscript[i] = GA[handle].width[i];
4048
ld[i] = hi_loc[i]-lo_loc[i]+1+2*GA[handle].width[i];
4050
gam_LocationWithGhosts(me, handle, subscript, &ptr_loc, ld);
4052
nga_get_common(g_a,lo_rem,hi_rem,ptr_loc,ld,nbhandle);
4056
/*\ SET PRECOMPUTED INFO FOR UPDATING GHOST CELLS
4058
logical ga_set_ghost_info_(Integer *g_a)
4060
Integer handle = *g_a + GA_OFFSET;
4061
if (GA[handle].cache != NULL)
4062
free(GA[handle].cache);
4063
GA[handle].cache = NULL;
4064
if (GA[handle].actv == 1) {
4066
return ga_set_update5_info_(g_a);
4068
return ga_set_update4_info_(g_a);
4074
/*\ SET FLAG ON WHETHER OR NOT TO UPDATE GHOST CELL CORNER DATA
4076
void ga_set_ghost_corner_flag_(Integer *g_a, logical *flag)
4078
Integer handle = *g_a + GA_OFFSET;
4079
GA[handle].corner_flag = (int)*flag;
4080
if (GA[handle].actv == 1) {
4081
(void)ga_set_ghost_info_(g_a);