6
* module: global.npatch.c
8
* description: Implements the n-dimensional patch operations:
17
* This material was prepared as an account of work sponsored by an
18
* agency of the United States Government. Neither the United States
19
* Government nor the United States Department of Energy, nor Battelle,
20
* nor any of their employees, MAKES ANY WARRANTY, EXPRESS OR IMPLIED, OR
21
* ASSUMES ANY LEGAL LIABILITY OR RESPONSIBILITY FOR THE ACCURACY,
22
* COMPLETENESS, OR USEFULNESS OF ANY INFORMATION, APPARATUS, PRODUCT,
23
* SOFTWARE, OR PROCESS DISCLOSED, OR REPRESENTS THAT ITS USE WOULD NOT
24
* INFRINGE PRIVATELY OWNED RIGHTS.
29
* This software and its documentation were produced with United States
30
* Government support under Contract Number DE-AC06-76RLO-1830 awarded by
31
* the United States Department of Energy. The United States Government
32
* retains a paid-up non-exclusive, irrevocable worldwide license to
33
* reproduce, prepare derivative works, perform publicly and display
34
* publicly by or for the US Government, including the right to
35
* distribute to other US Government contractors.
49
extern ARMCI_Group* ga_get_armci_group_(int);
52
/**********************************************************
53
* n-dimensional utilities *
54
**********************************************************/
56
/*\ compute Index from subscript and convert it back to subscript
59
static void snga_dest_indices(Integer ndims, Integer *los, Integer *blos, Integer *dimss,
60
Integer ndimd, Integer *lod, Integer *blod, Integer *dimsd)
62
Integer idx = 0, i, factor=1;
64
for(i=0;i<ndims;i++) {
65
idx += (los[i] - blos[i])*factor;
69
for(i=0;i<ndims;i++) {
70
lod[i] = idx % dimsd[i] + blod[i];
76
/* check if I own data in the patch */
77
logical pnga_patch_intersect(Integer *lo, Integer *hi,
78
Integer *lop, Integer *hip, Integer ndim)
82
/* check consistency of patch coordinates */
83
for(i=0; i<ndim; i++) {
84
if(hi[i] < lo[i]) return FALSE; /* inconsistent */
85
if(hip[i] < lop[i]) return FALSE; /* inconsistent */
88
/* find the intersection and update (ilop: ihip, jlop: jhip) */
89
for(i=0; i<ndim; i++) {
90
if(hi[i] < lop[i]) return FALSE; /* don't intersect */
91
if(hip[i] < lo[i]) return FALSE; /* don't intersect */
94
for(i=0; i<ndim; i++) {
95
lop[i] = GA_MAX(lo[i], lop[i]);
96
hip[i] = GA_MIN(hi[i], hip[i]);
103
/*\ check if patches are identical
105
logical pnga_comp_patch(Integer andim, Integer *alo, Integer *ahi,
106
Integer bndim, Integer *blo, Integer *bhi)
113
for(i=ndim; i<andim; i++)
114
if(alo[i] != ahi[i]) return FALSE;
116
else if(andim < bndim) {
118
for(i=ndim; i<bndim; i++)
119
if(blo[i] != bhi[i]) return FALSE;
123
for(i=0; i<ndim; i++)
124
if((alo[i] != blo[i]) || (ahi[i] != bhi[i])) return FALSE;
130
/* test two GAs to see if they have the same shape */
131
static logical snga_test_shape(Integer *alo, Integer *ahi, Integer *blo,
132
Integer *bhi, Integer andim, Integer bndim)
136
if(andim != bndim) return FALSE;
138
for(i=0; i<andim; i++)
139
if((ahi[i] - alo[i]) != (bhi[i] - blo[i])) return FALSE;
145
/**********************************************************
146
* n-dimensional functions *
147
**********************************************************/
150
/*\ COPY A PATCH AND POSSIBLY RESHAPE
152
* . the element capacities of two patches must be identical
153
* . copy by column order - Fortran convention
155
#if HAVE_SYS_WEAK_ALIAS_PRAGMA
156
# pragma weak wnga_copy_patch = pnga_copy_patch
158
void pnga_copy_patch(char *trans,
159
Integer g_a, Integer *alo, Integer *ahi,
160
Integer g_b, Integer *blo, Integer *bhi)
164
Integer atype, btype, andim, adims[MAXDIM], bndim, bdims[MAXDIM];
166
Integer atotal, btotal;
167
Integer los[MAXDIM], his[MAXDIM];
168
Integer lod[MAXDIM], hid[MAXDIM];
169
Integer ld[MAXDIM], ald[MAXDIM], bld[MAXDIM];
170
void *src_data_ptr, *tmp_ptr;
171
Integer *src_idx_ptr, *dst_idx_ptr;
172
Integer bvalue[MAXDIM], bunit[MAXDIM];
173
Integer factor_idx1[MAXDIM], factor_idx2[MAXDIM], factor_data[MAXDIM];
176
Integer a_grp, b_grp, anproc, bnproc;
177
Integer num_blocks_a, num_blocks_b/*, chk*/;
178
int use_put, has_intersection;
179
int local_sync_begin,local_sync_end;
181
local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
182
_ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
183
a_grp = pnga_get_pgroup(g_a);
184
b_grp = pnga_get_pgroup(g_b);
185
me_a = pnga_pgroup_nodeid(a_grp);
186
me_b = pnga_pgroup_nodeid(b_grp);
187
anproc = pnga_get_pgroup_size(a_grp);
188
bnproc = pnga_get_pgroup_size(b_grp);
189
if (anproc <= bnproc) {
195
/*if (a_grp != b_grp)
196
pnga_error("All matrices must be on same group for pnga_copy_patch", 0L); */
197
if(local_sync_begin) {
198
if (anproc <= bnproc) {
199
pnga_pgroup_sync(a_grp);
200
} else if (a_grp == pnga_pgroup_get_world() &&
201
b_grp == pnga_pgroup_get_world()) {
204
pnga_pgroup_sync(b_grp);
208
GA_PUSH_NAME("pnga_copy_patch");
210
pnga_inquire(g_a, &atype, &andim, adims);
211
pnga_inquire(g_b, &btype, &bndim, bdims);
214
/* they are the same patch */
215
if(pnga_comp_patch(andim, alo, ahi, bndim, blo, bhi)) {
217
/* they are in the same GA, but not the same patch */
218
} else if (pnga_patch_intersect(alo, ahi, blo, bhi, andim)) {
219
pnga_error("array patches cannot overlap ", 0L);
223
if(atype != btype ) pnga_error("array type mismatch ", 0L);
225
/* check if patch indices and dims match */
226
for(i=0; i<andim; i++)
227
if(alo[i] <= 0 || ahi[i] > adims[i])
228
pnga_error("g_a indices out of range ", 0L);
229
for(i=0; i<bndim; i++)
230
if(blo[i] <= 0 || bhi[i] > bdims[i])
231
pnga_error("g_b indices out of range ", 0L);
233
/* check if numbers of elements in two patches match each other */
234
atotal = 1; btotal = 1;
235
for(i=0; i<andim; i++) atotal *= (ahi[i] - alo[i] + 1);
236
for(i=0; i<bndim; i++) btotal *= (bhi[i] - blo[i] + 1);
238
pnga_error("capacities two of patches do not match ", 0L);
240
/* additional restrictions that apply if one or both arrays use
241
block-cyclic data distributions */
242
num_blocks_a = pnga_total_blocks(g_a);
243
num_blocks_b = pnga_total_blocks(g_b);
244
if (num_blocks_a >= 0 || num_blocks_b >= 0) {
245
if (!(*trans == 'n' || *trans == 'N')) {
246
pnga_error("Transpose option not supported for block-cyclic data", 0L);
248
if (!snga_test_shape(alo, ahi, blo, bhi, andim, bndim)) {
249
pnga_error("Change in shape not supported for block-cyclic data", 0L);
253
if (num_blocks_a < 0 && num_blocks_b <0) {
254
/* now find out cordinates of a patch of g_a that I own */
256
pnga_distribution(g_a, me_a, los, his);
258
pnga_distribution(g_b, me_b, los, his);
261
/* copy my share of data */
263
has_intersection = pnga_patch_intersect(alo, ahi, los, his, andim);
265
has_intersection = pnga_patch_intersect(blo, bhi, los, his, bndim);
267
if(has_intersection){
269
pnga_access_ptr(g_a, los, his, &src_data_ptr, ld);
271
pnga_access_ptr(g_b, los, his, &src_data_ptr, ld);
274
/* calculate the number of elements in the patch that I own */
275
nelem = 1; for(i=0; i<andim; i++) nelem *= (his[i] - los[i] + 1);
277
for(i=0; i<andim; i++) ald[i] = ahi[i] - alo[i] + 1;
278
for(i=0; i<bndim; i++) bld[i] = bhi[i] - blo[i] + 1;
280
base = 0; factor = 1;
281
for(i=0; i<andim; i++) {
282
base += los[i] * factor;
286
/*** straight copy possible if there's no reshaping or transpose ***/
287
if((*trans == 'n' || *trans == 'N') &&
288
snga_test_shape(alo, ahi, blo, bhi, andim, bndim)) {
289
/* find source[lo:hi] --> destination[lo:hi] */
291
snga_dest_indices(andim, los, alo, ald, bndim, lod, blo, bld);
292
snga_dest_indices(andim, his, alo, ald, bndim, hid, blo, bld);
293
pnga_put(g_b, lod, hid, src_data_ptr, ld);
294
pnga_release(g_a, los, his);
296
snga_dest_indices(bndim, los, blo, bld, andim, lod, alo, ald);
297
snga_dest_indices(bndim, his, blo, bld, andim, hid, alo, ald);
298
pnga_get(g_a, lod, hid, src_data_ptr, ld);
299
pnga_release(g_b, los, his);
301
/*** due to generality of this transformation scatter is required ***/
304
tmp_ptr = ga_malloc(nelem, atype, "v");
305
src_idx_ptr = (Integer*) ga_malloc((andim*nelem), MT_F_INT, "si");
306
dst_idx_ptr = (Integer*) ga_malloc((bndim*nelem), MT_F_INT, "di");
308
/* calculate the destination indices */
310
/* given los and his, find indices for each elements
311
* bvalue: starting index in each dimension
312
* bunit: stride in each dimension
314
for (i=0; i<andim; i++) {
316
if (i == 0) bunit[i] = 1;
317
else bunit[i] = bunit[i-1] * (his[i-1] - los[i-1] + 1);
321
for (i=0; i<nelem; i++) {
322
for (j=0; j<andim; j++){
323
src_idx_ptr[i*andim+j] = bvalue[j];
324
/* if the next element is the first element in
325
* one dimension, increment the index by 1
327
if (((i+1) % bunit[j]) == 0) bvalue[j]++;
328
/* if the index becomes larger than the upper
329
* bound in one dimension, reset it.
331
if(bvalue[j] > his[j]) bvalue[j] = los[j];
335
/* index factor: reshaping without transpose */
337
for (j=1; j<andim; j++)
338
factor_idx1[j] = factor_idx1[j-1] * ald[j-1];
340
/* index factor: reshaping with transpose */
341
factor_idx2[andim-1] = 1;
342
for (j=(andim-1)-1; j>=0; j--)
343
factor_idx2[j] = factor_idx2[j+1] * ald[j+1];
347
for (j=1; j<andim; j++)
348
factor_data[j] = factor_data[j-1] * ld[j-1];
350
/* destination indices */
351
for(i=0; i<nelem; i++) {
352
/* linearize the n-dimensional indices to one dimension */
354
if (*trans == 'n' || *trans == 'N')
355
for (j=0; j<andim; j++)
356
idx += (src_idx_ptr[i*andim+j] - alo[j]) *
359
/* if the patch needs to be transposed, reverse
360
* the indices: (i, j, ...) -> (..., j, i)
362
for (j=(andim-1); j>=0; j--)
363
idx += (src_idx_ptr[i*andim+j] - alo[j]) *
366
/* convert the one dimensional index to n-dimensional
367
* indices of destination
369
for (j=0; j<bndim; j++) {
370
dst_idx_ptr[i*bndim+j] = idx % bld[j] + blo[j];
374
/* move the data block to create a new block */
375
/* linearize the data indices */
377
for (j=0; j<andim; j++)
378
idx += (src_idx_ptr[i*andim+j]) * factor_data[j];
380
/* adjust the position
381
* base: starting address of the first element */
384
/* move the element to the temporary location */
386
case C_DBL: ((double*)tmp_ptr)[i] =
387
((double*)src_data_ptr)[idx];
390
((int *)tmp_ptr)[i] = ((int *)src_data_ptr)[idx];
392
case C_DCPL:((DoubleComplex *)tmp_ptr)[i] =
393
((DoubleComplex *)src_data_ptr)[idx];
395
case C_SCPL:((SingleComplex *)tmp_ptr)[i] =
396
((SingleComplex *)src_data_ptr)[idx];
398
case C_FLOAT: ((float *)tmp_ptr)[i] =
399
((float *)src_data_ptr)[idx];
401
case C_LONG: ((long *)tmp_ptr)[i] =
402
((long *)src_data_ptr)[idx];
404
case C_LONGLONG: ((long long *)tmp_ptr)[i] =
405
((long long *)src_data_ptr)[idx];
408
pnga_release(g_a, los, his);
409
pnga_scatter(g_b, tmp_ptr, dst_idx_ptr, 0, nelem);
410
ga_free(dst_idx_ptr);
411
ga_free(src_idx_ptr);
414
tmp_ptr = ga_malloc(nelem, atype, "v");
415
src_idx_ptr = (Integer*) ga_malloc((bndim*nelem), MT_F_INT, "si");
416
dst_idx_ptr = (Integer*) ga_malloc((andim*nelem), MT_F_INT, "di");
418
/* calculate the destination indices */
420
/* given los and his, find indices for each elements
421
* bvalue: starting index in each dimension
422
* bunit: stride in each dimension
424
for (i=0; i<andim; i++) {
426
if (i == 0) bunit[i] = 1;
427
else bunit[i] = bunit[i-1] * (his[i-1] - los[i-1] + 1);
430
/* destination indices */
431
for (i=0; i<nelem; i++) {
432
for (j=0; j<bndim; j++){
433
src_idx_ptr[i*bndim+j] = bvalue[j];
434
/* if the next element is the first element in
435
* one dimension, increment the index by 1
437
if (((i+1) % bunit[j]) == 0) bvalue[j]++;
438
/* if the index becomes larger than the upper
439
* bound in one dimension, reset it.
441
if(bvalue[j] > his[j]) bvalue[j] = los[j];
445
/* index factor: reshaping without transpose */
447
for (j=1; j<bndim; j++)
448
factor_idx1[j] = factor_idx1[j-1] * bld[j-1];
450
/* index factor: reshaping with transpose */
451
factor_idx2[bndim-1] = 1;
452
for (j=(bndim-1)-1; j>=0; j--)
453
factor_idx2[j] = factor_idx2[j+1] * bld[j+1];
457
for (j=1; j<bndim; j++)
458
factor_data[j] = factor_data[j-1] * ld[j-1];
460
/* destination indices */
461
for(i=0; i<nelem; i++) {
462
/* linearize the n-dimensional indices to one dimension */
464
if (*trans == 'n' || *trans == 'N')
465
for (j=0; j<andim; j++)
466
idx += (src_idx_ptr[i*bndim+j] - blo[j]) *
469
/* if the patch needs to be transposed, reverse
470
* the indices: (i, j, ...) -> (..., j, i)
472
for (j=(andim-1); j>=0; j--)
473
idx += (src_idx_ptr[i*bndim+j] - blo[j]) *
476
/* convert the one dimensional index to n-dimensional
477
* indices of destination
479
for (j=0; j<andim; j++) {
480
dst_idx_ptr[i*bndim+j] = idx % ald[j] + alo[j];
484
/* move the data block to create a new block */
485
/* linearize the data indices */
487
for (j=0; j<bndim; j++)
488
idx += (src_idx_ptr[i*bndim+j]) * factor_data[j];
490
/* adjust the position
491
* base: starting address of the first element */
494
/* move the element to the temporary location */
496
case C_DBL: ((double*)tmp_ptr)[i] =
497
((double*)src_data_ptr)[idx];
500
((int *)tmp_ptr)[i] = ((int *)src_data_ptr)[idx];
502
case C_DCPL:((DoubleComplex *)tmp_ptr)[i] =
503
((DoubleComplex *)src_data_ptr)[idx];
505
case C_SCPL:((SingleComplex *)tmp_ptr)[i] =
506
((SingleComplex *)src_data_ptr)[idx];
508
case C_FLOAT: ((float *)tmp_ptr)[i] =
509
((float *)src_data_ptr)[idx];
511
case C_LONG: ((long *)tmp_ptr)[i] =
512
((long *)src_data_ptr)[idx];
514
case C_LONGLONG: ((long long *)tmp_ptr)[i] =
515
((long long *)src_data_ptr)[idx];
518
pnga_release(g_b, los, his);
519
pnga_gather(g_a, tmp_ptr, dst_idx_ptr, 0, nelem);
520
ga_free(dst_idx_ptr);
521
ga_free(src_idx_ptr);
527
Integer offset, last, jtot;
528
for (i=0; i<andim; i++) {
529
ald[i] = ahi[i] - alo[i] + 1;
531
for (i=0; i<bndim; i++) {
532
bld[i] = bhi[i] - blo[i] + 1;
535
/* Array a is block-cyclic distributed */
536
if (num_blocks_a >= 0) {
537
/* Uses simple block-cyclic data distribution */
538
if (!pnga_uses_proc_grid(g_a)) {
539
for (i = me_a; i < num_blocks_a; i += anproc) {
540
pnga_distribution(g_a, i, los, his);
541
/* make temporory copies of los, his since ngai_patch_intersection
542
destroys original versions */
543
for (j=0; j < andim; j++) {
547
if (pnga_patch_intersect(alo,ahi,los,his,andim)) {
548
pnga_access_block_ptr(g_a, i, &src_data_ptr, ld);
552
for (j=0; j<last; j++) {
553
offset += (los[j]-lod[j])*jtot;
556
offset += (los[last]-lod[last])*jtot;
559
src_data_ptr = (void*)((double*)(src_data_ptr) + offset);
562
src_data_ptr = (void*)((int*)(src_data_ptr) + offset);
565
src_data_ptr = (void*)((DoubleComplex*)(src_data_ptr) + offset);
568
src_data_ptr = (void*)((SingleComplex*)(src_data_ptr) + offset);
571
src_data_ptr = (void*)((float*)(src_data_ptr) + offset);
574
src_data_ptr = (void*)((long*)(src_data_ptr) + offset);
577
src_data_ptr = (void*)((long long*)(src_data_ptr) + offset);
582
snga_dest_indices(andim, los, alo, ald, bndim, lod, blo, bld);
583
snga_dest_indices(andim, his, alo, ald, bndim, hid, blo, bld);
584
pnga_put(g_b, lod, hid, src_data_ptr, ld);
585
pnga_release_block(g_a, i);
589
/* Uses scalapack block-cyclic data distribution */
590
Integer proc_index[MAXDIM], index[MAXDIM];
591
Integer topology[MAXDIM];
592
Integer blocks[MAXDIM], block_dims[MAXDIM];
593
pnga_get_proc_index(g_a, me_a, proc_index);
594
pnga_get_proc_index(g_a, me_a, index);
595
pnga_get_block_info(g_a, blocks, block_dims);
596
pnga_get_proc_grid(g_a, topology);
597
while (index[andim-1] < blocks[andim-1]) {
598
/* find bounding coordinates of block */
600
for (i = 0; i < andim; i++) {
601
los[i] = index[i]*block_dims[i]+1;
602
his[i] = (index[i] + 1)*block_dims[i];
603
if (his[i] > adims[i]) his[i] = adims[i];
604
/*if (his[i] < los[i]) chk = 0;*/
606
/* make temporory copies of los, his since ngai_patch_intersection
607
destroys original versions */
608
for (j=0; j < andim; j++) {
612
if (pnga_patch_intersect(alo,ahi,los,his,andim)) {
613
pnga_access_block_grid_ptr(g_a, index, &src_data_ptr, ld);
617
for (j=0; j<last; j++) {
618
offset += (los[j]-lod[j])*jtot;
621
offset += (los[last]-lod[last])*jtot;
624
src_data_ptr = (void*)((double*)(src_data_ptr) + offset);
627
src_data_ptr = (void*)((int*)(src_data_ptr) + offset);
630
src_data_ptr = (void*)((DoubleComplex*)(src_data_ptr) + offset);
633
src_data_ptr = (void*)((SingleComplex*)(src_data_ptr) + offset);
636
src_data_ptr = (void*)((float*)(src_data_ptr) + offset);
639
src_data_ptr = (void*)((long*)(src_data_ptr) + offset);
642
src_data_ptr = (void*)((long long*)(src_data_ptr) + offset);
647
snga_dest_indices(andim, los, alo, ald, bndim, lod, blo, bld);
648
snga_dest_indices(andim, his, alo, ald, bndim, hid, blo, bld);
649
pnga_put(g_b, lod, hid, src_data_ptr, ld);
650
pnga_release_block_grid(g_a, index);
653
/* increment index to get next block on processor */
654
index[0] += topology[0];
655
for (i = 0; i < andim; i++) {
656
if (index[i] >= blocks[i] && i<andim-1) {
657
index[i] = proc_index[i];
658
index[i+1] += topology[i+1];
664
/* Array b is block-cyclic distributed */
665
pnga_distribution(g_a, me_a, los, his);
666
if (pnga_patch_intersect(alo,ahi,los,his,andim)) {
667
pnga_access_ptr(g_a, los, his, &src_data_ptr, ld);
668
snga_dest_indices(andim, los, alo, ald, bndim, lod, blo, bld);
669
snga_dest_indices(andim, his, alo, ald, bndim, hid, blo, bld);
670
pnga_put(g_b, lod, hid, src_data_ptr, ld);
671
pnga_release(g_a, los, his);
675
/* Array b is block-cyclic distributed */
676
if (num_blocks_b >= 0) {
677
/* Uses simple block-cyclic data distribution */
678
if (!pnga_uses_proc_grid(g_b)) {
679
for (i = me_b; i < num_blocks_b; i += bnproc) {
680
pnga_distribution(g_b, i, los, his);
681
/* make temporory copies of los, his since ngai_patch_intersection
682
destroys original versions */
683
for (j=0; j < andim; j++) {
687
if (pnga_patch_intersect(blo,bhi,los,his,andim)) {
688
pnga_access_block_ptr(g_b, i, &src_data_ptr, ld);
692
for (j=0; j<last; j++) {
693
offset += (los[j]-lod[j])*jtot;
696
offset += (los[last]-lod[last])*jtot;
699
src_data_ptr = (void*)((double*)(src_data_ptr) + offset);
702
src_data_ptr = (void*)((int*)(src_data_ptr) + offset);
705
src_data_ptr = (void*)((DoubleComplex*)(src_data_ptr) + offset);
708
src_data_ptr = (void*)((SingleComplex*)(src_data_ptr) + offset);
711
src_data_ptr = (void*)((float*)(src_data_ptr) + offset);
714
src_data_ptr = (void*)((long*)(src_data_ptr) + offset);
717
src_data_ptr = (void*)((long long*)(src_data_ptr) + offset);
722
snga_dest_indices(bndim, los, blo, bld, andim, lod, alo, ald);
723
snga_dest_indices(bndim, his, blo, bld, andim, hid, alo, ald);
724
pnga_get(g_a, lod, hid, src_data_ptr, ld);
725
pnga_release_block(g_b, i);
729
/* Uses scalapack block-cyclic data distribution */
730
Integer proc_index[MAXDIM], index[MAXDIM];
731
Integer topology[MAXDIM];
732
Integer blocks[MAXDIM], block_dims[MAXDIM];
733
pnga_get_proc_index(g_b, me_b, proc_index);
734
pnga_get_proc_index(g_b, me_b, index);
735
pnga_get_block_info(g_b, blocks, block_dims);
736
pnga_get_proc_grid(g_b, topology);
737
while (index[bndim-1] < blocks[bndim-1]) {
738
/* find bounding coordinates of block */
740
for (i = 0; i < bndim; i++) {
741
los[i] = index[i]*block_dims[i]+1;
742
his[i] = (index[i] + 1)*block_dims[i];
743
if (his[i] > bdims[i]) his[i] = bdims[i];
744
/*if (his[i] < los[i]) chk = 0;*/
746
/* make temporory copies of los, his since ngai_patch_intersection
747
destroys original versions */
748
for (j=0; j < andim; j++) {
752
if (pnga_patch_intersect(blo,bhi,los,his,andim)) {
753
pnga_access_block_grid_ptr(g_b, index, &src_data_ptr, ld);
757
for (j=0; j<last; j++) {
758
offset += (los[j]-lod[j])*jtot;
761
offset += (los[last]-lod[last])*jtot;
764
src_data_ptr = (void*)((double*)(src_data_ptr) + offset);
767
src_data_ptr = (void*)((int*)(src_data_ptr) + offset);
770
src_data_ptr = (void*)((DoubleComplex*)(src_data_ptr) + offset);
773
src_data_ptr = (void*)((SingleComplex*)(src_data_ptr) + offset);
776
src_data_ptr = (void*)((float*)(src_data_ptr) + offset);
779
src_data_ptr = (void*)((long*)(src_data_ptr) + offset);
782
src_data_ptr = (void*)((long long*)(src_data_ptr) + offset);
787
snga_dest_indices(bndim, los, blo, bld, andim, lod, alo, ald);
788
snga_dest_indices(bndim, his, blo, bld, andim, hid, alo, ald);
789
pnga_get(g_a, lod, hid, src_data_ptr, ld);
790
pnga_release_block_grid(g_b, index);
793
/* increment index to get next block on processor */
794
index[0] += topology[0];
795
for (i = 0; i < bndim; i++) {
796
if (index[i] >= blocks[i] && i<bndim-1) {
797
index[i] = proc_index[i];
798
index[i+1] += topology[i+1];
804
/* Array a is block-cyclic distributed */
805
pnga_distribution(g_b, me_b, los, his);
806
if (pnga_patch_intersect(blo,bhi,los,his,bndim)) {
807
pnga_access_ptr(g_b, los, his, &src_data_ptr, ld);
808
snga_dest_indices(bndim, los, blo, bld, andim, lod, alo, ald);
809
snga_dest_indices(bndim, his, blo, bld, andim, hid, alo, ald);
810
pnga_get(g_a, lod, hid, src_data_ptr, ld);
811
pnga_release(g_b, los, his);
819
if (anproc <= bnproc) {
820
pnga_pgroup_sync(a_grp);
821
} else if (a_grp == pnga_pgroup_get_world() &&
822
b_grp == pnga_pgroup_get_world()) {
825
pnga_pgroup_sync(b_grp);
831
static void snga_dot_local_patch(Integer atype, Integer andim, Integer *loA,
832
Integer *hiA, Integer *ldA, void *A_ptr, void *B_ptr,
833
int *alen, void *retval)
842
Integer i, j, n1dim, idx;
843
Integer bvalue[MAXDIM], bunit[MAXDIM], baseldA[MAXDIM];
845
isum = 0; dsum = 0.; zsum.real = 0.; zsum.imag = 0.; fsum = 0;lsum=0;llsum=0;
846
csum.real = 0.; csum.imag = 0.;
848
/* number of n-element of the first dimension */
849
n1dim = 1; for(i=1; i<andim; i++) n1dim *= (hiA[i] - loA[i] + 1);
851
/* calculate the destination indices */
852
bvalue[0] = 0; bvalue[1] = 0; bunit[0] = 1; bunit[1] = 1;
853
/* baseldA[0] = ldA[0]
854
* baseldA[1] = ldA[0] * ldA[1]
855
* baseldA[2] = ldA[0] * ldA[1] * ldA[2] .....
857
baseldA[0] = ldA[0]; baseldA[1] = baseldA[0] *ldA[1];
858
for(i=2; i<andim; i++) {
860
bunit[i] = bunit[i-1] * (hiA[i-1] - loA[i-1] + 1);
861
baseldA[i] = baseldA[i-1] * ldA[i];
864
/* compute "local" contribution to the dot product */
867
for(i=0; i<n1dim; i++) {
869
for(j=1; j<andim; j++) {
870
idx += bvalue[j] * baseldA[j-1];
871
if(((i+1) % bunit[j]) == 0) bvalue[j]++;
872
if(bvalue[j] > (hiA[j]-loA[j])) bvalue[j] = 0;
875
for(j=0; j<(hiA[0]-loA[0]+1); j++)
876
isum += ((int *)A_ptr)[idx+j] *
877
((int *)B_ptr)[idx+j];
879
*(int*)retval += isum;
882
for(i=0; i<n1dim; i++) {
884
for(j=1; j<andim; j++) {
885
idx += bvalue[j] * baseldA[j-1];
886
if(((i+1) % bunit[j]) == 0) bvalue[j]++;
887
if(bvalue[j] > (hiA[j]-loA[j])) bvalue[j] = 0;
890
for(j=0; j<(hiA[0]-loA[0]+1); j++) {
891
DoubleComplex a = ((DoubleComplex *)A_ptr)[idx+j];
892
DoubleComplex b = ((DoubleComplex *)B_ptr)[idx+j];
893
zsum.real += a.real*b.real - b.imag * a.imag;
894
zsum.imag += a.imag*b.real + b.imag * a.real;
897
((double*)retval)[0] += zsum.real;
898
((double*)retval)[1] += zsum.imag;
901
for(i=0; i<n1dim; i++) {
903
for(j=1; j<andim; j++) {
904
idx += bvalue[j] * baseldA[j-1];
905
if(((i+1) % bunit[j]) == 0) bvalue[j]++;
906
if(bvalue[j] > (hiA[j]-loA[j])) bvalue[j] = 0;
909
for(j=0; j<(hiA[0]-loA[0]+1); j++) {
910
SingleComplex a = ((SingleComplex *)A_ptr)[idx+j];
911
SingleComplex b = ((SingleComplex *)B_ptr)[idx+j];
912
csum.real += a.real*b.real - b.imag * a.imag;
913
csum.imag += a.imag*b.real + b.imag * a.real;
916
((float*)retval)[0] += csum.real;
917
((float*)retval)[1] += csum.imag;
920
for(i=0; i<n1dim; i++) {
922
for(j=1; j<andim; j++) {
923
idx += bvalue[j] * baseldA[j-1];
924
if(((i+1) % bunit[j]) == 0) bvalue[j]++;
925
if(bvalue[j] > (hiA[j]-loA[j])) bvalue[j] = 0;
928
for(j=0; j<(hiA[0]-loA[0]+1); j++)
929
dsum += ((double*)A_ptr)[idx+j] *
930
((double*)B_ptr)[idx+j];
932
*(double*)retval += dsum;
935
for(i=0; i<n1dim; i++) {
937
for(j=1; j<andim; j++) {
938
idx += bvalue[j] * baseldA[j-1];
939
if(((i+1) % bunit[j]) == 0) bvalue[j]++;
940
if(bvalue[j] > (hiA[j]-loA[j])) bvalue[j] = 0;
943
for(j=0; j<(hiA[0]-loA[0]+1); j++)
944
fsum += ((float *)A_ptr)[idx+j] *
945
((float *)B_ptr)[idx+j];
947
*(float*)retval += fsum;
950
for(i=0; i<n1dim; i++) {
952
for(j=1; j<andim; j++) {
953
idx += bvalue[j] * baseldA[j-1];
954
if(((i+1) % bunit[j]) == 0) bvalue[j]++;
955
if(bvalue[j] > (hiA[j]-loA[j])) bvalue[j] = 0;
958
for(j=0; j<(hiA[0]-loA[0]+1); j++)
959
lsum += ((long *)A_ptr)[idx+j] *
960
((long *)B_ptr)[idx+j];
962
*(long*)retval += lsum;
965
for(i=0; i<n1dim; i++) {
967
for(j=1; j<andim; j++) {
968
idx += bvalue[j] * baseldA[j-1];
969
if(((i+1) % bunit[j]) == 0) bvalue[j]++;
970
if(bvalue[j] > (hiA[j]-loA[j])) bvalue[j] = 0;
973
for(j=0; j<(hiA[0]-loA[0]+1); j++)
974
llsum += ((long long *)A_ptr)[idx+j] *
975
((long long *)B_ptr)[idx+j];
977
*(long long*)retval += llsum;
980
pnga_error("snga_dot_local_patch: type not supported",atype);
986
/*\ generic dot product routine
988
#if HAVE_SYS_WEAK_ALIAS_PRAGMA
989
# pragma weak wnga_dot_patch = pnga_dot_patch
991
void pnga_dot_patch(Integer g_a, char *t_a, Integer *alo, Integer *ahi, Integer g_b, char *t_b, Integer *blo, Integer *bhi, void *retval)
994
Integer compatible=0;
995
Integer atype=0, btype=0, andim=0, adims[MAXDIM], bndim=0, bdims[MAXDIM];
996
Integer loA[MAXDIM], hiA[MAXDIM], ldA[MAXDIM];
997
Integer loB[MAXDIM], hiB[MAXDIM], ldB[MAXDIM];
998
Integer g_A = g_a, g_B = g_b;
999
void *A_ptr=NULL, *B_ptr=NULL;
1001
Integer atotal=0, btotal=0;
1006
DoubleComplex zsum={0,0};
1007
DoubleComplex csum={0,0};
1009
Integer me= pnga_nodeid(), temp_created=0;
1010
Integer nproc = pnga_nnodes();
1011
Integer num_blocks_a=0, num_blocks_b=0;
1012
char *tempname = "temp", transp, transp_a, transp_b;
1013
int local_sync_begin=0;
1014
Integer a_grp=0, b_grp=0;
1016
local_sync_begin = _ga_sync_begin;
1017
_ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
1018
if(local_sync_begin)pnga_sync();
1020
GA_PUSH_NAME("pnga_dot_patch");
1021
a_grp = pnga_get_pgroup(g_a);
1022
b_grp = pnga_get_pgroup(g_b);
1024
pnga_error("Both arrays must be defined on same group",0L);
1025
me = pnga_pgroup_nodeid(a_grp);
1027
pnga_inquire(g_a, &atype, &andim, adims);
1028
pnga_inquire(g_b, &btype, &bndim, bdims);
1030
if(atype != btype ) pnga_error(" type mismatch ", 0L);
1032
/* check if patch indices and g_a dims match */
1033
for(i=0; i<andim; i++)
1034
if(alo[i] <= 0 || ahi[i] > adims[i])
1035
pnga_error("g_a indices out of range ", g_a);
1036
for(i=0; i<bndim; i++)
1037
if(blo[i] <= 0 || bhi[i] > bdims[i])
1038
pnga_error("g_b indices out of range ", g_b);
1040
/* check if numbers of elements in two patches match each other */
1041
atotal = 1; for(i=0; i<andim; i++) atotal *= (ahi[i] - alo[i] + 1);
1042
btotal = 1; for(i=0; i<bndim; i++) btotal *= (bhi[i] - blo[i] + 1);
1044
if(atotal != btotal)
1045
pnga_error(" capacities of patches do not match ", 0L);
1047
/* is transpose operation required ? */
1048
/* -- only if for one array transpose operation requested*/
1049
transp_a = (*t_a == 'n' || *t_a =='N')? 'n' : 't';
1050
transp_b = (*t_b == 'n' || *t_b =='N')? 'n' : 't';
1051
transp = (transp_a == transp_b)? 'n' : 't';
1053
/* Find out if distribution is block-cyclic */
1054
num_blocks_a = pnga_total_blocks(g_a);
1055
num_blocks_b = pnga_total_blocks(g_b);
1057
if (num_blocks_a >= 0 || num_blocks_b >= 0) {
1058
if (transp_a == 't' || transp_b == 't')
1059
pnga_error("transpose not supported for block-cyclic data ", 0);
1062
isum = 0; dsum = 0.; zsum.real = 0.; zsum.imag = 0.; fsum = 0;lsum=0;llsum=0;
1063
csum.real = 0.; csum.imag = 0.;
1067
*(int*)retval = isum;
1071
((double*)retval)[0] = zsum.real;
1072
((double*)retval)[1] = zsum.imag;
1076
((float*)retval)[0] = csum.real;
1077
((float*)retval)[1] = csum.imag;
1081
*(double*)retval = dsum;
1085
*(float*)retval = fsum;
1089
*(long*)retval = lsum;
1093
*(long long*)retval = llsum;
1097
pnga_error("snga_dot_local_patch: type not supported",atype);
1100
if (num_blocks_a < 0 && num_blocks_b < 0) {
1101
/* find out coordinates of patches of g_A and g_B that I own */
1102
pnga_distribution(g_A, me, loA, hiA);
1103
pnga_distribution(g_B, me, loB, hiB);
1105
if(pnga_comp_patch(andim, loA, hiA, bndim, loB, hiB) &&
1106
pnga_comp_patch(andim, alo, ahi, bndim, blo, bhi)) compatible = 1;
1107
else compatible = 0;
1108
pnga_gop(pnga_type_f2c(MT_F_INT), &compatible, 1, "*");
1109
if(!(compatible && (transp=='n'))) {
1110
/* either patches or distributions do not match:
1111
* - create a temp array that matches distribution of g_a
1112
* - copy & reshape patch of g_b into g_B
1114
if (!pnga_duplicate(g_a, &g_B, tempname))
1115
pnga_error("duplicate failed",0L);
1117
pnga_copy_patch(&transp, g_b, blo, bhi, g_B, alo, ahi);
1120
pnga_distribution(g_B, me, loB, hiB);
1123
if(!pnga_comp_patch(andim, loA, hiA, bndim, loB, hiB))
1124
pnga_error(" patches mismatch ",0);
1126
/* A[83:125,1:1] <==> B[83:125] */
1127
if(andim > bndim) andim = bndim; /* need more work */
1129
/* determine subsets of my patches to access */
1130
if(pnga_patch_intersect(alo, ahi, loA, hiA, andim)){
1131
pnga_access_ptr(g_A, loA, hiA, &A_ptr, ldA);
1132
pnga_access_ptr(g_B, loA, hiA, &B_ptr, ldB);
1134
snga_dot_local_patch(atype, andim, loA, hiA, ldA, A_ptr, B_ptr,
1136
/* release access to the data */
1137
pnga_release(g_A, loA, hiA);
1138
pnga_release(g_B, loA, hiA);
1141
/* Create copy of g_b identical with identical distribution as g_a */
1142
if (!pnga_duplicate(g_a, &g_B, tempname))
1143
pnga_error("duplicate failed",0L);
1144
pnga_copy_patch(&transp, g_b, blo, bhi, g_B, alo, ahi);
1147
/* If g_a regular distribution, then just use normal dot product on patch */
1148
if (num_blocks_a < 0) {
1149
/* find out coordinates of patches of g_A and g_B that I own */
1150
pnga_distribution(g_A, me, loA, hiA);
1151
pnga_distribution(g_B, me, loB, hiB);
1153
if(!pnga_comp_patch(andim, loA, hiA, bndim, loB, hiB))
1154
pnga_error(" patches mismatch ",0);
1156
/* A[83:125,1:1] <==> B[83:125] */
1157
if(andim > bndim) andim = bndim; /* need more work */
1158
if(pnga_patch_intersect(alo, ahi, loA, hiA, andim)){
1159
pnga_access_ptr(g_A, loA, hiA, &A_ptr, ldA);
1160
pnga_access_ptr(g_B, loA, hiA, &B_ptr, ldB);
1162
snga_dot_local_patch(atype, andim, loA, hiA, ldA, A_ptr, B_ptr,
1164
/* release access to the data */
1165
pnga_release(g_A, loA, hiA);
1166
pnga_release(g_B, loA, hiA);
1169
Integer lo[MAXDIM]/*, hi[MAXDIM]*/;
1170
Integer offset, jtot, last;
1171
/* simple block cyclic data distribution */
1172
if (!pnga_uses_proc_grid(g_a)) {
1173
for (i=me; i<num_blocks_a; i += nproc) {
1174
pnga_distribution(g_A, i, loA, hiA);
1175
/* make copies of loA and hiA since pnga_patch_intersect destroys
1176
original versions */
1177
for (j=0; j<andim; j++) {
1181
if(pnga_patch_intersect(alo, ahi, loA, hiA, andim)){
1182
pnga_access_block_ptr(g_A, i, &A_ptr, ldA);
1183
pnga_access_block_ptr(g_B, i, &B_ptr, ldB);
1185
/* evaluate offsets for system */
1189
for (j=0; j<last; j++) {
1190
offset += (loA[j] - lo[j])*jtot;
1193
offset += (loA[last]-lo[last])*jtot;
1195
/* offset pointers by correct amount */
1198
A_ptr = (void*)((int*)(A_ptr) + offset);
1199
B_ptr = (void*)((int*)(B_ptr) + offset);
1202
A_ptr = (void*)((DoubleComplex*)(A_ptr) + offset);
1203
B_ptr = (void*)((DoubleComplex*)(B_ptr) + offset);
1206
A_ptr = (void*)((SingleComplex*)(A_ptr) + offset);
1207
B_ptr = (void*)((SingleComplex*)(B_ptr) + offset);
1210
A_ptr = (void*)((double*)(A_ptr) + offset);
1211
B_ptr = (void*)((double*)(B_ptr) + offset);
1214
A_ptr = (void*)((float*)(A_ptr) + offset);
1215
B_ptr = (void*)((float*)(B_ptr) + offset);
1218
A_ptr = (void*)((long*)(A_ptr) + offset);
1219
B_ptr = (void*)((long*)(B_ptr) + offset);
1222
A_ptr = (void*)((long long*)(A_ptr) + offset);
1223
B_ptr = (void*)((long long*)(B_ptr) + offset);
1226
snga_dot_local_patch(atype, andim, loA, hiA, ldA, A_ptr, B_ptr,
1228
/* release access to the data */
1229
pnga_release_block(g_A, i);
1230
pnga_release_block(g_B, i);
1234
/* Uses scalapack block-cyclic data distribution */
1235
Integer proc_index[MAXDIM], index[MAXDIM];
1236
Integer topology[MAXDIM]/*, chk*/;
1237
Integer blocks[MAXDIM], block_dims[MAXDIM];
1238
pnga_get_proc_index(g_a, me, proc_index);
1239
pnga_get_proc_index(g_a, me, index);
1240
pnga_get_block_info(g_a, blocks, block_dims);
1241
pnga_get_proc_grid(g_a, topology);
1242
while (index[andim-1] < blocks[andim-1]) {
1243
/* find bounding coordinates of block */
1245
for (i = 0; i < andim; i++) {
1246
loA[i] = index[i]*block_dims[i]+1;
1247
hiA[i] = (index[i] + 1)*block_dims[i];
1248
if (hiA[i] > adims[i]) hiA[i] = adims[i];
1249
/*if (hiA[i] < loA[i]) chk = 0;*/
1251
/* make copies of loA and hiA since pnga_patch_intersect destroys
1252
original versions */
1253
for (j=0; j<andim; j++) {
1257
if(pnga_patch_intersect(alo, ahi, loA, hiA, andim)){
1258
pnga_access_block_grid_ptr(g_A, index, &A_ptr, ldA);
1259
pnga_access_block_grid_ptr(g_B, index, &B_ptr, ldB);
1261
/* evaluate offsets for system */
1265
for (j=0; j<last; j++) {
1266
offset += (loA[j] - lo[j])*jtot;
1269
offset += (loA[last]-lo[last])*jtot;
1271
/* offset pointers by correct amount */
1274
A_ptr = (void*)((int*)(A_ptr) + offset);
1275
B_ptr = (void*)((int*)(B_ptr) + offset);
1278
A_ptr = (void*)((DoubleComplex*)(A_ptr) + offset);
1279
B_ptr = (void*)((DoubleComplex*)(B_ptr) + offset);
1282
A_ptr = (void*)((SingleComplex*)(A_ptr) + offset);
1283
B_ptr = (void*)((SingleComplex*)(B_ptr) + offset);
1286
A_ptr = (void*)((double*)(A_ptr) + offset);
1287
B_ptr = (void*)((double*)(B_ptr) + offset);
1290
A_ptr = (void*)((float*)(A_ptr) + offset);
1291
B_ptr = (void*)((float*)(B_ptr) + offset);
1294
A_ptr = (void*)((long*)(A_ptr) + offset);
1295
B_ptr = (void*)((long*)(B_ptr) + offset);
1298
A_ptr = (void*)((long long*)(A_ptr) + offset);
1299
B_ptr = (void*)((long long*)(B_ptr) + offset);
1302
snga_dot_local_patch(atype, andim, loA, hiA, ldA, A_ptr, B_ptr,
1304
/* release access to the data */
1305
pnga_release_block_grid(g_A, index);
1306
pnga_release_block_grid(g_B, index);
1309
/* increment index to get next block on processor */
1310
index[0] += topology[0];
1311
for (i = 0; i < andim; i++) {
1312
if (index[i] >= blocks[i] && i<andim-1) {
1313
index[i] = proc_index[i];
1314
index[i+1] += topology[i+1];
1322
/*convert from C data type to ARMCI type */
1324
case C_FLOAT: ctype=ARMCI_FLOAT; break;
1325
case C_DBL: ctype=ARMCI_DOUBLE; break;
1326
case C_INT: ctype=ARMCI_INT; break;
1327
case C_LONG: ctype=ARMCI_LONG; break;
1328
case C_LONGLONG: ctype=ARMCI_LONG_LONG; break;
1329
case C_DCPL: ctype=ARMCI_DOUBLE; break;
1330
case C_SCPL: ctype=ARMCI_FLOAT; break;
1331
default: pnga_error("pnga_dot_patch: type not supported",atype);
1334
if (pnga_is_mirrored(g_a) && pnga_is_mirrored(g_b)) {
1335
armci_msg_gop_scope(SCOPE_NODE,retval,alen,"+",ctype);
1338
armci_msg_gop_scope(SCOPE_ALL,retval,alen,"+",ctype);
1341
armci_msg_group_gop_scope(SCOPE_ALL,retval,alen,"+",ctype,
1342
ga_get_armci_group_((int)a_grp));
1347
if(temp_created) pnga_destroy(g_B);
1352
/*****************************************************************************
1353
* ngai_Xdot_patch routines
1354
****************************************************************************/
1358
* Set all values in patch to value stored in *val
1360
static void snga_set_patch_value(
1361
Integer type, Integer ndim, Integer *loA, Integer *hiA,
1362
Integer *ld, void *data_ptr, void *val)
1364
Integer n1dim, i, j, idx;
1365
Integer bvalue[MAXDIM], bunit[MAXDIM], baseld[MAXDIM];
1366
/* number of n-element of the first dimension */
1367
n1dim = 1; for(i=1; i<ndim; i++) n1dim *= (hiA[i] - loA[i] + 1);
1369
/* calculate the destination indices */
1370
bvalue[0] = 0; bvalue[1] = 0; bunit[0] = 1; bunit[1] = 1;
1371
/* baseld[0] = ld[0]
1372
* baseld[1] = ld[0] * ld[1]
1373
* baseld[2] = ld[0] * ld[1] * ld[2] .....
1375
baseld[0] = ld[0]; baseld[1] = baseld[0] *ld[1];
1376
for(i=2; i<ndim; i++) {
1378
bunit[i] = bunit[i-1] * (hiA[i-1] - loA[i-1] + 1);
1379
baseld[i] = baseld[i-1] * ld[i];
1384
for(i=0; i<n1dim; i++) {
1386
for(j=1; j<ndim; j++) {
1387
idx += bvalue[j] * baseld[j-1];
1388
if(((i+1) % bunit[j]) == 0) bvalue[j]++;
1389
if(bvalue[j] > (hiA[j]-loA[j])) bvalue[j] = 0;
1391
for(j=0; j<(hiA[0]-loA[0]+1); j++)
1392
((int *)data_ptr)[idx+j] = *(int*)val;
1396
for(i=0; i<n1dim; i++) {
1398
for(j=1; j<ndim; j++) {
1399
idx += bvalue[j] * baseld[j-1];
1400
if(((i+1) % bunit[j]) == 0) bvalue[j]++;
1401
if(bvalue[j] > (hiA[j]-loA[j])) bvalue[j] = 0;
1404
for(j=0; j<(hiA[0]-loA[0]+1); j++) {
1405
DoubleComplex tmp = *(DoubleComplex *)val;
1406
((DoubleComplex *)data_ptr)[idx+j].real = tmp.real;
1407
((DoubleComplex *)data_ptr)[idx+j].imag = tmp.imag;
1412
for(i=0; i<n1dim; i++) {
1414
for(j=1; j<ndim; j++) {
1415
idx += bvalue[j] * baseld[j-1];
1416
if(((i+1) % bunit[j]) == 0) bvalue[j]++;
1417
if(bvalue[j] > (hiA[j]-loA[j])) bvalue[j] = 0;
1420
for(j=0; j<(hiA[0]-loA[0]+1); j++) {
1421
SingleComplex tmp = *(SingleComplex *)val;
1422
((SingleComplex *)data_ptr)[idx+j].real = tmp.real;
1423
((SingleComplex *)data_ptr)[idx+j].imag = tmp.imag;
1428
for(i=0; i<n1dim; i++) {
1430
for(j=1; j<ndim; j++) {
1431
idx += bvalue[j] * baseld[j-1];
1432
if(((i+1) % bunit[j]) == 0) bvalue[j]++;
1433
if(bvalue[j] > (hiA[j]-loA[j])) bvalue[j] = 0;
1436
for(j=0; j<(hiA[0]-loA[0]+1); j++)
1437
((double*)data_ptr)[idx+j] =
1442
for(i=0; i<n1dim; i++) {
1444
for(j=1; j<ndim; j++) {
1445
idx += bvalue[j] * baseld[j-1];
1446
if(((i+1) % bunit[j]) == 0) bvalue[j]++;
1447
if(bvalue[j] > (hiA[j]-loA[j])) bvalue[j] = 0;
1449
for(j=0; j<(hiA[0]-loA[0]+1); j++)
1450
((float *)data_ptr)[idx+j] = *(float*)val;
1454
for(i=0; i<n1dim; i++) {
1456
for(j=1; j<ndim; j++) {
1457
idx += bvalue[j] * baseld[j-1];
1458
if(((i+1) % bunit[j]) == 0) bvalue[j]++;
1459
if(bvalue[j] > (hiA[j]-loA[j])) bvalue[j] = 0;
1461
for(j=0; j<(hiA[0]-loA[0]+1); j++)
1462
((long *)data_ptr)[idx+j] = *(long*)val;
1466
for(i=0; i<n1dim; i++) {
1468
for(j=1; j<ndim; j++) {
1469
idx += bvalue[j] * baseld[j-1];
1470
if(((i+1) % bunit[j]) == 0) bvalue[j]++;
1471
if(bvalue[j] > (hiA[j]-loA[j])) bvalue[j] = 0;
1473
for(j=0; j<(hiA[0]-loA[0]+1); j++)
1474
((long long*)data_ptr)[idx+j] = *(long long*)val;
1477
default: pnga_error(" wrong data type ",type);
1483
/*\ FILL IN ARRAY WITH VALUE
1485
#if HAVE_SYS_WEAK_ALIAS_PRAGMA
1486
# pragma weak wnga_fill_patch = pnga_fill_patch
1488
void pnga_fill_patch(Integer g_a, Integer *lo, Integer *hi, void* val)
1491
Integer ndim, dims[MAXDIM], type;
1492
Integer loA[MAXDIM], hiA[MAXDIM], ld[MAXDIM];
1494
Integer num_blocks, nproc;
1495
Integer me= pnga_nodeid();
1496
int local_sync_begin,local_sync_end;
1498
local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
1499
_ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
1500
if(local_sync_begin)pnga_sync();
1502
GA_PUSH_NAME("nga_fill_patch");
1504
pnga_inquire(g_a, &type, &ndim, dims);
1505
num_blocks = pnga_total_blocks(g_a);
1507
if (num_blocks < 0) {
1508
/* get limits of VISIBLE patch */
1509
pnga_distribution(g_a, me, loA, hiA);
1511
/* determine subset of my local patch to access */
1512
/* Output is in loA and hiA */
1513
if(pnga_patch_intersect(lo, hi, loA, hiA, ndim)){
1515
/* get data_ptr to corner of patch */
1516
/* ld are leading dimensions INCLUDING ghost cells */
1517
pnga_access_ptr(g_a, loA, hiA, &data_ptr, ld);
1519
/* set all values in patch to *val */
1520
snga_set_patch_value(type, ndim, loA, hiA, ld, data_ptr, val);
1522
/* release access to the data */
1523
pnga_release_update(g_a, loA, hiA);
1526
Integer offset, j, jtmp, chk;
1527
Integer loS[MAXDIM];
1528
nproc = pnga_nnodes();
1529
/* using simple block-cyclic data distribution */
1530
if (!pnga_uses_proc_grid(g_a)){
1531
for (i=me; i<num_blocks; i += nproc) {
1532
/* get limits of patch */
1533
pnga_distribution(g_a, i, loA, hiA);
1535
/* loA is changed by pnga_patch_intersect, so
1537
for (j=0; j<ndim; j++) {
1541
/* determine subset of my local patch to access */
1542
/* Output is in loA and hiA */
1543
if(pnga_patch_intersect(lo, hi, loA, hiA, ndim)){
1545
/* get data_ptr to corner of patch */
1546
/* ld are leading dimensions for block */
1547
pnga_access_block_ptr(g_a, i, &data_ptr, ld);
1549
/* Check for partial overlap */
1551
for (j=0; j<ndim; j++) {
1552
if (loS[j] < loA[j]) {
1558
/* Evaluate additional offset for pointer */
1561
for (j=0; j<ndim-1; j++) {
1562
offset += (loA[j]-loS[j])*jtmp;
1565
offset += (loA[ndim-1]-loS[ndim-1])*jtmp;
1568
data_ptr = (void*)((int*)data_ptr + offset);
1571
data_ptr = (void*)((double*)data_ptr + 2*offset);
1574
data_ptr = (void*)((float*)data_ptr + 2*offset);
1577
data_ptr = (void*)((double*)data_ptr + offset);
1580
data_ptr = (void*)((float*)data_ptr + offset);
1583
data_ptr = (void*)((long*)data_ptr + offset);
1586
data_ptr = (void*)((long long*)data_ptr + offset);
1588
default: pnga_error(" wrong data type ",type);
1592
/* set all values in patch to *val */
1593
snga_set_patch_value(type, ndim, loA, hiA, ld, data_ptr, val);
1595
/* release access to the data */
1596
pnga_release_update_block(g_a, i);
1600
/* using scalapack block-cyclic data distribution */
1601
Integer proc_index[MAXDIM], index[MAXDIM];
1602
Integer topology[MAXDIM];
1603
Integer blocks[MAXDIM], block_dims[MAXDIM];
1604
pnga_get_proc_index(g_a, me, proc_index);
1605
pnga_get_proc_index(g_a, me, index);
1606
pnga_get_block_info(g_a, blocks, block_dims);
1607
pnga_get_proc_grid(g_a, topology);
1608
while (index[ndim-1] < blocks[ndim-1]) {
1609
/* find bounding coordinates of block */
1611
for (i = 0; i < ndim; i++) {
1612
loA[i] = index[i]*block_dims[i]+1;
1613
hiA[i] = (index[i] + 1)*block_dims[i];
1614
if (hiA[i] > dims[i]) hiA[i] = dims[i];
1615
if (hiA[i] < loA[i]) chk = 0;
1618
/* loA is changed by pnga_patch_intersect, so
1620
for (j=0; j<ndim; j++) {
1624
/* determine subset of my local patch to access */
1625
/* Output is in loA and hiA */
1626
if(pnga_patch_intersect(lo, hi, loA, hiA, ndim)){
1628
/* get data_ptr to corner of patch */
1629
/* ld are leading dimensions for block */
1630
pnga_access_block_grid_ptr(g_a, index, &data_ptr, ld);
1632
/* Check for partial overlap */
1634
for (j=0; j<ndim; j++) {
1635
if (loS[j] < loA[j]) {
1641
/* Evaluate additional offset for pointer */
1644
for (j=0; j<ndim-1; j++) {
1645
offset += (loA[j]-loS[j])*jtmp;
1648
offset += (loA[ndim-1]-loS[ndim-1])*jtmp;
1651
data_ptr = (void*)((int*)data_ptr + offset);
1654
data_ptr = (void*)((double*)data_ptr + 2*offset);
1657
data_ptr = (void*)((float*)data_ptr + 2*offset);
1660
data_ptr = (void*)((double*)data_ptr + offset);
1663
data_ptr = (void*)((float*)data_ptr + offset);
1666
data_ptr = (void*)((long*)data_ptr + offset);
1669
data_ptr = (void*)((long long*)data_ptr + offset);
1671
default: pnga_error(" wrong data type ",type);
1675
/* set all values in patch to *val */
1676
snga_set_patch_value(type, ndim, loA, hiA, ld, data_ptr, val);
1678
/* release access to the data */
1679
pnga_release_update_block_grid(g_a, index);
1681
/* increment index to get next block on processor */
1682
index[0] += topology[0];
1683
for (i = 0; i < ndim; i++) {
1684
if (index[i] >= blocks[i] && i<ndim-1) {
1685
index[i] = proc_index[i];
1686
index[i+1] += topology[i+1];
1693
if(local_sync_end)pnga_sync();
1697
static void snga_scale_patch_value(Integer type, Integer ndim, Integer *loA, Integer *hiA,
1698
Integer *ld, void *src_data_ptr, void *alpha)
1700
Integer n1dim, i, j, idx;
1701
Integer bvalue[MAXDIM], bunit[MAXDIM], baseld[MAXDIM];
1702
DoublePrecision tmp1_real, tmp1_imag, tmp2_real, tmp2_imag;
1703
float ftmp1_real, ftmp1_imag, ftmp2_real, ftmp2_imag;
1704
/* number of n-element of the first dimension */
1705
n1dim = 1; for(i=1; i<ndim; i++) n1dim *= (hiA[i] - loA[i] + 1);
1707
/* calculate the destination indices */
1708
bvalue[0] = 0; bvalue[1] = 0; bunit[0] = 1; bunit[1] = 1;
1709
/* baseld[0] = ld[0]
1710
* baseld[1] = ld[0] * ld[1]
1711
* baseld[2] = ld[0] * ld[1] * ld[2] .....
1713
baseld[0] = ld[0]; baseld[1] = baseld[0] *ld[1];
1714
for(i=2; i<ndim; i++) {
1716
bunit[i] = bunit[i-1] * (hiA[i-1] - loA[i-1] + 1);
1717
baseld[i] = baseld[i-1] * ld[i];
1720
/* scale local part of g_a */
1723
for(i=0; i<n1dim; i++) {
1725
for(j=1; j<ndim; j++) {
1726
idx += bvalue[j] * baseld[j-1];
1727
if(((i+1) % bunit[j]) == 0) bvalue[j]++;
1728
if(bvalue[j] > (hiA[j]-loA[j])) bvalue[j] = 0;
1731
for(j=0; j<(hiA[0]-loA[0]+1); j++)
1732
((double*)src_data_ptr)[idx+j] *=
1737
for(i=0; i<n1dim; i++) {
1739
for(j=1; j<ndim; j++) {
1740
idx += bvalue[j] * baseld[j-1];
1741
if(((i+1) % bunit[j]) == 0) bvalue[j]++;
1742
if(bvalue[j] > (hiA[j]-loA[j])) bvalue[j] = 0;
1745
for(j=0; j<(hiA[0]-loA[0]+1); j++) {
1746
tmp1_real =((DoubleComplex *)src_data_ptr)[idx+j].real;
1747
tmp1_imag =((DoubleComplex *)src_data_ptr)[idx+j].imag;
1748
tmp2_real = (*(DoubleComplex*)alpha).real;
1749
tmp2_imag = (*(DoubleComplex*)alpha).imag;
1751
((DoubleComplex *)src_data_ptr)[idx+j].real =
1752
tmp1_real*tmp2_real - tmp1_imag * tmp2_imag;
1753
((DoubleComplex *)src_data_ptr)[idx+j].imag =
1754
tmp2_imag*tmp1_real + tmp1_imag * tmp2_real;
1759
for(i=0; i<n1dim; i++) {
1761
for(j=1; j<ndim; j++) {
1762
idx += bvalue[j] * baseld[j-1];
1763
if(((i+1) % bunit[j]) == 0) bvalue[j]++;
1764
if(bvalue[j] > (hiA[j]-loA[j])) bvalue[j] = 0;
1767
for(j=0; j<(hiA[0]-loA[0]+1); j++) {
1768
ftmp1_real =((SingleComplex *)src_data_ptr)[idx+j].real;
1769
ftmp1_imag =((SingleComplex *)src_data_ptr)[idx+j].imag;
1770
ftmp2_real = (*(SingleComplex*)alpha).real;
1771
ftmp2_imag = (*(SingleComplex*)alpha).imag;
1773
((SingleComplex *)src_data_ptr)[idx+j].real =
1774
ftmp1_real*ftmp2_real - ftmp1_imag * ftmp2_imag;
1775
((SingleComplex *)src_data_ptr)[idx+j].imag =
1776
ftmp2_imag*ftmp1_real + ftmp1_imag * ftmp2_real;
1781
for(i=0; i<n1dim; i++) {
1783
for(j=1; j<ndim; j++) {
1784
idx += bvalue[j] * baseld[j-1];
1785
if(((i+1) % bunit[j]) == 0) bvalue[j]++;
1786
if(bvalue[j] > (hiA[j]-loA[j])) bvalue[j] = 0;
1789
for(j=0; j<(hiA[0]-loA[0]+1); j++)
1790
((int*)src_data_ptr)[idx+j] *= *(int*)alpha;
1794
for(i=0; i<n1dim; i++) {
1796
for(j=1; j<ndim; j++) {
1797
idx += bvalue[j] * baseld[j-1];
1798
if(((i+1) % bunit[j]) == 0) bvalue[j]++;
1799
if(bvalue[j] > (hiA[j]-loA[j])) bvalue[j] = 0;
1802
for(j=0; j<(hiA[0]-loA[0]+1); j++)
1803
((long *)src_data_ptr)[idx+j] *= *(long*)alpha;
1807
for(i=0; i<n1dim; i++) {
1809
for(j=1; j<ndim; j++) {
1810
idx += bvalue[j] * baseld[j-1];
1811
if(((i+1) % bunit[j]) == 0) bvalue[j]++;
1812
if(bvalue[j] > (hiA[j]-loA[j])) bvalue[j] = 0;
1815
for(j=0; j<(hiA[0]-loA[0]+1); j++)
1816
((long long*)src_data_ptr)[idx+j] *= *(long long*)alpha;
1820
for(i=0; i<n1dim; i++) {
1822
for(j=1; j<ndim; j++) {
1823
idx += bvalue[j] * baseld[j-1];
1824
if(((i+1) % bunit[j]) == 0) bvalue[j]++;
1825
if(bvalue[j] > (hiA[j]-loA[j])) bvalue[j] = 0;
1828
for(j=0; j<(hiA[0]-loA[0]+1); j++)
1829
((float *)src_data_ptr)[idx+j] *= *(float*)alpha;
1832
default: pnga_error(" wrong data type ",type);
1839
#if HAVE_SYS_WEAK_ALIAS_PRAGMA
1840
# pragma weak wnga_scale_patch = pnga_scale_patch
1842
void pnga_scale_patch(Integer g_a, Integer *lo, Integer *hi, void *alpha)
1844
Integer ndim, dims[MAXDIM], type;
1845
Integer loA[MAXDIM], hiA[MAXDIM];
1848
Integer num_blocks, nproc;
1849
Integer me= pnga_nodeid();
1850
int local_sync_begin,local_sync_end;
1852
local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
1853
_ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
1854
if(local_sync_begin)pnga_sync();
1856
GA_PUSH_NAME("pnga_scale_patch");
1858
pnga_inquire(g_a, &type, &ndim, dims);
1859
num_blocks = pnga_total_blocks(g_a);
1861
if (num_blocks < 0) {
1862
pnga_distribution(g_a, me, loA, hiA);
1864
/* determine subset of my patch to access */
1865
if (pnga_patch_intersect(lo, hi, loA, hiA, ndim)){
1866
pnga_access_ptr(g_a, loA, hiA, &src_data_ptr, ld);
1868
snga_scale_patch_value(type, ndim, loA, hiA, ld, src_data_ptr, alpha);
1870
/* release access to the data */
1871
pnga_release_update(g_a, loA, hiA);
1874
Integer offset, i, j, jtmp, chk;
1875
Integer loS[MAXDIM];
1876
nproc = pnga_nnodes();
1877
/* using simple block-cyclic data distribution */
1878
if (!pnga_uses_proc_grid(g_a)){
1879
for (i=me; i<num_blocks; i += nproc) {
1880
/* get limits of VISIBLE patch */
1881
pnga_distribution(g_a, i, loA, hiA);
1883
/* loA is changed by pnga_patch_intersect, so
1885
for (j=0; j<ndim; j++) {
1889
/* determine subset of my local patch to access */
1890
/* Output is in loA and hiA */
1891
if(pnga_patch_intersect(lo, hi, loA, hiA, ndim)){
1893
/* get src_data_ptr to corner of patch */
1894
/* ld are leading dimensions INCLUDING ghost cells */
1895
pnga_access_block_ptr(g_a, i, &src_data_ptr, ld);
1897
/* Check for partial overlap */
1899
for (j=0; j<ndim; j++) {
1900
if (loS[j] < loA[j]) {
1906
/* Evaluate additional offset for pointer */
1909
for (j=0; j<ndim-1; j++) {
1910
offset += (loA[j]-loS[j])*jtmp;
1913
offset += (loA[ndim-1]-loS[ndim-1])*jtmp;
1916
src_data_ptr = (void*)((int*)src_data_ptr + offset);
1919
src_data_ptr = (void*)((double*)src_data_ptr + 2*offset);
1922
src_data_ptr = (void*)((float*)src_data_ptr + 2*offset);
1925
src_data_ptr = (void*)((double*)src_data_ptr + offset);
1928
src_data_ptr = (void*)((float*)src_data_ptr + offset);
1931
src_data_ptr = (void*)((long*)src_data_ptr + offset);
1934
src_data_ptr = (void*)((long long*)src_data_ptr + offset);
1936
default: pnga_error(" wrong data type ",type);
1940
/* set all values in patch to *val */
1941
snga_scale_patch_value(type, ndim, loA, hiA, ld, src_data_ptr, alpha);
1943
/* release access to the data */
1944
pnga_release_update_block(g_a, i);
1948
/* using scalapack block-cyclic data distribution */
1949
Integer proc_index[MAXDIM], index[MAXDIM];
1950
Integer topology[MAXDIM];
1951
Integer blocks[MAXDIM], block_dims[MAXDIM];
1952
pnga_get_proc_index(g_a, me, proc_index);
1953
pnga_get_proc_index(g_a, me, index);
1954
pnga_get_block_info(g_a, blocks, block_dims);
1955
pnga_get_proc_grid(g_a, topology);
1956
while (index[ndim-1] < blocks[ndim-1]) {
1957
/* find bounding coordinates of block */
1959
for (i = 0; i < ndim; i++) {
1960
loA[i] = index[i]*block_dims[i]+1;
1961
hiA[i] = (index[i] + 1)*block_dims[i];
1962
if (hiA[i] > dims[i]) hiA[i] = dims[i];
1963
if (hiA[i] < loA[i]) chk = 0;
1966
/* loA is changed by pnga_patch_intersect, so
1968
for (j=0; j<ndim; j++) {
1972
/* determine subset of my local patch to access */
1973
/* Output is in loA and hiA */
1974
if(pnga_patch_intersect(lo, hi, loA, hiA, ndim)){
1976
/* get data_ptr to corner of patch */
1977
/* ld are leading dimensions for block */
1978
pnga_access_block_grid_ptr(g_a, index, &src_data_ptr, ld);
1980
/* Check for partial overlap */
1982
for (j=0; j<ndim; j++) {
1983
if (loS[j] < loA[j]) {
1989
/* Evaluate additional offset for pointer */
1992
for (j=0; j<ndim-1; j++) {
1993
offset += (loA[j]-loS[j])*jtmp;
1996
offset += (loA[ndim-1]-loS[ndim-1])*jtmp;
1999
src_data_ptr = (void*)((int*)src_data_ptr + offset);
2002
src_data_ptr = (void*)((double*)src_data_ptr + 2*offset);
2005
src_data_ptr = (void*)((float*)src_data_ptr + 2*offset);
2008
src_data_ptr = (void*)((double*)src_data_ptr + offset);
2011
src_data_ptr = (void*)((float*)src_data_ptr + offset);
2014
src_data_ptr = (void*)((long*)src_data_ptr + offset);
2017
src_data_ptr = (void*)((long long*)src_data_ptr + offset);
2019
default: pnga_error(" wrong data type ",type);
2023
/* set all values in patch to *val */
2024
snga_scale_patch_value(type, ndim, loA, hiA, ld, src_data_ptr, alpha);
2026
/* release access to the data */
2027
pnga_release_update_block_grid(g_a, index);
2029
/* increment index to get next block on processor */
2030
index[0] += topology[0];
2031
for (i = 0; i < ndim; i++) {
2032
if (index[i] >= blocks[i] && i<ndim-1) {
2033
index[i] = proc_index[i];
2034
index[i+1] += topology[i+1];
2041
if(local_sync_end)pnga_sync();
2044
/*\ Utility function to accumulate patch values C = C + alpha*A
2046
static void snga_acc_patch_values(Integer type, void* alpha, Integer ndim,
2047
Integer *loC, Integer *hiC, Integer *ldC,
2048
void *A_ptr, void *C_ptr)
2050
Integer bvalue[MAXDIM], bunit[MAXDIM], baseldC[MAXDIM];
2053
/* compute "local" add */
2055
/* number of n-element of the first dimension */
2056
n1dim = 1; for(i=1; i<ndim; i++) n1dim *= (hiC[i] - loC[i] + 1);
2058
/* calculate the destination indices */
2059
bvalue[0] = 0; bvalue[1] = 0; bunit[0] = 1; bunit[1] = 1;
2060
/* baseld[0] = ld[0]
2061
* baseld[1] = ld[0] * ld[1]
2062
* baseld[2] = ld[0] * ld[1] * ld[2] .....
2064
baseldC[0] = ldC[0]; baseldC[1] = baseldC[0] *ldC[1];
2065
for(i=2; i<ndim; i++) {
2067
bunit[i] = bunit[i-1] * (hiC[i-1] - loC[i-1] + 1);
2068
baseldC[i] = baseldC[i-1] * ldC[i];
2073
for(i=0; i<n1dim; i++) {
2075
for(j=1; j<ndim; j++) {
2076
idx += bvalue[j] * baseldC[j-1];
2077
if(((i+1) % bunit[j]) == 0) bvalue[j]++;
2078
if(bvalue[j] > (hiC[j]-loC[j])) bvalue[j] = 0;
2081
for(j=0; j<(hiC[0]-loC[0]+1); j++)
2082
((double*)C_ptr)[idx+j] = ((double*)C_ptr)[idx+j]
2083
+ *(double*)alpha * ((double*)A_ptr)[idx+j];
2087
for(i=0; i<n1dim; i++) {
2089
for(j=1; j<ndim; j++) {
2090
idx += bvalue[j] * baseldC[j-1];
2091
if(((i+1) % bunit[j]) == 0) bvalue[j]++;
2092
if(bvalue[j] > (hiC[j]-loC[j])) bvalue[j] = 0;
2095
for(j=0; j<(hiC[0]-loC[0]+1); j++) {
2096
DoubleComplex a = ((DoubleComplex *)A_ptr)[idx+j];
2097
DoubleComplex x= *(DoubleComplex*)alpha;
2098
((DoubleComplex *)C_ptr)[idx+j].real =
2099
((DoubleComplex *)C_ptr)[idx+j].real
2100
+ x.real*a.real - x.imag*a.imag;
2101
((DoubleComplex *)C_ptr)[idx+j].imag =
2102
((DoubleComplex *)C_ptr)[idx+j].imag
2103
+ x.real*a.imag + x.imag*a.real;
2108
for(i=0; i<n1dim; i++) {
2110
for(j=1; j<ndim; j++) {
2111
idx += bvalue[j] * baseldC[j-1];
2112
if(((i+1) % bunit[j]) == 0) bvalue[j]++;
2113
if(bvalue[j] > (hiC[j]-loC[j])) bvalue[j] = 0;
2116
for(j=0; j<(hiC[0]-loC[0]+1); j++) {
2117
SingleComplex a = ((SingleComplex *)A_ptr)[idx+j];
2118
SingleComplex x= *(SingleComplex*)alpha;
2119
((SingleComplex *)C_ptr)[idx+j].real =
2120
((SingleComplex *)C_ptr)[idx+j].real
2121
+ x.real*a.real - x.imag*a.imag;
2122
((SingleComplex *)C_ptr)[idx+j].imag =
2123
((SingleComplex *)C_ptr)[idx+j].imag
2124
+ x.real*a.imag + x.imag*a.real;
2129
for(i=0; i<n1dim; i++) {
2131
for(j=1; j<ndim; j++) {
2132
idx += bvalue[j] * baseldC[j-1];
2133
if(((i+1) % bunit[j]) == 0) bvalue[j]++;
2134
if(bvalue[j] > (hiC[j]-loC[j])) bvalue[j] = 0;
2137
for(j=0; j<(hiC[0]-loC[0]+1); j++)
2138
((int*)C_ptr)[idx+j] = ((int*)C_ptr)[idx+j]
2139
+ *(int *)alpha * ((int*)A_ptr)[idx+j];
2143
for(i=0; i<n1dim; i++) {
2145
for(j=1; j<ndim; j++) {
2146
idx += bvalue[j] * baseldC[j-1];
2147
if(((i+1) % bunit[j]) == 0) bvalue[j]++;
2148
if(bvalue[j] > (hiC[j]-loC[j])) bvalue[j] = 0;
2151
for(j=0; j<(hiC[0]-loC[0]+1); j++)
2152
((float *)C_ptr)[idx+j] = ((float *)C_ptr)[idx+j]
2153
+ *(float *)alpha * ((float *)A_ptr)[idx+j];
2157
for(i=0; i<n1dim; i++) {
2159
for(j=1; j<ndim; j++) {
2160
idx += bvalue[j] * baseldC[j-1];
2161
if(((i+1) % bunit[j]) == 0) bvalue[j]++;
2162
if(bvalue[j] > (hiC[j]-loC[j])) bvalue[j] = 0;
2165
for(j=0; j<(hiC[0]-loC[0]+1); j++)
2166
((long *)C_ptr)[idx+j] = ((long *)C_ptr)[idx+j]
2167
+ *(long *)alpha * ((long *)A_ptr)[idx+j];
2171
for(i=0; i<n1dim; i++) {
2173
for(j=1; j<ndim; j++) {
2174
idx += bvalue[j] * baseldC[j-1];
2175
if(((i+1) % bunit[j]) == 0) bvalue[j]++;
2176
if(bvalue[j] > (hiC[j]-loC[j])) bvalue[j] = 0;
2179
for(j=0; j<(hiC[0]-loC[0]+1); j++)
2180
((long long*)C_ptr)[idx+j] = ((long long*)C_ptr)[idx+j]
2181
+ *(long long*)alpha * ((long long*)A_ptr)[idx+j];
2184
default: pnga_error(" wrong data type ",type);
2188
/*\ Utility function to add patch values together
2190
static void snga_add_patch_values(Integer type, void* alpha, void *beta,
2191
Integer ndim, Integer *loC, Integer *hiC, Integer *ldC,
2192
void *A_ptr, void *B_ptr, void *C_ptr)
2194
Integer bvalue[MAXDIM], bunit[MAXDIM], baseldC[MAXDIM];
2197
/* compute "local" add */
2199
/* number of n-element of the first dimension */
2200
n1dim = 1; for(i=1; i<ndim; i++) n1dim *= (hiC[i] - loC[i] + 1);
2202
/* calculate the destination indices */
2203
bvalue[0] = 0; bvalue[1] = 0; bunit[0] = 1; bunit[1] = 1;
2204
/* baseld[0] = ld[0]
2205
* baseld[1] = ld[0] * ld[1]
2206
* baseld[2] = ld[0] * ld[1] * ld[2] .....
2208
baseldC[0] = ldC[0]; baseldC[1] = baseldC[0] *ldC[1];
2209
for(i=2; i<ndim; i++) {
2211
bunit[i] = bunit[i-1] * (hiC[i-1] - loC[i-1] + 1);
2212
baseldC[i] = baseldC[i-1] * ldC[i];
2217
for(i=0; i<n1dim; i++) {
2219
for(j=1; j<ndim; j++) {
2220
idx += bvalue[j] * baseldC[j-1];
2221
if(((i+1) % bunit[j]) == 0) bvalue[j]++;
2222
if(bvalue[j] > (hiC[j]-loC[j])) bvalue[j] = 0;
2225
for(j=0; j<(hiC[0]-loC[0]+1); j++)
2226
((double*)C_ptr)[idx+j] =
2228
((double*)A_ptr)[idx+j] +
2230
((double*)B_ptr)[idx+j];
2234
for(i=0; i<n1dim; i++) {
2236
for(j=1; j<ndim; j++) {
2237
idx += bvalue[j] * baseldC[j-1];
2238
if(((i+1) % bunit[j]) == 0) bvalue[j]++;
2239
if(bvalue[j] > (hiC[j]-loC[j])) bvalue[j] = 0;
2242
for(j=0; j<(hiC[0]-loC[0]+1); j++) {
2243
DoubleComplex a = ((DoubleComplex *)A_ptr)[idx+j];
2244
DoubleComplex b = ((DoubleComplex *)B_ptr)[idx+j];
2245
DoubleComplex x= *(DoubleComplex*)alpha;
2246
DoubleComplex y= *(DoubleComplex*)beta;
2247
((DoubleComplex *)C_ptr)[idx+j].real = x.real*a.real -
2248
x.imag*a.imag + y.real*b.real - y.imag*b.imag;
2249
((DoubleComplex *)C_ptr)[idx+j].imag = x.real*a.imag +
2250
x.imag*a.real + y.real*b.imag + y.imag*b.real;
2255
for(i=0; i<n1dim; i++) {
2257
for(j=1; j<ndim; j++) {
2258
idx += bvalue[j] * baseldC[j-1];
2259
if(((i+1) % bunit[j]) == 0) bvalue[j]++;
2260
if(bvalue[j] > (hiC[j]-loC[j])) bvalue[j] = 0;
2263
for(j=0; j<(hiC[0]-loC[0]+1); j++) {
2264
SingleComplex a = ((SingleComplex *)A_ptr)[idx+j];
2265
SingleComplex b = ((SingleComplex *)B_ptr)[idx+j];
2266
SingleComplex x= *(SingleComplex*)alpha;
2267
SingleComplex y= *(SingleComplex*)beta;
2268
((SingleComplex *)C_ptr)[idx+j].real = x.real*a.real -
2269
x.imag*a.imag + y.real*b.real - y.imag*b.imag;
2270
((SingleComplex *)C_ptr)[idx+j].imag = x.real*a.imag +
2271
x.imag*a.real + y.real*b.imag + y.imag*b.real;
2276
for(i=0; i<n1dim; i++) {
2278
for(j=1; j<ndim; j++) {
2279
idx += bvalue[j] * baseldC[j-1];
2280
if(((i+1) % bunit[j]) == 0) bvalue[j]++;
2281
if(bvalue[j] > (hiC[j]-loC[j])) bvalue[j] = 0;
2284
for(j=0; j<(hiC[0]-loC[0]+1); j++)
2285
((int*)C_ptr)[idx+j] = *(int *)alpha *
2286
((int*)A_ptr)[idx+j] + *(int*)beta *
2287
((int*)B_ptr)[idx+j];
2291
for(i=0; i<n1dim; i++) {
2293
for(j=1; j<ndim; j++) {
2294
idx += bvalue[j] * baseldC[j-1];
2295
if(((i+1) % bunit[j]) == 0) bvalue[j]++;
2296
if(bvalue[j] > (hiC[j]-loC[j])) bvalue[j] = 0;
2299
for(j=0; j<(hiC[0]-loC[0]+1); j++)
2300
((float *)C_ptr)[idx+j] = *(float *)alpha *
2301
((float *)A_ptr)[idx+j] + *(float *)beta *
2302
((float *)B_ptr)[idx+j];
2306
for(i=0; i<n1dim; i++) {
2308
for(j=1; j<ndim; j++) {
2309
idx += bvalue[j] * baseldC[j-1];
2310
if(((i+1) % bunit[j]) == 0) bvalue[j]++;
2311
if(bvalue[j] > (hiC[j]-loC[j])) bvalue[j] = 0;
2314
for(j=0; j<(hiC[0]-loC[0]+1); j++)
2315
((long *)C_ptr)[idx+j] = *(long *)alpha *
2316
((long *)A_ptr)[idx+j] + *(long *)beta *
2317
((long *)B_ptr)[idx+j];
2321
for(i=0; i<n1dim; i++) {
2323
for(j=1; j<ndim; j++) {
2324
idx += bvalue[j] * baseldC[j-1];
2325
if(((i+1) % bunit[j]) == 0) bvalue[j]++;
2326
if(bvalue[j] > (hiC[j]-loC[j])) bvalue[j] = 0;
2329
for(j=0; j<(hiC[0]-loC[0]+1); j++)
2330
((long long*)C_ptr)[idx+j] = *(long long*)alpha *
2331
((long long*)A_ptr)[idx+j] + *(long long*)beta *
2332
((long long*)B_ptr)[idx+j];
2335
default: pnga_error(" wrong data type ",type);
2340
/*\ SCALED ADDITION of two patches
2342
#if HAVE_SYS_WEAK_ALIAS_PRAGMA
2343
# pragma weak wnga_add_patch = pnga_add_patch
2345
void pnga_add_patch(alpha, g_a, alo, ahi, beta, g_b, blo, bhi,
2347
Integer g_a, *alo, *ahi; /* patch of g_a */
2348
Integer g_b, *blo, *bhi; /* patch of g_b */
2349
Integer g_c, *clo, *chi; /* patch of g_c */
2353
Integer compatible_a, compatible_b;
2354
Integer atype, btype, ctype;
2355
Integer andim, adims[MAXDIM], bndim, bdims[MAXDIM], cndim, cdims[MAXDIM];
2356
Integer loA[MAXDIM], hiA[MAXDIM], ldA[MAXDIM];
2357
Integer loB[MAXDIM], hiB[MAXDIM], ldB[MAXDIM];
2358
Integer loC[MAXDIM], hiC[MAXDIM], ldC[MAXDIM];
2359
void *A_ptr, *B_ptr, *C_ptr;
2361
Integer atotal, btotal;
2362
Integer g_A = g_a, g_B = g_b;
2363
Integer me= pnga_nodeid(), A_created=0, B_created=0;
2364
Integer nproc = pnga_nnodes();
2365
Integer num_blocks_a, num_blocks_b, num_blocks_c;
2366
char *tempname = "temp", notrans='n';
2367
int local_sync_begin,local_sync_end;
2369
local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
2370
_ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
2371
if(local_sync_begin)pnga_sync();
2373
GA_PUSH_NAME("nga_add_patch");
2375
pnga_inquire(g_a, &atype, &andim, adims);
2376
pnga_inquire(g_b, &btype, &bndim, bdims);
2377
pnga_inquire(g_c, &ctype, &cndim, cdims);
2379
if(atype != btype || atype != ctype ) pnga_error(" types mismatch ", 0L);
2381
/* check if patch indices and dims match */
2382
for(i=0; i<andim; i++)
2383
if(alo[i] <= 0 || ahi[i] > adims[i])
2384
pnga_error("g_a indices out of range ", g_a);
2385
for(i=0; i<bndim; i++)
2386
if(blo[i] <= 0 || bhi[i] > bdims[i])
2387
pnga_error("g_b indices out of range ", g_b);
2388
for(i=0; i<cndim; i++)
2389
if(clo[i] <= 0 || chi[i] > cdims[i])
2390
pnga_error("g_c indices out of range ", g_c);
2392
/* check if numbers of elements in patches match each other */
2393
n1dim = 1; for(i=0; i<cndim; i++) n1dim *= (chi[i] - clo[i] + 1);
2394
atotal = 1; for(i=0; i<andim; i++) atotal *= (ahi[i] - alo[i] + 1);
2395
btotal = 1; for(i=0; i<bndim; i++) btotal *= (bhi[i] - blo[i] + 1);
2397
if((atotal != n1dim) || (btotal != n1dim))
2398
pnga_error(" capacities of patches do not match ", 0L);
2400
num_blocks_a = pnga_total_blocks(g_a);
2401
num_blocks_b = pnga_total_blocks(g_b);
2402
num_blocks_c = pnga_total_blocks(g_c);
2404
if (num_blocks_a < 0 && num_blocks_b < 0 && num_blocks_c < 0) {
2405
/* find out coordinates of patches of g_a, g_b and g_c that I own */
2406
pnga_distribution( g_A, me, loA, hiA);
2407
pnga_distribution( g_B, me, loB, hiB);
2408
pnga_distribution( g_c, me, loC, hiC);
2410
/* test if the local portion of patches matches */
2411
if(pnga_comp_patch(andim, loA, hiA, cndim, loC, hiC) &&
2412
pnga_comp_patch(andim, alo, ahi, cndim, clo, chi)) compatible_a = 1;
2413
else compatible_a = 0;
2414
pnga_gop(pnga_type_f2c(MT_F_INT), &compatible_a, 1, "*");
2415
if(pnga_comp_patch(bndim, loB, hiB, cndim, loC, hiC) &&
2416
pnga_comp_patch(bndim, blo, bhi, cndim, clo, chi)) compatible_b = 1;
2417
else compatible_b = 0;
2418
pnga_gop(pnga_type_f2c(MT_F_INT), &compatible_b, 1, "*");
2419
if (compatible_a && compatible_b) {
2420
if(andim > bndim) cndim = bndim;
2421
if(andim < bndim) cndim = andim;
2423
if(!pnga_comp_patch(andim, loA, hiA, cndim, loC, hiC))
2424
pnga_error(" A patch mismatch ", g_A);
2425
if(!pnga_comp_patch(bndim, loB, hiB, cndim, loC, hiC))
2426
pnga_error(" B patch mismatch ", g_B);
2428
/* determine subsets of my patches to access */
2429
if (pnga_patch_intersect(clo, chi, loC, hiC, cndim)){
2430
pnga_access_ptr(g_A, loC, hiC, &A_ptr, ldA);
2431
pnga_access_ptr(g_B, loC, hiC, &B_ptr, ldB);
2432
pnga_access_ptr(g_c, loC, hiC, &C_ptr, ldC);
2434
snga_add_patch_values(atype, alpha, beta, cndim,
2435
loC, hiC, ldC, A_ptr, B_ptr, C_ptr);
2437
/* release access to the data */
2438
pnga_release (g_A, loC, hiC);
2439
pnga_release (g_B, loC, hiC);
2440
pnga_release_update(g_c, loC, hiC);
2442
} else if (!compatible_a && compatible_b) {
2443
/* either patches or distributions do not match:
2444
* - create a temp array that matches distribution of g_c
2448
pnga_copy_patch(¬rans, g_a, alo, ahi, g_c, clo, chi);
2451
pnga_distribution(g_A, me, loA, hiA);
2453
if (!pnga_duplicate(g_c, &g_A, tempname))
2454
pnga_error("ga_dadd_patch: dup failed", 0L);
2455
pnga_copy_patch(¬rans, g_a, alo, ahi, g_A, clo, chi);
2458
pnga_distribution(g_A, me, loA, hiA);
2460
if(andim > bndim) cndim = bndim;
2461
if(andim < bndim) cndim = andim;
2463
if(!pnga_comp_patch(andim, loA, hiA, cndim, loC, hiC))
2464
pnga_error(" A patch mismatch ", g_A);
2465
if(!pnga_comp_patch(bndim, loB, hiB, cndim, loC, hiC))
2466
pnga_error(" B patch mismatch ", g_B);
2468
/* determine subsets of my patches to access */
2469
if (pnga_patch_intersect(clo, chi, loC, hiC, cndim)){
2470
pnga_access_ptr(g_A, loC, hiC, &A_ptr, ldA);
2471
pnga_access_ptr(g_B, loC, hiC, &B_ptr, ldB);
2472
pnga_access_ptr(g_c, loC, hiC, &C_ptr, ldC);
2474
snga_add_patch_values(atype, alpha, beta, cndim,
2475
loC, hiC, ldC, A_ptr, B_ptr, C_ptr);
2477
/* release access to the data */
2478
pnga_release (g_A, loC, hiC);
2479
pnga_release (g_B, loC, hiC);
2480
pnga_release_update(g_c, loC, hiC);
2482
} else if (compatible_a && !compatible_b) {
2483
/* either patches or distributions do not match:
2484
* - create a temp array that matches distribution of g_c
2485
* - copy & reshape patch of g_b into g_B
2487
if (!pnga_duplicate(g_c, &g_B, tempname))
2488
pnga_error("ga_dadd_patch: dup failed", 0L);
2489
pnga_copy_patch(¬rans, g_b, blo, bhi, g_B, clo, chi);
2492
pnga_distribution(g_B, me, loB, hiB);
2494
if(andim > bndim) cndim = bndim;
2495
if(andim < bndim) cndim = andim;
2497
if(!pnga_comp_patch(andim, loA, hiA, cndim, loC, hiC))
2498
pnga_error(" A patch mismatch ", g_A);
2499
if(!pnga_comp_patch(bndim, loB, hiB, cndim, loC, hiC))
2500
pnga_error(" B patch mismatch ", g_B);
2502
/* determine subsets of my patches to access */
2503
if (pnga_patch_intersect(clo, chi, loC, hiC, cndim)){
2504
pnga_access_ptr(g_A, loC, hiC, &A_ptr, ldA);
2505
pnga_access_ptr(g_B, loC, hiC, &B_ptr, ldB);
2506
pnga_access_ptr(g_c, loC, hiC, &C_ptr, ldC);
2508
snga_add_patch_values(atype, alpha, beta, cndim,
2509
loC, hiC, ldC, A_ptr, B_ptr, C_ptr);
2511
/* release access to the data */
2512
pnga_release (g_A, loC, hiC);
2513
pnga_release (g_B, loC, hiC);
2514
pnga_release_update(g_c, loC, hiC);
2516
} else if (!compatible_a && !compatible_b) {
2517
/* there is no match between any of the global arrays */
2518
if (!pnga_duplicate(g_c, &g_B, tempname))
2519
pnga_error("ga_dadd_patch: dup failed", 0L);
2520
pnga_copy_patch(¬rans, g_b, blo, bhi, g_B, clo, chi);
2523
if(andim > bndim) cndim = bndim;
2524
if(andim < bndim) cndim = andim;
2526
pnga_copy_patch(¬rans, g_a, alo, ahi, g_c, clo, chi);
2527
pnga_scale_patch(g_c, clo, chi, alpha);
2528
/* determine subsets of my patches to access */
2529
if (pnga_patch_intersect(clo, chi, loC, hiC, cndim)){
2530
pnga_access_ptr(g_B, loC, hiC, &B_ptr, ldB);
2531
pnga_access_ptr(g_c, loC, hiC, &C_ptr, ldC);
2533
snga_acc_patch_values(atype, beta, cndim, loC, hiC, ldC,
2535
/* release access to the data */
2536
pnga_release (g_B, loC, hiC);
2537
pnga_release_update(g_c, loC, hiC);
2541
/* create copies of arrays A and B that are identically distributed
2543
if (!pnga_duplicate(g_c, &g_A, tempname))
2544
pnga_error("ga_dadd_patch: dup failed", 0L);
2545
pnga_copy_patch(¬rans, g_a, alo, ahi, g_A, clo, chi);
2549
if (!pnga_duplicate(g_c, &g_B, tempname))
2550
pnga_error("ga_dadd_patch: dup failed", 0L);
2551
pnga_copy_patch(¬rans, g_b, blo, bhi, g_B, clo, chi);
2555
/* C is normally distributed so just add copies together for regular
2557
if (num_blocks_c < 0) {
2558
pnga_distribution(g_c, me, loC, hiC);
2559
if(andim > bndim) cndim = bndim;
2560
if(andim < bndim) cndim = andim;
2561
if (pnga_patch_intersect(clo, chi, loC, hiC, cndim)){
2562
pnga_access_ptr(g_A, loC, hiC, &A_ptr, ldA);
2563
pnga_access_ptr(g_B, loC, hiC, &B_ptr, ldB);
2564
pnga_access_ptr(g_c, loC, hiC, &C_ptr, ldC);
2566
snga_add_patch_values(atype, alpha, beta, cndim,
2567
loC, hiC, ldC, A_ptr, B_ptr, C_ptr);
2569
/* release access to the data */
2570
pnga_release (g_A, loC, hiC);
2571
pnga_release (g_B, loC, hiC);
2572
pnga_release_update(g_c, loC, hiC);
2575
Integer idx, lod[MAXDIM]/*, hid[MAXDIM]*/;
2576
Integer offset, jtot, last;
2577
/* Simple block-cyclic data disribution */
2578
if (!pnga_uses_proc_grid(g_c)) {
2579
for (idx = me; idx < num_blocks_c; idx += nproc) {
2580
pnga_distribution(g_c, idx, loC, hiC);
2581
/* make temporary copies of loC and hiC since pnga_patch_intersect
2582
destroys original versions */
2583
for (j=0; j<cndim; j++) {
2585
/*hid[j] = hiC[j];*/
2587
if (pnga_patch_intersect(clo, chi, loC, hiC, cndim)) {
2588
pnga_access_block_ptr(g_A, idx, &A_ptr, ldA);
2589
pnga_access_block_ptr(g_B, idx, &B_ptr, ldB);
2590
pnga_access_block_ptr(g_c, idx, &C_ptr, ldC);
2592
/* evaluate offsets for system */
2596
for (j=0; j<last; j++) {
2597
offset += (loC[j] - lod[j])*jtot;
2600
offset += (loC[last]-lod[last])*jtot;
2604
A_ptr = (void*)((double*)(A_ptr) + offset);
2605
B_ptr = (void*)((double*)(B_ptr) + offset);
2606
C_ptr = (void*)((double*)(C_ptr) + offset);
2609
A_ptr = (void*)((int*)(A_ptr) + offset);
2610
B_ptr = (void*)((int*)(B_ptr) + offset);
2611
C_ptr = (void*)((int*)(C_ptr) + offset);
2614
A_ptr = (void*)((DoubleComplex*)(A_ptr) + offset);
2615
B_ptr = (void*)((DoubleComplex*)(B_ptr) + offset);
2616
C_ptr = (void*)((DoubleComplex*)(C_ptr) + offset);
2619
A_ptr = (void*)((SingleComplex*)(A_ptr) + offset);
2620
B_ptr = (void*)((SingleComplex*)(B_ptr) + offset);
2621
C_ptr = (void*)((SingleComplex*)(C_ptr) + offset);
2624
A_ptr = (void*)((float*)(A_ptr) + offset);
2625
B_ptr = (void*)((float*)(B_ptr) + offset);
2626
C_ptr = (void*)((float*)(C_ptr) + offset);
2629
A_ptr = (void*)((long*)(A_ptr) + offset);
2630
B_ptr = (void*)((long*)(B_ptr) + offset);
2631
C_ptr = (void*)((long*)(C_ptr) + offset);
2634
A_ptr = (void*)((long long*)(A_ptr) + offset);
2635
B_ptr = (void*)((long long*)(B_ptr) + offset);
2636
C_ptr = (void*)((long long*)(C_ptr) + offset);
2641
snga_add_patch_values(atype, alpha, beta, cndim,
2642
loC, hiC, ldC, A_ptr, B_ptr, C_ptr);
2644
/* release access to the data */
2645
pnga_release_block (g_A, idx);
2646
pnga_release_block (g_B, idx);
2647
pnga_release_update_block(g_c, idx);
2651
/* Uses scalapack block-cyclic data distribution */
2652
Integer lod[MAXDIM]/*, hid[MAXDIM], chk*/;
2653
Integer proc_index[MAXDIM], index[MAXDIM];
2654
Integer topology[MAXDIM];
2655
Integer blocks[MAXDIM], block_dims[MAXDIM];
2656
pnga_get_proc_index(g_c, me, proc_index);
2657
pnga_get_proc_index(g_c, me, index);
2658
pnga_get_block_info(g_c, blocks, block_dims);
2659
pnga_get_proc_grid(g_c, topology);
2660
while (index[cndim-1] < blocks[cndim-1]) {
2661
/* find bounding coordinates of block */
2663
for (i = 0; i < cndim; i++) {
2664
loC[i] = index[i]*block_dims[i]+1;
2665
hiC[i] = (index[i] + 1)*block_dims[i];
2666
if (hiC[i] > cdims[i]) hiC[i] = cdims[i];
2667
/*if (hiC[i] < loC[i]) chk = 0;*/
2669
/* make temporary copies of loC and hiC since pnga_patch_intersect
2670
destroys original versions */
2671
for (j=0; j<cndim; j++) {
2673
/*hid[j] = hiC[j];*/
2675
if (pnga_patch_intersect(clo, chi, loC, hiC, cndim)) {
2676
pnga_access_block_grid_ptr(g_A, index, &A_ptr, ldA);
2677
pnga_access_block_grid_ptr(g_B, index, &B_ptr, ldB);
2678
pnga_access_block_grid_ptr(g_c, index, &C_ptr, ldC);
2680
/* evaluate offsets for system */
2684
for (j=0; j<last; j++) {
2685
offset += (loC[j] - lod[j])*jtot;
2688
offset += (loC[last]-lod[last])*jtot;
2692
A_ptr = (void*)((double*)(A_ptr) + offset);
2693
B_ptr = (void*)((double*)(B_ptr) + offset);
2694
C_ptr = (void*)((double*)(C_ptr) + offset);
2697
A_ptr = (void*)((int*)(A_ptr) + offset);
2698
B_ptr = (void*)((int*)(B_ptr) + offset);
2699
C_ptr = (void*)((int*)(C_ptr) + offset);
2702
A_ptr = (void*)((DoubleComplex*)(A_ptr) + offset);
2703
B_ptr = (void*)((DoubleComplex*)(B_ptr) + offset);
2704
C_ptr = (void*)((DoubleComplex*)(C_ptr) + offset);
2707
A_ptr = (void*)((SingleComplex*)(A_ptr) + offset);
2708
B_ptr = (void*)((SingleComplex*)(B_ptr) + offset);
2709
C_ptr = (void*)((SingleComplex*)(C_ptr) + offset);
2712
A_ptr = (void*)((float*)(A_ptr) + offset);
2713
B_ptr = (void*)((float*)(B_ptr) + offset);
2714
C_ptr = (void*)((float*)(C_ptr) + offset);
2717
A_ptr = (void*)((long*)(A_ptr) + offset);
2718
B_ptr = (void*)((long*)(B_ptr) + offset);
2719
C_ptr = (void*)((long*)(C_ptr) + offset);
2722
A_ptr = (void*)((long long*)(A_ptr) + offset);
2723
B_ptr = (void*)((long long*)(B_ptr) + offset);
2724
C_ptr = (void*)((long long*)(C_ptr) + offset);
2729
snga_add_patch_values(atype, alpha, beta, cndim,
2730
loC, hiC, ldC, A_ptr, B_ptr, C_ptr);
2732
/* release access to the data */
2733
pnga_release_block_grid ( g_A, index);
2734
pnga_release_block_grid ( g_B, index);
2735
pnga_release_update_block_grid(g_c, index);
2738
/* increment index to get next block on processor */
2739
index[0] += topology[0];
2740
for (i = 0; i < cndim; i++) {
2741
if (index[i] >= blocks[i] && i<cndim-1) {
2742
index[i] = proc_index[i];
2743
index[i+1] += topology[i+1];
2751
if(A_created) pnga_destroy(g_A);
2752
if(B_created) pnga_destroy(g_B);
2755
if(local_sync_end)pnga_sync();
2759
#if HAVE_SYS_WEAK_ALIAS_PRAGMA
2760
# pragma weak wnga_zero_patch = pnga_zero_patch
2762
void pnga_zero_patch(Integer g_a, Integer *lo, Integer *hi)
2764
Integer ndim, dims[MAXDIM], type;
2770
SingleComplex cfval;
2772
void *valptr = NULL;
2773
int local_sync_begin,local_sync_end;
2775
local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
2776
_ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
2777
if(local_sync_begin)pnga_sync();
2779
GA_PUSH_NAME("nga_zero_patch");
2781
pnga_inquire(g_a, &type, &ndim, dims);
2785
valptr = (void *)(&ival);
2788
valptr = (void *)(&dval);
2792
cval.real = 0.0; cval.imag = 0.0;
2793
valptr = (void *)(&cval);
2798
cfval.real = 0.0; cfval.imag = 0.0;
2799
valptr = (void *)(&cfval);
2803
valptr = (void *)(&fval);
2806
valptr = (void *)(&lval);
2809
valptr = (void *)(&llval);
2811
default: pnga_error(" wrong data type ",type);
2813
pnga_fill_patch(g_a, lo, hi, valptr);
2816
if(local_sync_end)pnga_sync();