/*************************************************************************\
 Purpose:   File global.nalg.c contains a set of linear algebra routines
            that operate on n-dim global arrays in the SPMD mode.

 Author: Jarek Nieplocha
\************************************************************************/
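
/* A minimal usage sketch (assumes the global arrays g_a, g_b and g_c were
 * created elsewhere with matching types and shapes; the handles and scalars
 * shown here are illustrative only):
 *
 *     double alpha = 2.0, beta = -1.0, dot;
 *     pnga_zero(g_c);                             -- g_c = 0
 *     pnga_copy(g_a, g_b);                        -- g_b = g_a
 *     pnga_scale(g_a, &alpha);                    -- g_a *= alpha
 *     pnga_add(&alpha, g_a, &beta, g_b, g_c);     -- g_c = alpha*g_a + beta*g_b
 *     pnga_dot(C_DBL, g_a, g_b, &dot);            -- dot = sum_i a_i*b_i
 *     pnga_transpose(g_a, g_b);                   -- g_b = transpose(g_a), 2-d only
 *
 * All of these are collective: every process in the arrays' process group
 * must make the call.
 */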

extern ARMCI_Group* ga_get_armci_group_(int);

/* work arrays used in all routines */
static Integer dims[MAXDIM], ld[MAXDIM-1];
static Integer lo[MAXDIM], hi[MAXDIM];
static Integer one_arr[MAXDIM] = {1,1,1,1,1,1,1};

#define GET_ELEMS(ndim,lo,hi,ld,pelems){\
  int _i;\
  for(_i=0, *pelems = hi[ndim-1]-lo[ndim-1]+1; _i < ndim-1; _i++) {\
     if(ld[_i] != (hi[_i]-lo[_i]+1)) pnga_error("layout problem",_i);\
     *pelems *= hi[_i]-lo[_i]+1;\
  }\
}

#define GET_ELEMS_W_GHOSTS(ndim,lo,hi,ld,pelems){\
  int _i;\
  for(_i=0, *pelems = hi[ndim-1]-lo[ndim-1]+1; _i < ndim-1; _i++) {\
     if(ld[_i] < (hi[_i]-lo[_i]+1))\
       pnga_error("layout problem with ghosts",_i);\
     *pelems *= hi[_i]-lo[_i]+1;\
  }\
}
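
/* Both macros count the elements in the local patch [lo[0]:hi[0], ...,
 * lo[ndim-1]:hi[ndim-1]] and store the result in *pelems.  GET_ELEMS also
 * insists that the patch be contiguous (ld[i] == hi[i]-lo[i]+1 for all but
 * the last dimension), while GET_ELEMS_W_GHOSTS only requires the leading
 * dimensions to be at least that large, since ghost cells pad the local
 * buffer.  E.g. a 2-d patch with lo={1,1}, hi={10,20} yields *pelems == 200. */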

#if HAVE_SYS_WEAK_ALIAS_PRAGMA
#   pragma weak wnga_zero = pnga_zero
#endif

void pnga_zero(Integer g_a)
{
  Integer ndim, type, me, elems, p_handle;
  Integer num_blocks;
  void *ptr;
  /*register Integer i;*/
  int local_sync_begin, local_sync_end;

  local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
  _ga_sync_begin = 1; _ga_sync_end = 1; /*remove any previous masking*/
  p_handle = pnga_get_pgroup(g_a);
  if(local_sync_begin) pnga_pgroup_sync(p_handle);

  me = pnga_pgroup_nodeid(p_handle);

  pnga_check_handle(g_a, "ga_zero");
  GA_PUSH_NAME("ga_zero");
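
  /* Note: _ga_sync_begin/_ga_sync_end are one-shot masks; the routines in
   * this file capture them and immediately reset them, so a user-requested
   * sync mask applies only to the next collective call. */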

  num_blocks = pnga_total_blocks(g_a);

  pnga_inquire(g_a, &type, &ndim, dims);
  if (num_blocks < 0) {
    pnga_distribution(g_a, me, lo, hi);

    if ( lo[0] > 0 ){ /* base index is 1: we get 0 if no elements stored on p */

      if (pnga_has_ghosts(g_a)) {
        pnga_zero_patch(g_a, lo, hi);
        return;
      }

      pnga_access_ptr(g_a, lo, hi, &ptr, ld);
      GET_ELEMS(ndim,lo,hi,ld,&elems);

      /* for(i=0;i<elems;i++) ia[i] = 0; */
      /* da = (double*)ptr; */
      /* for(i=0;i<elems;i++) da[i] = 0; */
      /* fa = (float*)ptr; */
      /* for(i=0;i<elems;i++) fa[i] = 0; */
      /* la = (long*)ptr; */
      /* for(i=0;i<elems;i++) la[i] = 0; */
      /* case C_LONGLONG: */
      /* lla = (long long*)ptr; */
      /* for(i=0;i<elems;i++) lla[i] = 0; */
      /* default: pnga_error(" wrong data type ",type); */
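
      /* An all-zero bit pattern represents the value 0 for every supported
       * GA data type (integers, IEEE float/double, and their complex pairs),
       * so a single memset replaces the old per-type loops kept above, here
       * and in the block-cyclic branch below. */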
      memset(ptr, 0, GAsizeofM(type)*elems);

      /* release access to the data */
      pnga_release_update(g_a, lo, hi);
    }
  } else {
    pnga_access_block_segment_ptr(g_a, me, &ptr, &elems);

    /* long long *lla; */
    /* ia = (int*)ptr; */
    /* for(i=0;i<elems;i++) ia[i] = 0; */
    /* da = (double*)ptr; */
    /* for(i=0;i<elems;i++) da[i] = 0; */
    /* fa = (float*)ptr; */
    /* for(i=0;i<elems;i++) fa[i] = 0; */
    /* la = (long*)ptr; */
    /* for(i=0;i<elems;i++) la[i] = 0; */
    /* case C_LONGLONG: */
    /* lla = (long long*)ptr; */
    /* for(i=0;i<elems;i++) lla[i] = 0; */
    /* default: pnga_error(" wrong data type ",type); */

    memset(ptr, 0, GAsizeofM(type)*elems);

    /* release access to the data */
    pnga_release_update_block_segment(g_a, me);
  }
  GA_POP_NAME;
  if(local_sync_end) pnga_pgroup_sync(p_handle);
}

/*\ COPY ONE GLOBAL ARRAY INTO ANOTHER
\*/
static void snga_copy_old(Integer g_a, Integer g_b)
{
  Integer ndim, ndimb, type, typeb, me, elems=0, elemsb=0;
  Integer dimsb[MAXDIM];
  void *ptr_a, *ptr_b;

  me = pnga_nodeid();

  GA_PUSH_NAME("ga_copy");

  if(g_a == g_b) pnga_error("arrays have to be different ", 0L);

  pnga_inquire(g_a, &type, &ndim, dims);
  pnga_inquire(g_b, &typeb, &ndimb, dimsb);

  if(type != typeb) pnga_error("types not the same", g_b);

  if(!pnga_compare_distr(g_a,g_b))
    pnga_copy_patch("n", g_a, one_arr, dims, g_b, one_arr, dimsb);
  else {
    pnga_distribution(g_a, me, lo, hi);
    pnga_access_ptr(g_a, lo, hi, &ptr_a, ld);
    if (pnga_has_ghosts(g_a)) {
      GET_ELEMS_W_GHOSTS(ndim,lo,hi,ld,&elems);
    } else {
      GET_ELEMS(ndim,lo,hi,ld,&elems);
    }

    pnga_distribution(g_b, me, lo, hi);
    pnga_access_ptr(g_b, lo, hi, &ptr_b, ld);
    if (pnga_has_ghosts(g_b)) {
      GET_ELEMS_W_GHOSTS(ndim,lo,hi,ld,&elemsb);
    } else {
      GET_ELEMS(ndim,lo,hi,ld,&elemsb);
    }

    if(elems != elemsb) pnga_error("inconsistent number of elements", elems-elemsb);
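
    /* both patches are owned by the calling process, so this is a purely
     * local copy with no communication */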
    ARMCI_Copy(ptr_a, ptr_b, (int)elems*GAsizeofM(type));

    pnga_release(g_a, lo, hi);
    pnga_release(g_b, lo, hi);
  }
  GA_POP_NAME;
}

/*\ COPY ONE GLOBAL ARRAY INTO ANOTHER
\*/
#if HAVE_SYS_WEAK_ALIAS_PRAGMA
#   pragma weak wnga_copy = pnga_copy
#endif

void pnga_copy(Integer g_a, Integer g_b)
{
  Integer ndim, ndimb, type, typeb, me_a, me_b;
  Integer dimsb[MAXDIM], i;
  Integer a_grp, b_grp, anproc, bnproc;
  Integer num_blocks_a, num_blocks_b;
  Integer blocks[MAXDIM], block_dims[MAXDIM];
  void *ptr_a, *ptr_b;
  int local_sync_begin, local_sync_end, use_put;

  GA_PUSH_NAME("ga_copy");

  local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
  _ga_sync_begin = 1; _ga_sync_end = 1; /*remove any previous masking*/
  a_grp = pnga_get_pgroup(g_a);
  b_grp = pnga_get_pgroup(g_b);
  me_a = pnga_pgroup_nodeid(a_grp);
  me_b = pnga_pgroup_nodeid(b_grp);
  anproc = pnga_get_pgroup_size(a_grp);
  bnproc = pnga_get_pgroup_size(b_grp);
  num_blocks_a = pnga_total_blocks(g_a);
  num_blocks_b = pnga_total_blocks(g_b);
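
  /* The copy is driven by whichever array lives on the smaller process
   * group: if g_a's group is not larger, owners of g_a put their local data
   * into g_b; otherwise owners of g_b get the corresponding data from g_a. */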
  if (anproc <= bnproc) {
    use_put = 1;
  } else {
    use_put = 0;
  }

  /*if (a_grp != b_grp)
    pnga_error("Both arrays must be defined on same group",0L); */
  if(local_sync_begin) {
    if (anproc <= bnproc) {
      pnga_pgroup_sync(a_grp);
    } else if (a_grp == pnga_pgroup_get_world() &&
               b_grp == pnga_pgroup_get_world()) {
      pnga_pgroup_sync(b_grp);
    }
  }

  if(g_a == g_b) pnga_error("arrays have to be different ", 0L);

  pnga_inquire(g_a, &type, &ndim, dims);
  pnga_inquire(g_b, &typeb, &ndimb, dimsb);

  if(type != typeb) pnga_error("types not the same", g_b);
  if(ndim != ndimb) pnga_error("dimensions not the same", ndimb);

  for(i=0; i<ndim; i++)
    if(dims[i] != dimsb[i]) pnga_error("dimensions not the same", i);

  if ((pnga_is_mirrored(g_a) && pnga_is_mirrored(g_b)) ||
      (!pnga_is_mirrored(g_a) && !pnga_is_mirrored(g_b))) {
    /* Both global arrays are mirrored or both global arrays are not mirrored.
       Copy operation is straightforward */

    if (use_put) {
      if (num_blocks_a < 0) {
        pnga_distribution(g_a, me_a, lo, hi);
        pnga_access_ptr(g_a, lo, hi, &ptr_a, ld);
        pnga_put(g_b, lo, hi, ptr_a, ld);
      } else {
        if (!pnga_uses_proc_grid(g_a)) {
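          /* simple block-cyclic layout: block i belongs to process
           * i % anproc, so this process owns blocks me_a, me_a+anproc, ...
           * (the get-based branch below walks g_b the same way) */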
          for (i=me_a; i<num_blocks_a; i += anproc) {
            pnga_distribution(g_a, i, lo, hi);
            pnga_access_block_ptr(g_a, i, &ptr_a, ld);
            pnga_put(g_b, lo, hi, ptr_a, ld);
          }
        } else {
          Integer proc_index[MAXDIM], index[MAXDIM];
          Integer topology[MAXDIM], chk;
          pnga_get_proc_index(g_a, me_a, proc_index);
          pnga_get_proc_index(g_a, me_a, index);
          pnga_get_block_info(g_a, blocks, block_dims);
          pnga_get_proc_grid(g_a, topology);
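          /* ScaLAPACK-style layout: start at this process's position in the
           * processor grid and advance each index by the grid dimension
           * (topology[]) in odometer fashion, so only blocks owned by this
           * process are visited. */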
          while (index[ndim-1] < blocks[ndim-1]) {
            /* find bounding coordinates of block */
            chk = 1;
            for (i = 0; i < ndim; i++) {
              lo[i] = index[i]*block_dims[i] + 1;
              hi[i] = (index[i] + 1)*block_dims[i];
              if (hi[i] > dims[i]) hi[i] = dims[i];
              if (hi[i] < lo[i]) chk = 0;
            }
            if (chk) {
              pnga_access_block_grid_ptr(g_a, index, &ptr_a, ld);
              pnga_put(g_b, lo, hi, ptr_a, ld);
            }
            /* increment index to get next block on processor */
            index[0] += topology[0];
            for (i = 0; i < ndim; i++) {
              if (index[i] >= blocks[i] && i < ndim-1) {
                index[i] = proc_index[i];
                index[i+1] += topology[i+1];
              }
            }
          }
        }
      }
    } else {
      if (num_blocks_b < 0) {
        pnga_distribution(g_b, me_b, lo, hi);
        pnga_access_ptr(g_b, lo, hi, &ptr_b, ld);
        pnga_get(g_a, lo, hi, ptr_b, ld);
      } else {
        if (!pnga_uses_proc_grid(g_b)) {
          for (i=me_b; i<num_blocks_b; i += bnproc) {
            pnga_distribution(g_b, i, lo, hi);
            pnga_access_block_ptr(g_b, i, &ptr_b, ld);
            pnga_get(g_a, lo, hi, ptr_b, ld);
          }
        } else {
          Integer proc_index[MAXDIM], index[MAXDIM];
          Integer topology[MAXDIM], chk;
          pnga_get_proc_index(g_b, me_b, proc_index);
          pnga_get_proc_index(g_b, me_b, index);
          pnga_get_block_info(g_b, blocks, block_dims);
          pnga_get_proc_grid(g_b, topology);
          while (index[ndim-1] < blocks[ndim-1]) {
            /* find bounding coordinates of block */
            chk = 1;
            for (i = 0; i < ndim; i++) {
              lo[i] = index[i]*block_dims[i] + 1;
              hi[i] = (index[i] + 1)*block_dims[i];
              if (hi[i] > dims[i]) hi[i] = dims[i];
              if (hi[i] < lo[i]) chk = 0;
            }
            if (chk) {
              pnga_access_block_grid_ptr(g_b, index, &ptr_b, ld);
              pnga_get(g_a, lo, hi, ptr_b, ld);
            }
            /* increment index to get next block on processor */
            index[0] += topology[0];
            for (i = 0; i < ndim; i++) {
              if (index[i] >= blocks[i] && i < ndim-1) {
                index[i] = proc_index[i];
                index[i+1] += topology[i+1];
              }
            }
          }
        }
      }
    }
  } else {
    /* One global array is mirrored and the other is not */
    if (pnga_is_mirrored(g_a)) {
      /* Source array is mirrored and destination
         array is distributed. Assume source array is consistent */
      pnga_distribution(g_b, me_b, lo, hi);
      pnga_access_ptr(g_b, lo, hi, &ptr_b, ld);
      pnga_get(g_a, lo, hi, ptr_b, ld);
    } else {
      /* source array is distributed and destination
         array is mirrored */
      pnga_distribution(g_a, me_a, lo, hi);
      pnga_access_ptr(g_a, lo, hi, &ptr_a, ld);
      pnga_put(g_b, lo, hi, ptr_a, ld);
      pnga_merge_mirrored(g_b);
    }
  }

  if(local_sync_end) {
    if (anproc <= bnproc) {
      pnga_pgroup_sync(a_grp);
    } else if (a_grp == pnga_pgroup_get_world() &&
               b_grp == pnga_pgroup_get_world()) {
      pnga_pgroup_sync(b_grp);
    }
  }
  GA_POP_NAME;
}

/*\ internal version of dot product
\*/
#if HAVE_SYS_WEAK_ALIAS_PRAGMA
#   pragma weak wnga_dot = pnga_dot
#endif

void pnga_dot(int Type, Integer g_a, Integer g_b, void *value)
{
  Integer ndim=0, type=0, atype=0, me=0, elems=0, elemsb=0;
  register Integer i=0;
  int alen=1;
  DoubleComplex zsum = {0.,0.};
  SingleComplex csum = {0.,0.};
  void *ptr_a=NULL, *ptr_b=NULL;
  Integer a_grp=0, b_grp=0;
  Integer num_blocks_a=0, num_blocks_b=0;
  Integer andim, adims[MAXDIM];
  Integer bndim, bdims[MAXDIM];

  _ga_sync_begin = 1; _ga_sync_end = 1; /*remove any previous masking*/

  GA_PUSH_NAME("ga_dot");
  a_grp = pnga_get_pgroup(g_a);
  b_grp = pnga_get_pgroup(g_b);
  if (a_grp != b_grp)
    pnga_error("Both arrays must be defined on same group",0L);
  me = pnga_pgroup_nodeid(a_grp);

  /* Check to see if either GA is block cyclic distributed */
  num_blocks_a = pnga_total_blocks(g_a);
  num_blocks_b = pnga_total_blocks(g_b);
  if (num_blocks_a >= 0 || num_blocks_b >= 0) {
    pnga_inquire(g_a, &type, &andim, adims);
    pnga_inquire(g_b, &type, &bndim, bdims);
    pnga_dot_patch(g_a, "n", one_arr, adims, g_b, "n", one_arr, bdims,
                   value);
    GA_POP_NAME;
    return;
  }
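
  /* The direct algorithm below requires g_a and g_b to have identical,
   * regular distributions so that corresponding elements are owned by the
   * same process; otherwise (or when ghost cells are present) fall back to
   * the more general patch routine. */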
  if (pnga_compare_distr(g_a,g_b) == FALSE ||
      pnga_has_ghosts(g_a) || pnga_has_ghosts(g_b)) {
    /* distributions not identical */
    pnga_inquire(g_a, &type, &andim, adims);
    pnga_inquire(g_b, &type, &bndim, bdims);
    pnga_dot_patch(g_a, "n", one_arr, adims, g_b, "n", one_arr, bdims,
                   value);
    GA_POP_NAME;
    return;
  }

  pnga_pgroup_sync(a_grp);
  pnga_inquire(g_a, &type, &ndim, dims);
  if(type != Type) pnga_error("type not correct", g_a);
  pnga_distribution(g_a, me, lo, hi);
  pnga_access_ptr(g_a, lo, hi, &ptr_a, ld);
  if (pnga_has_ghosts(g_a)) {
    GET_ELEMS_W_GHOSTS(ndim,lo,hi,ld,&elems);
  } else {
    GET_ELEMS(ndim,lo,hi,ld,&elems);
  }

  pnga_inquire(g_b, &type, &ndim, dims);
  if(type != Type) pnga_error("type not correct", g_b);
  pnga_distribution(g_b, me, lo, hi);
  pnga_access_ptr(g_b, lo, hi, &ptr_b, ld);
  if (pnga_has_ghosts(g_b)) {
    GET_ELEMS_W_GHOSTS(ndim,lo,hi,ld,&elemsb);
  } else {
    GET_ELEMS(ndim,lo,hi,ld,&elemsb);
  }

  if(elems != elemsb) pnga_error("inconsistent number of elements", elems-elemsb);

  /* compute "local" contribution to the dot product */
  switch (type){
    int *ia, *ib, isum;
    long *la, *lb, lsum;
    long long *lla, *llb, llsum;
    double *da, *db;
    float *fa, *fb, fsum;
    case C_INT:
      ia = (int*)ptr_a; ib = (int*)ptr_b; isum = 0;
      for(i=0; i<elems; i++) isum += ia[i] * ib[i];
      *(int*)value = isum;
      break;
    case C_DCPL:
      for(i=0; i<elems; i++){
        DoubleComplex a = ((DoubleComplex*)ptr_a)[i];
        DoubleComplex b = ((DoubleComplex*)ptr_b)[i];
        zsum.real += a.real*b.real - b.imag * a.imag;
        zsum.imag += a.imag*b.real + b.imag * a.real;
      }
      *(DoubleComplex*)value = zsum;
      break;
    case C_SCPL:
      for(i=0; i<elems; i++){
        SingleComplex a = ((SingleComplex*)ptr_a)[i];
        SingleComplex b = ((SingleComplex*)ptr_b)[i];
        csum.real += a.real*b.real - b.imag * a.imag;
        csum.imag += a.imag*b.real + b.imag * a.real;
      }
      *(SingleComplex*)value = csum;
      break;
    case C_DBL:
      da = (double*)ptr_a; db = (double*)ptr_b;
      for(i=0; i<elems; i++) zsum.real += da[i] * db[i];
      *(double*)value = zsum.real;
      break;
    case C_FLOAT:
      fa = (float*)ptr_a; fb = (float*)ptr_b; fsum = 0;
      for(i=0; i<elems; i++) fsum += fa[i] * fb[i];
      *(float*)value = fsum;
      break;
    case C_LONG:
      la = (long*)ptr_a; lb = (long*)ptr_b; lsum = 0;
      for(i=0; i<elems; i++) lsum += la[i] * lb[i];
      *(long*)value = lsum;
      break;
    case C_LONGLONG:
      lla = (long long*)ptr_a;
      llb = (long long*)ptr_b;
      llsum = 0;
      for(i=0; i<elems; i++) llsum += lla[i] * llb[i];
      *(long long*)value = llsum;
      break;
    default: pnga_error(" wrong data type ",type);
  }

  /* release access to the data */
  pnga_release(g_a, lo, hi);
  if(g_a != g_b) pnga_release(g_b, lo, hi);

  /*convert from C data type to ARMCI type */
  switch(type) {
    case C_FLOAT:    atype=ARMCI_FLOAT; break;
    case C_DBL:      atype=ARMCI_DOUBLE; break;
    case C_INT:      atype=ARMCI_INT; break;
    case C_LONG:     atype=ARMCI_LONG; break;
    case C_LONGLONG: atype=ARMCI_LONG_LONG; break;
    case C_DCPL:     atype=ARMCI_DOUBLE; break;
    case C_SCPL:     atype=ARMCI_FLOAT; break;
    default: pnga_error("pnga_dot: type not supported",type);
  }
  if (type == C_DCPL || type == C_SCPL) alen = 2;
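
  /* Combine the partial sums: mirrored arrays hold a full copy per SMP node,
   * so their reduction stays within the node; otherwise the sum spans the
   * world group or, for arrays defined on a subgroup, just that ARMCI group.
   * alen is the number of scalar components reduced (2 for complex types). */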
  if (pnga_is_mirrored(g_a) && pnga_is_mirrored(g_b)) {
    armci_msg_gop_scope(SCOPE_NODE, value, alen, "+", atype);
  } else if (a_grp == pnga_pgroup_get_world()) {
    armci_msg_gop_scope(SCOPE_ALL, value, alen, "+", atype);
  } else {
    armci_msg_group_gop_scope(SCOPE_ALL, value, alen, "+", atype,
                              ga_get_armci_group_((int)a_grp));
  }
  GA_POP_NAME;
}
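
/*\ SCALE ALL ELEMENTS OF A GLOBAL ARRAY BY A CONSTANT: g_a := alpha * g_a
\*/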
#if HAVE_SYS_WEAK_ALIAS_PRAGMA
#   pragma weak wnga_scale = pnga_scale
#endif

void pnga_scale(Integer g_a, void* alpha)
{
  Integer ndim, type, me, elems, grp_id;
  Integer num_blocks;
  register Integer i;
  void *ptr;
  int local_sync_begin, local_sync_end;

  local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
  _ga_sync_begin = 1; _ga_sync_end = 1; /*remove any previous masking*/
  grp_id = pnga_get_pgroup(g_a);
  if(local_sync_begin) pnga_pgroup_sync(grp_id);

  me = pnga_pgroup_nodeid(grp_id);

  pnga_check_handle(g_a, "ga_scale");
  GA_PUSH_NAME("ga_scale");
  num_blocks = pnga_total_blocks(g_a);

  pnga_inquire(g_a, &type, &ndim, dims);
  if (num_blocks < 0) {
    pnga_distribution(g_a, me, lo, hi);
    if (pnga_has_ghosts(g_a)) {
      pnga_scale_patch(g_a, lo, hi, alpha);
      return;
    }
    if ( lo[0] > 0 ){ /* base index is 1: we get 0 if no elements stored on p */
      pnga_access_ptr(g_a, lo, hi, &ptr, ld);
      GET_ELEMS(ndim,lo,hi,ld,&elems);
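
      /* scaling is purely local: each process multiplies only the patch
       * (or block segment) it owns, so no communication is required */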
      switch (type){
        int *ia; long *la; long long *lla; double *da; float *fa;
        DoubleComplex *ca, scale; SingleComplex *cfa, cfscale;
        case C_INT: ia = (int*)ptr;
          for(i=0;i<elems;i++) ia[i] *= *(int*)alpha;
          break;
        case C_LONG: la = (long*)ptr;
          for(i=0;i<elems;i++) la[i] *= *(long*)alpha;
          break;
        case C_LONGLONG: lla = (long long*)ptr;
          for(i=0;i<elems;i++) lla[i] *= *(long long*)alpha;
          break;
        case C_DCPL: ca = (DoubleComplex*)ptr;
          scale = *(DoubleComplex*)alpha;
          for(i=0;i<elems;i++){
            DoubleComplex val = ca[i];
            ca[i].real = scale.real*val.real - val.imag * scale.imag;
            ca[i].imag = scale.imag*val.real + val.imag * scale.real;
          }
          break;
        case C_SCPL: cfa = (SingleComplex*)ptr;
          cfscale = *(SingleComplex*)alpha;
          for(i=0;i<elems;i++){
            SingleComplex val = cfa[i];
            cfa[i].real = cfscale.real*val.real - val.imag * cfscale.imag;
            cfa[i].imag = cfscale.imag*val.real + val.imag * cfscale.real;
          }
          break;
        case C_DBL: da = (double*)ptr;
          for(i=0;i<elems;i++) da[i] *= *(double*)alpha;
          break;
        case C_FLOAT: fa = (float*)ptr;
          for(i=0;i<elems;i++) fa[i] *= *(float*)alpha;
          break;
        default: pnga_error(" wrong data type ",type);
      }

      /* release access to the data */
      pnga_release_update(g_a, lo, hi);
    }
  } else {
    pnga_access_block_segment_ptr(g_a, me, &ptr, &elems);
    switch (type){
      int *ia; long *la; long long *lla; double *da; float *fa;
      DoubleComplex *ca, scale; SingleComplex *cfa, cfscale;
      case C_INT: ia = (int*)ptr;
        for(i=0;i<elems;i++) ia[i] *= *(int*)alpha;
        break;
      case C_LONG: la = (long*)ptr;
        for(i=0;i<elems;i++) la[i] *= *(long*)alpha;
        break;
      case C_LONGLONG: lla = (long long*)ptr;
        for(i=0;i<elems;i++) lla[i] *= *(long long*)alpha;
        break;
      case C_DCPL: ca = (DoubleComplex*)ptr;
        scale = *(DoubleComplex*)alpha;
        for(i=0;i<elems;i++){
          DoubleComplex val = ca[i];
          ca[i].real = scale.real*val.real - val.imag * scale.imag;
          ca[i].imag = scale.imag*val.real + val.imag * scale.real;
        }
        break;
      case C_SCPL: cfa = (SingleComplex*)ptr;
        cfscale = *(SingleComplex*)alpha;
        for(i=0;i<elems;i++){
          SingleComplex val = cfa[i];
          cfa[i].real = cfscale.real*val.real - val.imag * cfscale.imag;
          cfa[i].imag = cfscale.imag*val.real + val.imag * cfscale.real;
        }
        break;
      case C_DBL: da = (double*)ptr;
        for(i=0;i<elems;i++) da[i] *= *(double*)alpha;
        break;
      case C_FLOAT: fa = (float*)ptr;
        for(i=0;i<elems;i++) fa[i] *= *(float*)alpha;
        break;
      default: pnga_error(" wrong data type ",type);
    }
    /* release access to the data */
    pnga_release_update_block_segment(g_a, me);
  }
  GA_POP_NAME;
  if(local_sync_end) pnga_pgroup_sync(grp_id);
}
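
/*\ SCALED ADDITION OF TWO GLOBAL ARRAYS: g_c := alpha * g_a + beta * g_b
\*/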
#if HAVE_SYS_WEAK_ALIAS_PRAGMA
#   pragma weak wnga_add = pnga_add
#endif

void pnga_add(void *alpha, Integer g_a, void* beta, Integer g_b, Integer g_c)
{
  Integer ndim, type, typeC, me, elems=0, elemsb=0, elemsa=0;
  register Integer i;
  void *ptr_a, *ptr_b, *ptr_c;
  Integer a_grp, b_grp, c_grp;
  int local_sync_begin, local_sync_end;
  Integer andim, adims[MAXDIM];
  Integer bndim, bdims[MAXDIM];
  Integer cndim, cdims[MAXDIM];

  local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
  _ga_sync_begin = 1; _ga_sync_end = 1; /*remove any previous masking*/

  GA_PUSH_NAME("ga_add");
  a_grp = pnga_get_pgroup(g_a);
  b_grp = pnga_get_pgroup(g_b);
  c_grp = pnga_get_pgroup(g_c);
  if (a_grp != b_grp || b_grp != c_grp)
    pnga_error("All three arrays must be on same group for ga_add",0L);

  if(local_sync_begin) pnga_pgroup_sync(a_grp);

  me = pnga_pgroup_nodeid(a_grp);
  if((pnga_compare_distr(g_a,g_b) == FALSE) ||
     (pnga_compare_distr(g_a,g_c) == FALSE) ||
     pnga_has_ghosts(g_a) || pnga_has_ghosts(g_b) || pnga_has_ghosts(g_c) ||
     pnga_total_blocks(g_a) > 0 || pnga_total_blocks(g_b) > 0 ||
     pnga_total_blocks(g_c) > 0) {
    /* distributions not identical */
    pnga_inquire(g_a, &type, &andim, adims);
    pnga_inquire(g_b, &type, &bndim, bdims);
    pnga_inquire(g_c, &type, &cndim, cdims);

    pnga_add_patch(alpha, g_a, one_arr, adims, beta, g_b, one_arr, bdims,
                   g_c, one_arr, cdims);
    GA_POP_NAME;
    return;
  }

  pnga_pgroup_sync(a_grp);
  pnga_inquire(g_c, &typeC, &ndim, dims);
  pnga_distribution(g_c, me, lo, hi);
  pnga_access_ptr(g_c, lo, hi, &ptr_c, ld);
  GET_ELEMS(ndim,lo,hi,ld,&elems);

  pnga_inquire(g_a, &type, &ndim, dims);
  if(type != typeC) pnga_error("types not consistent", g_a);
  pnga_distribution(g_a, me, lo, hi);
  pnga_access_ptr(g_a, lo, hi, &ptr_a, ld);
  GET_ELEMS(ndim,lo,hi,ld,&elemsa);

  pnga_inquire(g_b, &type, &ndim, dims);
  if(type != typeC) pnga_error("types not consistent", g_b);
  pnga_distribution(g_b, me, lo, hi);
  pnga_access_ptr(g_b, lo, hi, &ptr_b, ld);
  GET_ELEMS(ndim,lo,hi,ld,&elemsb);

  if(elems != elemsb) pnga_error("inconsistent number of elements a", elems-elemsb);
  if(elems != elemsa) pnga_error("inconsistent number of elements b", elems-elemsa);
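
  /* Each process forms c = alpha*a + beta*b element-by-element on the patch
   * it owns.  For the complex cases the products are expanded explicitly:
   * (x*a).real = x.real*a.real - x.imag*a.imag and
   * (x*a).imag = x.real*a.imag + x.imag*a.real, and likewise for beta*b. */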
  /* operation on the "local" piece of data */
  switch(typeC){
    int *ia, *ib, *ic;
    double *da, *db, *dc;
    float *fa, *fb, *fc;
    long *la, *lb, *lc;
    long long *lla, *llb, *llc;
    case C_DBL:
      da = (double*)ptr_a; db = (double*)ptr_b; dc = (double*)ptr_c;
      for(i=0; i<elems; i++)
        dc[i] = *(double*)alpha * da[i] +
                *(double*)beta * db[i];
      break;
    case C_DCPL:
      for(i=0; i<elems; i++){
        DoubleComplex a = ((DoubleComplex*)ptr_a)[i];
        DoubleComplex b = ((DoubleComplex*)ptr_b)[i];
        DoubleComplex *ac = (DoubleComplex*)ptr_c;
        DoubleComplex x = *(DoubleComplex*)alpha;
        DoubleComplex y = *(DoubleComplex*)beta;
        ac[i].real = x.real*a.real -
          x.imag*a.imag + y.real*b.real - y.imag*b.imag;
        ac[i].imag = x.real*a.imag +
          x.imag*a.real + y.real*b.imag + y.imag*b.real;
      }
      break;
    case C_SCPL:
      for(i=0; i<elems; i++){
        SingleComplex a = ((SingleComplex*)ptr_a)[i];
        SingleComplex b = ((SingleComplex*)ptr_b)[i];
        SingleComplex *ac = (SingleComplex*)ptr_c;
        SingleComplex x = *(SingleComplex*)alpha;
        SingleComplex y = *(SingleComplex*)beta;
        ac[i].real = x.real*a.real -
          x.imag*a.imag + y.real*b.real - y.imag*b.imag;
        ac[i].imag = x.real*a.imag +
          x.imag*a.real + y.real*b.imag + y.imag*b.real;
      }
      break;
    case C_FLOAT:
      fa = (float*)ptr_a; fb = (float*)ptr_b; fc = (float*)ptr_c;
      for(i=0; i<elems; i++)
        fc[i] = *(float*)alpha * fa[i] + *(float*)beta * fb[i];
      break;
    case C_INT:
      ia = (int*)ptr_a; ib = (int*)ptr_b; ic = (int*)ptr_c;
      for(i=0; i<elems; i++)
        ic[i] = *(int*)alpha * ia[i] + *(int*)beta * ib[i];
      break;
    case C_LONG:
      la = (long*)ptr_a; lb = (long*)ptr_b; lc = (long*)ptr_c;
      for(i=0; i<elems; i++)
        lc[i] = *(long*)alpha * la[i] + *(long*)beta * lb[i];
      break;
    case C_LONGLONG:
      lla = (long long*)ptr_a;
      llb = (long long*)ptr_b;
      llc = (long long*)ptr_c;
      for(i=0; i<elems; i++)
        llc[i] = ( *(long long*)alpha * lla[i] +
                   *(long long*)beta * llb[i] );
      break;
    default: pnga_error(" wrong data type ",typeC);
  }

  /* release access to the data */
  pnga_release_update(g_c, lo, hi);
  if(g_c != g_a) pnga_release(g_a, lo, hi);
  if(g_c != g_b) pnga_release(g_b, lo, hi);

  GA_POP_NAME;
  if(local_sync_end) pnga_pgroup_sync(a_grp);
}
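
/* Gather n contiguous elements starting at ptra and scatter them into ptrb
 * with a byte stride of `stride' between consecutive elements; this is how a
 * column of a local patch becomes a row of the transposed scratch buffer. */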
static void snga_local_transpose(Integer type, char *ptra, Integer n, Integer stride, char *ptrb)
{
  register Integer i;

  switch(type){
    case C_INT:
      for(i = 0; i < n; i++, ptrb += stride) *(int*)ptrb = ((int*)ptra)[i];
      break;
    case C_DCPL:
      for(i = 0; i < n; i++, ptrb += stride) *(DoubleComplex*)ptrb = ((DoubleComplex*)ptra)[i];
      break;
    case C_SCPL:
      for(i = 0; i < n; i++, ptrb += stride) *(SingleComplex*)ptrb = ((SingleComplex*)ptra)[i];
      break;
    case C_DBL:
      for(i = 0; i < n; i++, ptrb += stride) *(double*)ptrb = ((double*)ptra)[i];
      break;
    case C_FLOAT:
      for(i = 0; i < n; i++, ptrb += stride) *(float*)ptrb = ((float*)ptra)[i];
      break;
    case C_LONG:
      for(i = 0; i < n; i++, ptrb += stride) *(long*)ptrb = ((long*)ptra)[i];
      break;
    case C_LONGLONG:
      for(i = 0; i < n; i++, ptrb += stride) *(long long*)ptrb = ((long long*)ptra)[i];
      break;
    default: pnga_error("bad type:",type);
  }
}
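
/*\ TRANSPOSE A 2-D GLOBAL ARRAY: g_b := transpose(g_a)
\*/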
#if HAVE_SYS_WEAK_ALIAS_PRAGMA
#   pragma weak wnga_transpose = pnga_transpose
#endif

void pnga_transpose(Integer g_a, Integer g_b)
{
  Integer me = pnga_nodeid();
  Integer nproc = pnga_nnodes();
  Integer atype, btype, andim, adims[MAXDIM], bndim, bdims[MAXDIM];
  Integer num_blocks_a, idx;
  int local_sync_begin, local_sync_end;
  char *ptr_tmp, *ptr_a;

  GA_PUSH_NAME("ga_transpose");

  local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
  _ga_sync_begin = 1; _ga_sync_end = 1; /*remove any previous masking*/
  if(local_sync_begin) pnga_sync();

  if(g_a == g_b) pnga_error("arrays have to be different ", 0L);

  pnga_inquire(g_a, &atype, &andim, adims);
  pnga_inquire(g_b, &btype, &bndim, bdims);

  if(bndim != 2 || andim != 2) pnga_error("dimension must be 2", 0);
  if(atype != btype) pnga_error("array type mismatch ", 0L);

  num_blocks_a = pnga_total_blocks(g_a);
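
  /* Strategy: each process transposes the patch (or blocks) of g_a it owns
   * into a scratch buffer, swapping the row/column index ranges, and then
   * puts the buffer into the corresponding [lob:hib] patch of g_b. */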
  if (num_blocks_a < 0) {
    pnga_distribution(g_a, me, lo, hi);
    if (lo[0] > 0) {
      Integer nelem, lob[2], hib[2], nrow, ncol;
      int i, size = GAsizeofM(atype);

      nrow = hi[0] - lo[0] + 1;
      ncol = hi[1] - lo[1] + 1;
      nelem = nrow*ncol;

      lob[0] = lo[1]; lob[1] = lo[0];
      hib[0] = hi[1]; hib[1] = hi[0];

      /* allocate memory for transposing elements locally */
      ptr_tmp = (char *) ga_malloc(nelem, atype, "transpose_tmp");

      /* get access to local data */
      pnga_access_ptr(g_a, lo, hi, &ptr_a, ld);

      for(i = 0; i < ncol; i++){
        char *ptr = ptr_tmp + i*size;
        snga_local_transpose(atype, ptr_a, nrow, ncol*size, ptr);
        ptr_a += ld[0]*size;
      }

      pnga_release(g_a, lo, hi);

      pnga_put(g_b, lob, hib, ptr_tmp, &ncol);
      ga_free(ptr_tmp);
    }
  } else {
    Integer blocks[MAXDIM], block_dims[MAXDIM];
    Integer nelem, lob[2], hib[2], nrow, ncol;
    int i, size = GAsizeofM(atype);

    /* allocate memory for transposing elements locally */
    pnga_get_block_info(g_a, blocks, block_dims);
    /* Simple block-cyclic data distribution */
    nelem = block_dims[0]*block_dims[1];
    ptr_tmp = (char *) ga_malloc(nelem, atype, "transpose_tmp");

    if (!pnga_uses_proc_grid(g_a)) {
      for (idx = me; idx < num_blocks_a; idx += nproc) {
        pnga_distribution(g_a, idx, lo, hi);
        pnga_access_block_ptr(g_a, idx, &ptr_a, ld);

        nrow = hi[0] - lo[0] + 1;
        ncol = hi[1] - lo[1] + 1;

        lob[0] = lo[1]; lob[1] = lo[0];
        hib[0] = hi[1]; hib[1] = hi[0];
        for(i = 0; i < ncol; i++){
          char *ptr = ptr_tmp + i*size;
          snga_local_transpose(atype, ptr_a, nrow, ncol*size, ptr);
          ptr_a += ld[0]*size;
        }
        pnga_put(g_b, lob, hib, ptr_tmp, &ncol);
        pnga_release_update_block(g_a, idx);
      }
    } else {
      /* Uses scalapack block-cyclic data distribution */
      Integer proc_index[MAXDIM], index[MAXDIM];
      Integer topology[MAXDIM], chk;

      pnga_get_proc_index(g_a, me, proc_index);
      pnga_get_proc_index(g_a, me, index);
      pnga_get_block_info(g_a, blocks, block_dims);
      pnga_get_proc_grid(g_a, topology);

      /* Verify that processor has data */
      chk = 1;
      for (i=0; i<andim; i++) {
        if (index[i] < 0 || index[i] >= blocks[i]) {
          chk = 0;
        }
      }

      if (chk) {
        pnga_access_block_grid_ptr(g_a, index, &ptr_a, ld);
        while (index[andim-1] < blocks[andim-1]) {
          /* find bounding coordinates of block */
          chk = 1;
          for (i = 0; i < andim; i++) {
            lo[i] = index[i]*block_dims[i] + 1;
            hi[i] = (index[i] + 1)*block_dims[i];
            if (hi[i] > adims[i]) hi[i] = adims[i];
            if (hi[i] < lo[i]) chk = 0;
          }
          if (chk) {
            pnga_access_block_grid_ptr(g_a, index, &ptr_a, ld);
            nrow = hi[0] - lo[0] + 1;
            ncol = hi[1] - lo[1] + 1;
            lob[0] = lo[1]; lob[1] = lo[0];
            hib[0] = hi[1]; hib[1] = hi[0];
            for(i = 0; i < ncol; i++){
              char *ptr = ptr_tmp + i*size;
              snga_local_transpose(atype, ptr_a, nrow, block_dims[0]*size, ptr);
              ptr_a += ld[0]*size;
            }
            pnga_put(g_b, lob, hib, ptr_tmp, &block_dims[0]);
            pnga_release_update_block_grid(g_a, index);
          }
          /* increment index to get next block on processor */
          index[0] += topology[0];
          for (i = 0; i < andim; i++) {
            if (index[i] >= blocks[i] && i < andim-1) {
              index[i] = proc_index[i];
              index[i+1] += topology[i+1];
            }
          }
        }
      }
    }
    ga_free(ptr_tmp);
  }

  GA_POP_NAME;
  if(local_sync_end) pnga_sync();
}