1
/**************************************************************
4
Elementwise operations on patches and whole arrays
6
Author: Limin Zhang, Ph.D.
12
Mentor: Jarek Nieplocha.
13
Environmental Molecular Science Laboratory
14
Pacific Northwest National Laboratory
20
to design and implement some interfaces between TAO and
23
Modified 3/2004 By Doug Baxter to increase robustness.
25
**************************************************************/
31
#ifndef GA_HALF_MAX_INT
32
#define GA_HALF_MAX_INT ((((int)1) << ((int)(8*sizeof(int))-2)) - 1)
36
#define GA_INFINITY_I (GA_HALF_MAX_INT + GA_HALF_MAX_INT + 1)
39
Seemed too small arbitrarily.
40
#define GA_INFINITY_I 100000
44
#ifndef GA_NEGATIVE_INFINITY_I
45
#define GA_NEGATIVE_INFINITY_I (- GA_INFINITY_I)
50
Seemed too small arbitrarily.
51
#define GA_NEGATIVE_INFINITY_I -100000
55
#ifndef GA_HALF_MAX_LONG
56
#define GA_HALF_MAX_LONG ((((long)1) << ((int)(8*sizeof(long))-2)) - 1)
60
#define GA_INFINITY_L (GA_HALF_MAX_LONG + GA_HALF_MAX_LONG + 1)
62
#define GA_INFINITY_L 100000
66
#ifndef GA_NEGATIVE_INFINITY_L
67
#define GA_NEGATIVE_INFINITY_L (- GA_INFINITY_L)
71
#define GA_NEGATIVE_INFINITY_L -100000
75
Modified by Doug Baxter 01/24/04 to distinguish between
76
Double inifinity and float infinity.
78
#define GA_INFINITY 1.0e20
81
#ifndef GA_NEGATIVE_INFINITY
82
#define GA_NEGATIVE_INFINITY -1.0e20
86
#define GA_INFINITY_F 1.0e37
90
#define GA_INFINITY_F 1.0e20
92
#ifndef GA_NEGATIVE_INFINITY_F
93
#define GA_NEGATIVE_INFINITY_F -1.0e37
97
#define GA_NEGATIVE_INFINITY_F -1.0e20
100
#define GA_INFINITY_D 1.0e307
103
Original value below.
104
#define GA_INFINITY_D 1.0e20
106
#ifndef GA_NEGATIVE_INFINITY_D
107
#define GA_NEGATIVE_INFINITY_D -1.0e307
110
Original value below.
111
#define GA_NEGATIVE_INFINITY_D -1.0e20
114
End of 01/24/04 Modification.
115
Perhaps it would be more appropriate to have GA_INFINITY_D BE 1.0e307
116
These ranges make assumptions about the data.
119
#define OP_ADD_CONST 1
121
#define OP_ELEM_MULT 3
122
#define OP_ELEM_DIV 4
123
#define OP_ELEM_MAX 5
124
#define OP_ELEM_MIN 6
126
#define OP_STEPBOUNDINFO 8
127
#define OP_ELEM_SDIV 9
128
#define OP_ELEM_SDIV2 10
129
#define OP_STEP_MASK 11
130
#define OP_FILL 100 /*The OP_FILL is not currently in use */
132
int debug_gai_oper_elem = 1;
134
static void do_stepboundinfo(void *ptr, int nelem, int type)
135
/*look at elements one by one and replace the positive infinity with negative infinity */
142
DoubleComplex *ca,val;
146
/*Only double data type will be handled for TAO/GA project*/
149
/* Modified 01/24/04 to add _D ending to constants. */
150
if(da[i]>=GA_INFINITY_D) da[i]=-GA_INFINITY_D;
153
/* This block added 01/24/04 */
155
for (i=0;i<nelem;i++)
156
if (ia[i]>= GA_INFINITY_I) ia[i] = GA_NEGATIVE_INFINITY_I;
160
/* This operation is not well defined for complex
161
numbers . This statement added when drop through
162
behavior changed by adding code for C_FLOAT and C_LONG
163
cases below. 01/24/04
165
ga_error("do_stepboundinfo:wrong data type",type);
167
/* This case added 01/24/04 */
169
for (i=0;i<nelem;i++)
170
if (fa[i]>= GA_INFINITY_F) fa[i] = GA_NEGATIVE_INFINITY_F;
173
/* This case added 01/24/04 */
175
for (i=0;i<nelem;i++)
176
if (la[i]>= GA_INFINITY_L) la[i] = GA_NEGATIVE_INFINITY_L;
178
default: ga_error("do_stepboundinfo:wrong data type",type);
181
static void do_stepmax(void *ptr, int nelem, int type)
183
Look at elements one by one and replace the positive with negative infinity.
195
/*Only double data type will be handled for TAO/GA project*/
198
/* Modified 01/24/04 to use _D ending. */
200
if(da[i]>d_0) da[i]=-GA_INFINITY_D;
203
/* Thix case added 01/24/04*/
207
if(ia[i]>i_0)ia[i]=-GA_INFINITY_I;
211
/* This operation is not well defined for complex
212
numbers . This statement added when drop through
213
behavior changed by adding code for C_FLOAT and C_LONG
214
cases below. 01/24/04
216
ga_error("do_stepmax:wrong data type",type);
218
/* Thix case added 01/24/04*/
222
if(fa[i]>f_0) fa[i]=-GA_INFINITY_F;
225
/* Thix case added 01/24/04*/
229
if(la[i]>l_0) la[i]=-GA_INFINITY_L;
231
default: ga_error("do_stepmax:wrong data type",type);
238
static void do_abs(void *ptr, int nelem, int type)
241
double magi, magr, x1, x2;
242
float smagi, smagr, sx1, sx2;
247
DoubleComplex *ca,val;
248
SingleComplex *cfa,cval;
254
ia[i]= GA_ABS(ia[i]);
257
ca = (DoubleComplex *) ptr;
258
for(i=0;i<nelem;i++){
260
/* DJB: This algorithm can lead to overflows when
261
and underflows when not necessary.
262
ca[i].real = sqrt(val.real * val.real + val.imag *val.imag);
264
Better (but slower) is:
266
magi = GA_ABS(val.imag);
267
magr = GA_ABS(val.real);
268
if (GA_ABS(val.real) >= GA_ABS(val.imag)) {
269
if (val.real == (double)0.0) {
270
ca[i].real = (double)0.0;
272
x2 = val.imag/val.real;
273
ca[i].real = GA_ABS(val.real)*sqrt(((double)1.0)+(x2*x2));
276
x2 = val.real/val.imag;
277
ca[i].real = GA_ABS(val.imag)*sqrt(((double)1.0)+(x2*x2));
279
ca[i].imag=(double)0.0;
283
cfa = (SingleComplex *) ptr;
284
for(i=0;i<nelem;i++){
286
/* DJB: This algorithm can lead to overflows when
287
and underflows when not necessary.
288
cfa[i].real = sqrt(cval.real * cval.real + cval.imag *cval.imag);
290
Better (but slower) is:
292
smagi = GA_ABS(cval.imag);
293
smagr = GA_ABS(cval.real);
294
if (GA_ABS(val.real) >= GA_ABS(val.imag)) {
295
if (cval.real == (float)0.0) {
296
cfa[i].real = (float)0.0;
298
sx2 = cval.imag/cval.real;
299
cfa[i].real = GA_ABS(cval.real)*sqrt(((float)1.0)+(sx2*sx2));
302
sx2 = cval.real/cval.imag;
303
cfa[i].real = GA_ABS(cval.imag)*sqrt(((float)1.0)+(sx2*sx2));
305
cfa[i].imag=(float)0.0;
311
da[i]= GA_ABS(da[i]);
316
fa[i]= GA_ABS(fa[i]);
321
la[i]= GA_ABS(la[i]);
324
default: ga_error("wrong data type",type);
328
static void do_recip(void *ptr, int nelem, int type)
331
DJB general comment, as I found this routine, it
332
would return some form of infinity when having
333
a zero denominator. I find that technically incorrect
334
and the commented out error message to be preferrable.
335
If returning infinity is the behavior desired I would recommend
336
having a separate specialized reciprocal, where it's
337
in the GA documentation, what should happen on a divide
338
by zero. In IEEE standard, the default is to throw
339
a floating point exception (FPE) when division by zero
340
occurs. The ga_error message seems to be the
341
moral equivalent of that.
342
I have commented out the Infinity returns and returned
343
to the ga_error calls. Also on the commented out
344
infinity value returns I have been more specific in
345
the INFINITY type returned (the trailing _*).
347
double magi, magr, x1, x2, c, d;
348
float smagi, smagr, sx1, sx2, sc, sd;
354
DoubleComplex *ca,val;
355
SingleComplex *cfa,cval;
361
if(ia[i]!=0) ia[i]= 1/ia[i];
363
ga_error("zero value at index",i);
365
ia[i] = GA_INFINITY_I;
369
ca = (DoubleComplex *) ptr;
370
for(i=0;i<nelem;i++){
372
Again, as for absolute value the following
373
algorithm can lead to unecessary overflow/underflow
375
temp = ca[i].real*ca[i].real + ca[i].imag*ca[i].imag;
377
ca[i].real =ca[i].real/temp;
378
ca[i].imag =-ca[i].imag/temp;
381
ga_error("zero value at index",i);
383
ca[i].real = GA_INFINITY_D;
384
ca[i].imag = GA_INFINITY_D;
387
Better (but slower) is:
392
printf(" do_recip i = %d, x1 = %le, x2 = %le\n",
398
printf(" do_recip i = %d, magr = %le, magi = %le\n",
402
if (magr != ((double)0.0)) {
404
d = ((double)1.0)/((((double)1.0) + (c*c))*x1);
408
ga_error("zero value at index",i);
412
d = ((double)1.0)/((((double)1.0) + (c*c))*x2);
417
printf(" do_recip ca[%d].real = %le, ca[%d].imag = %le\n",
418
i,ca[i].real,i,ca[i].imag);
424
cfa = (SingleComplex *) ptr;
425
for(i=0;i<nelem;i++){
427
Again, as for absolute value the following
428
algorithm can lead to unecessary overflow/underflow
430
temp = cfa[i].real*cfa[i].real + cfa[i].imag*cfa[i].imag;
432
cfa[i].real =cfa[i].real/temp;
433
cfa[i].imag =-cfa[i].imag/temp;
436
ga_error("zero value at index",i);
438
cfa[i].real = GA_INFINITY_D;
439
cfa[i].imag = GA_INFINITY_D;
442
Better (but slower) is:
447
printf(" do_recip i = %d, x1 = %le, x2 = %le\n",
453
printf(" do_recip i = %d, magr = %le, magi = %le\n",
456
if (smagr >= smagi) {
457
if (smagr != ((float)0.0)) {
459
sd = ((float)1.0)/((((float)1.0) + (sc*sc))*sx1);
461
cfa[i].imag = -sc*sd;
463
ga_error("zero value at index",i);
467
sd = ((float)1.0)/((((float)1.0) + (sc*sc))*sx2);
472
printf(" do_recip ca[%d].real = %le, ca[%d].imag = %le\n",
473
i,ca[i].real,i,ca[i].imag);
481
if(da[i]!=(double)0.0) da[i]= ((double)1.0)/da[i];
483
ga_error("zero value at index",i);
485
da[i] = GA_INFINITY_D;
491
if(fa[i]!=(float)0.0) fa[i]= ((float)1.0)/fa[i];
493
ga_error("zero value at index",i);
495
fa[i] = GA_INFINITY_F
501
if(la[i]!=(long)0) la[i]= ((long)1)/la[i];
503
ga_error("zero value at index",i);
505
la[i] = GA_INFINITY_I;
510
default: ga_error("wrong data type",type);
514
static void do_add_const(void *ptr, int nelem, int type, void *alpha)
521
DoubleComplex *ca,val;
522
SingleComplex *cfa,cval;
528
ia[i] += *(int *)alpha;
531
ca = (DoubleComplex *) ptr;
532
for(i=0;i<nelem;i++){
533
val = *(DoubleComplex*)alpha;
534
ca[i].real += val.real;
535
ca[i].imag += val.imag;
539
cfa = (SingleComplex *) ptr;
540
for(i=0;i<nelem;i++){
541
cval = *(SingleComplex*)alpha;
542
cfa[i].real += cval.real;
543
cfa[i].imag += cval.imag;
549
da[i] += *(double*)alpha;
554
fa[i] += *(float*)alpha;
559
la[i] += *(long *)alpha;
562
default: ga_error("wrong data type",type);
567
void do_fill(void *ptr, int nelem, int type, void *alpha)
574
DoubleComplex *ca,val;
580
ia[i] = *(int *)alpha;
583
ca = (DoubleComplex *) ptr;
584
for(i=0;i<nelem;i++){
585
val = *(DoubleComplex*)alpha;
586
ca[i].real = val.real;
587
ca[i].imag = val.imag;
591
ca = (SingleComplex *) ptr;
592
for(i=0;i<nelem;i++){
593
val = *(SingleComplex*)alpha;
594
ca[i].real = val.real;
595
ca[i].imag = val.imag;
601
da[i] = *(double*)alpha;
606
fa[i] = *(float*)alpha;
611
la[i] = *(long *)alpha;
614
default: ga_error("wrong data type",type);
622
int *g_a -- the global array handle
623
int *lo, *hi--the integer arrays that define the patch of the global array
624
void *scalar -- the pointer that points to the data to pass. When it is NULL, no scalar will be passed.
625
int op -- the operations to handle. For example op can be
627
OP_ABS for pointwise taking absolute function
628
OP_ADD_CONSTANT 2 for pointwise adding the same constant
629
OP_RECIP for pointwise taking reciprocal
630
OP_FILL for pointwise filling value
638
/*\ Internal utility function to do operation.
640
void ngai_do_oper_elem(Integer type, Integer ndim, Integer *loA, Integer *hiA,
641
Integer *ld, void *data_ptr, void *scalar, Integer op)
644
Integer bvalue[MAXDIM], bunit[MAXDIM], baseld[MAXDIM];
647
/* number of n-element of the first dimension */
648
n1dim = 1; for(i=1; i<ndim; i++) n1dim *= (hiA[i] - loA[i] + 1);
650
/* calculate the destination indices */
651
bvalue[0] = 0; bvalue[1] = 0; bunit[0] = 1; bunit[1] = 1;
653
* baseld[1] = ld[0] * ld[1]
654
* baseld[2] = ld[0] * ld[1] * ld[2] ....
656
baseld[0] = ld[0]; baseld[1] = baseld[0] *ld[1];
657
for(i=2; i<ndim; i++) {
659
bunit[i] = bunit[i-1] * (hiA[i-1] - loA[i-1] + 1);
660
baseld[i] = baseld[i-1] * ld[i];
663
for(i=0; i<n1dim; i++) {
665
for(j=1; j<ndim; j++) {
666
idx += bvalue[j] * baseld[j-1];
667
if(((i+1) % bunit[j]) == 0) bvalue[j]++;
668
if(bvalue[j] > (hiA[j]-loA[j])) bvalue[j] = 0;
673
temp=((int*)data_ptr)+idx;
676
temp=((DoubleComplex*)data_ptr)+idx;
679
temp=((SingleComplex*)data_ptr)+idx;
682
temp=((double*)data_ptr)+idx;
685
temp=((float*)data_ptr)+idx;
688
temp=((long *)data_ptr)+idx;
690
default: ga_error("wrong data type.",type);
696
do_abs(temp ,hiA[0] -loA[0] +1, type); break;
699
do_add_const(temp ,hiA[0] -loA[0] +1, type, scalar);
702
do_recip(temp ,hiA[0] -loA[0] +1, type); break;
704
default: ga_error("bad operation",op);
710
static void FATR gai_oper_elem(Integer *g_a, Integer *lo, Integer *hi, void *scalar, Integer op)
714
Integer ndim, dims[MAXDIM], type;
715
Integer loA[MAXDIM], hiA[MAXDIM], ld[MAXDIM];
716
void *temp, *data_ptr;
717
Integer me= ga_nodeid_();
719
int local_sync_begin,local_sync_end;
721
local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
722
_ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
723
if(local_sync_begin)ga_sync_();
725
ga_check_handle(g_a, "gai_oper_elem");
727
GA_PUSH_NAME("gai_oper_elem");
729
nga_inquire_internal_(g_a, &type, &ndim, dims);
730
num_blocks = ga_total_blocks_(g_a);
732
if (num_blocks < 0) {
733
/* get limits of VISIBLE patch */
734
nga_distribution_(g_a, &me, loA, hiA);
736
/* determine subset of my local patch to access */
737
/* Output is in loA and hiA */
738
if(ngai_patch_intersect(lo, hi, loA, hiA, ndim)){
740
/* get data_ptr to corner of patch */
741
/* ld are leading dimensions INCLUDING ghost cells */
742
nga_access_ptr(g_a, loA, hiA, &data_ptr, ld);
744
/* perform operation on all elements in local patch */
745
ngai_do_oper_elem(type, ndim, loA, hiA, ld, data_ptr, scalar, op);
747
/* release access to the data */
748
nga_release_update_(g_a, loA, hiA);
751
Integer offset, j, jtmp, chk;
753
Integer nproc = ga_nnodes_();
754
/* using simple block-cyclic data distribution */
755
if (!ga_uses_proc_grid_(g_a)){
756
for (i=me; i<num_blocks; i += nproc) {
758
/* get limits of patch */
759
nga_distribution_(g_a, &i, loA, hiA);
761
/* loA is changed by ngai_patch_intersect, so
763
for (j=0; j<ndim; j++) {
767
/* determine subset of my local patch to access */
768
/* Output is in loA and hiA */
769
if(ngai_patch_intersect(lo, hi, loA, hiA, ndim)){
771
/* get data_ptr to corner of patch */
772
/* ld are leading dimensions for block */
773
nga_access_block_ptr(g_a, &i, &data_ptr, ld);
775
/* Check for partial overlap */
777
for (j=0; j<ndim; j++) {
778
if (loS[j] < loA[j]) {
784
/* Evaluate additional offset for pointer */
787
for (j=0; j<ndim-1; j++) {
788
offset += (loA[j]-loS[j])*jtmp;
791
offset += (loA[ndim-1]-loS[ndim-1])*jtmp;
794
data_ptr = (void*)((int*)data_ptr + offset);
797
data_ptr = (void*)((double*)data_ptr + 2*offset);
800
data_ptr = (void*)((float*)data_ptr + 2*offset);
803
data_ptr = (void*)((double*)data_ptr + offset);
806
data_ptr = (void*)((float*)data_ptr + offset);
809
data_ptr = (void*)((long*)data_ptr + offset);
811
default: ga_error(" wrong data type ",type);
814
/* perform operation on all elements in local patch */
815
ngai_do_oper_elem(type, ndim, loA, hiA, ld, data_ptr, scalar, op);
817
/* release access to the data */
818
nga_release_update_block_(g_a, &i);
822
/* using scalapack block-cyclic data distribution */
823
Integer proc_index[MAXDIM], index[MAXDIM];
824
Integer topology[MAXDIM];
825
Integer blocks[MAXDIM], block_dims[MAXDIM];
826
ga_get_proc_index_(g_a, &me, proc_index);
827
ga_get_proc_index_(g_a, &me, index);
828
ga_get_block_info_(g_a, blocks, block_dims);
829
ga_get_proc_grid_(g_a, topology);
830
while (index[ndim-1] < blocks[ndim-1]) {
831
/* find bounding coordinates of block */
833
for (i = 0; i < ndim; i++) {
834
loA[i] = index[i]*block_dims[i]+1;
835
hiA[i] = (index[i] + 1)*block_dims[i];
836
if (hiA[i] > dims[i]) hiA[i] = dims[i];
837
if (hiA[i] < loA[i]) chk = 0;
840
/* loA is changed by ngai_patch_intersect, so
842
for (j=0; j<ndim; j++) {
846
/* determine subset of my local patch to access */
847
/* Output is in loA and hiA */
848
if(ngai_patch_intersect(lo, hi, loA, hiA, ndim)){
850
/* get data_ptr to corner of patch */
851
/* ld are leading dimensions for block */
852
nga_access_block_grid_ptr(g_a, index, &data_ptr, ld);
854
/* Check for partial overlap */
856
for (j=0; j<ndim; j++) {
857
if (loS[j] < loA[j]) {
863
/* Evaluate additional offset for pointer */
866
for (j=0; j<ndim-1; j++) {
867
offset += (loA[j]-loS[j])*jtmp;
870
offset += (loA[ndim-1]-loS[ndim-1])*jtmp;
873
data_ptr = (void*)((int*)data_ptr + offset);
876
data_ptr = (void*)((double*)data_ptr + 2*offset);
879
data_ptr = (void*)((float*)data_ptr + 2*offset);
882
data_ptr = (void*)((double*)data_ptr + offset);
885
data_ptr = (void*)((float*)data_ptr + offset);
888
data_ptr = (void*)((long*)data_ptr + offset);
890
default: ga_error(" wrong data type ",type);
894
/* perform operation on all elements in local patch */
895
ngai_do_oper_elem(type, ndim, loA, hiA, ld, data_ptr, scalar, op);
897
/* release access to the data */
898
nga_release_update_block_grid_(g_a, index);
900
/* increment index to get next block on processor */
901
index[0] += topology[0];
902
for (i = 0; i < ndim; i++) {
903
if (index[i] >= blocks[i] && i<ndim-1) {
904
index[i] = proc_index[i];
905
index[i+1] += topology[i+1];
912
if(local_sync_end)ga_sync_();
915
void FATR ga_abs_value_patch_(Integer *g_a, Integer *lo, Integer *hi)
917
gai_oper_elem(g_a, lo, hi, NULL, OP_ABS);
920
void FATR ga_recip_patch_(Integer *g_a, Integer *lo, Integer *hi)
922
gai_oper_elem(g_a, lo, hi, NULL, OP_RECIP);
926
void FATR ga_add_constant_patch_(Integer *g_a, Integer *lo, Integer *hi, void *alpha)
928
gai_oper_elem(g_a, lo, hi, alpha, OP_ADD_CONST);
932
void FATR ga_abs_value_(Integer *g_a)
935
Integer lo[MAXDIM],hi[MAXDIM];
937
nga_inquire_internal_(g_a, &type, &ndim, hi);
942
_ga_sync_begin = 1; /*just to be on the safe side*/
943
gai_oper_elem(g_a, lo, hi, NULL, OP_ABS);
946
void FATR ga_add_constant_(Integer *g_a, void *alpha)
949
Integer lo[MAXDIM],hi[MAXDIM];
951
nga_inquire_internal_(g_a, &type, &ndim, hi);
956
_ga_sync_begin = 1; /*just to be on the safe side*/
957
gai_oper_elem(g_a, lo, hi, alpha, OP_ADD_CONST);
960
void FATR ga_recip_(Integer *g_a)
963
Integer lo[MAXDIM],hi[MAXDIM];
965
nga_inquire_internal_(g_a, &type, &ndim, hi);
970
_ga_sync_begin = 1; /*just to be on the safe side*/
971
gai_oper_elem(g_a, lo, hi, NULL, OP_RECIP);
976
static void do_multiply(void *pA, void *pB, void *pC, Integer nelems, Integer type){
980
double aReal, aImag, bReal, bImag, cReal, cImag;
983
for(i = 0; i<nelems; i++)
984
((double*)pC)[i]= ((double*)pA)[i]*((double*)pB)[i];
987
for(i = 0; i<nelems; i++) {
988
aReal = ((DoubleComplex*)pA)[i].real;
989
bReal = ((DoubleComplex*)pB)[i].real;
990
aImag = ((DoubleComplex*)pA)[i].imag;
991
bImag = ((DoubleComplex*)pB)[i].imag;
992
((DoubleComplex*)pC)[i].real = aReal*bReal-aImag*bImag;
993
((DoubleComplex*)pC)[i].imag = aReal*bImag+aImag*bReal;
997
for(i = 0; i<nelems; i++) {
998
aReal = ((SingleComplex*)pA)[i].real;
999
bReal = ((SingleComplex*)pB)[i].real;
1000
aImag = ((SingleComplex*)pA)[i].imag;
1001
bImag = ((SingleComplex*)pB)[i].imag;
1002
((SingleComplex*)pC)[i].real = aReal*bReal-aImag*bImag;
1003
((SingleComplex*)pC)[i].imag = aReal*bImag+aImag*bReal;
1007
for(i = 0; i<nelems; i++)
1008
((int*)pC)[i] = ((int*)pA)[i]* ((int*)pB)[i];
1011
for(i = 0; i<nelems; i++)
1012
((float*)pC)[i]= ((float*)pA)[i]*((float*)pB)[i];
1015
for(i = 0; i<nelems; i++)
1016
((long *)pC)[i]= ((long *)pA)[i]* ((long *)pB)[i];
1019
default: ga_error(" wrong data type ",type);
1024
static void do_divide(void *pA, void *pB, void *pC, Integer nelems, Integer type){
1026
Modified by Doug Baxter to call ga_error on divide by 0.
1027
A do_step_divide is added below to return properly signed
1028
infinity for the step_max functions.
1031
double aReal, aImag, bReal, bImag, cReal, cImag;
1037
for(i = 0; i<nelems; i++) {
1038
if(((double*)pB)[i]!=(double)0.0)
1039
((double*)pC)[i]= ((double*)pA)[i]/((double*)pB)[i];
1041
ga_error("zero divisor ",((double*)pB)[i]);
1046
for(i = 0; i<nelems; i++) {
1047
aReal = ((DoubleComplex*)pA)[i].real;
1048
bReal = ((DoubleComplex*)pB)[i].real;
1049
aImag = ((DoubleComplex*)pA)[i].imag;
1050
bImag = ((DoubleComplex*)pB)[i].imag;
1052
The following original algorithm overflows
1054
temp = bReal*bReal+bImag*bImag;
1056
((DoubleComplex*)pC)[i].real
1057
=(aReal*bReal+aImag*bImag)/temp;
1058
((DoubleComplex*)pC)[i].imag
1059
=(aImag*bReal-aReal*bImag)/temp;
1062
ga_error("zero divisor ",temp);
1065
if (GA_ABS(bReal) >= GA_ABS(bImag)) {
1066
if (bReal != (double)0.0) {
1069
x2 = ((double)1.0)/(bReal*(((double)1.0)+(x1*x1)));
1070
((DoubleComplex*)pC)[i].real = (aReal + aImag*x1)*x2;
1071
((DoubleComplex*)pC)[i].imag = (aImag - aReal*x1)*x2;
1074
ga_error("zero divisor ",bReal);
1079
x2 = ((double)1.0)/(bImag*(((double)1.0)+(x1*x1)));
1080
((DoubleComplex*)pC)[i].real = (aReal*x1 + aImag)*x2;
1081
((DoubleComplex*)pC)[i].imag = (aImag*x1 - aReal)*x2;
1086
for(i = 0; i<nelems; i++) {
1087
aReal = ((SingleComplex*)pA)[i].real;
1088
bReal = ((SingleComplex*)pB)[i].real;
1089
aImag = ((SingleComplex*)pA)[i].imag;
1090
bImag = ((SingleComplex*)pB)[i].imag;
1092
The following original algorithm overflows
1094
temp = bReal*bReal+bImag*bImag;
1096
((SingleComplex*)pC)[i].real
1097
=(aReal*bReal+aImag*bImag)/temp;
1098
((SingleComplex*)pC)[i].imag
1099
=(aImag*bReal-aReal*bImag)/temp;
1102
ga_error("zero divisor ",temp);
1105
if (GA_ABS(bReal) >= GA_ABS(bImag)) {
1106
if (bReal != (float)0.0) {
1109
x2 = ((float)1.0)/(bReal*(((float)1.0)+(x1*x1)));
1110
((SingleComplex*)pC)[i].real = (aReal + aImag*x1)*x2;
1111
((SingleComplex*)pC)[i].imag = (aImag - aReal*x1)*x2;
1114
ga_error("zero divisor ",bReal);
1119
x2 = ((float)1.0)/(bImag*(((float)1.0)+(x1*x1)));
1120
((SingleComplex*)pC)[i].real = (aReal*x1 + aImag)*x2;
1121
((SingleComplex*)pC)[i].imag = (aImag*x1 - aReal)*x2;
1126
for(i = 0; i<nelems; i++){
1127
if(((int*)pB)[i]!=0)
1128
((int*)pC)[i] = ((int*)pA)[i]/((int*)pB)[i];
1130
ga_error("zero divisor ",((int*)pB)[i]);
1135
for(i = 0; i<nelems; i++){
1136
if(((float*)pB)[i]!=(float)0.0)
1137
((float*)pC)[i]= ((float*)pA)[i]/((float*)pB)[i];
1139
ga_error("zero divisor ",((float*)pB)[i]);
1144
for(i = 0; i<nelems; i++){
1145
if(((long *)pB)[i]!=0)
1146
((long *)pC)[i]= ((long *)pA)[i]/((long *)pB)[i];
1148
ga_error("zero divisor ",((long*)pB)[i]);
1152
default: ga_error(" wrong data type ",type);
1157
static void do_step_divide(void *pA, void *pB, void *pC, Integer nelems, Integer type){
1158
/* Elementwise divide, not aborting on a zero denominator, but
1159
returning an infinity. If an element in the numerator vector (PA)
1160
is zero, then infinity is returned if the corresponding denominator
1161
element is non-negative, else zero is returned for that element.
1177
for(i = 0; i<nelems; i++) {
1178
if(((double*)pA)[i] == d_0) {
1179
if(((double*)pB)[i]>=d_0) {
1180
((double*)pC)[i]= GA_INFINITY_D;
1182
((double*)pC)[i]= d_0;
1185
if(((double*)pB)[i]!=d_0)
1186
((double*)pC)[i]= ((double*)pA)[i]/((double*)pB)[i];
1188
/* if b is zero an infinite number could be added without
1189
changing the sign of a.
1191
((double*)pC)[i]= GA_INFINITY_D;
1197
ga_error(" do_step_divide called with type C_DCPL",C_DCPL);
1200
ga_error(" do_step_divide called with type C_SCPL",C_SCPL);
1204
for(i = 0; i<nelems; i++){
1205
if(((int*)pA)[i]==i_0) {
1206
if(((int*)pB)[i]>=i_0) {
1207
((int*)pC)[i]=GA_INFINITY_I;
1212
if(((int*)pB)[i]!=i_0)
1213
((int*)pC)[i] = ((int*)pA)[i]/((int*)pB)[i];
1215
((int*)pC)[i]=GA_INFINITY_I;
1222
for(i = 0; i<nelems; i++){
1223
if(((float*)pA)[i]==f_0) {
1224
if(((float*)pB)[i]>=f_0) {
1225
((float*)pC)[i]= GA_INFINITY_F;
1227
((float*)pC)[i]= f_0;
1230
if(((float*)pB)[i]!=f_0) {
1231
((float*)pC)[i]= ((float*)pA)[i]/((float*)pB)[i];
1233
/* _F added 01/24/04 */
1234
((float*)pC)[i]= GA_INFINITY_F;
1241
for(i = 0; i<nelems; i++){
1242
if(((long *)pA)[i]==l_0) {
1243
if(((long *)pB)[i]>=l_0) {
1244
((long *)pC)[i] = GA_INFINITY_L;
1246
((long *)pC)[i] = l_0;
1249
if(((long *)pB)[i]!=l_0)
1250
((long *)pC)[i]= ((long *)pA)[i]/((long *)pB)[i];
1252
((long *)pC)[i] = GA_INFINITY_L;
1257
default: ga_error(" wrong data type ",type);
1261
static void do_stepb_divide(void *pA, void *pB, void *pC, Integer nelems, Integer type){
1262
/* Elementwise divide, not aborting on a zero denominator, but
1263
returning an infinity. If an element in the numerator vector (PA)
1264
is zero, then infinity is returned if the corresponding denominator
1265
element is non-negative, else zero is returned for that element.
1281
for(i = 0; i<nelems; i++) {
1282
if(((double*)pA)[i] == d_0) {
1283
if(((double*)pB)[i]>d_0) {
1284
((double*)pC)[i]= d_0;
1286
((double*)pC)[i]= GA_INFINITY_D;
1289
if(((double*)pB)[i]!=d_0)
1290
((double*)pC)[i]= ((double*)pA)[i]/((double*)pB)[i];
1292
/* if b is zero an infinite number could be added without
1293
changing the sign of a.
1295
((double*)pC)[i]= GA_INFINITY_D;
1301
ga_error(" do_stepb_divide called with type C_DCPL",C_DCPL);
1304
ga_error(" do_stepb_divide called with type C_SCPL",C_SCPL);
1308
for(i = 0; i<nelems; i++){
1309
if(((int*)pA)[i]==i_0) {
1310
if(((int*)pB)[i]>i_0) {
1313
((int*)pC)[i]=GA_INFINITY_I;
1316
if(((int*)pB)[i]!=i_0)
1317
((int*)pC)[i] = ((int*)pA)[i]/((int*)pB)[i];
1319
((int*)pC)[i]=GA_INFINITY_I;
1326
for(i = 0; i<nelems; i++){
1327
if(((float*)pA)[i]==f_0) {
1328
if(((float*)pB)[i]>f_0) {
1329
((float*)pC)[i]= f_0;
1331
((float*)pC)[i]= GA_INFINITY_F;
1334
if(((float*)pB)[i]!=f_0) {
1335
((float*)pC)[i]= ((float*)pA)[i]/((float*)pB)[i];
1337
/* _F added 01/24/04 */
1338
((float*)pC)[i]= GA_INFINITY_F;
1345
for(i = 0; i<nelems; i++){
1346
if(((long *)pA)[i]==l_0) {
1347
if(((long *)pB)[i]>l_0) {
1348
((long *)pC)[i] = l_0;
1350
((long *)pC)[i] = GA_INFINITY_L;
1353
if(((long *)pB)[i]!=l_0)
1354
((long *)pC)[i]= ((long *)pA)[i]/((long *)pB)[i];
1356
((long *)pC)[i] = GA_INFINITY_L;
1361
default: ga_error(" do_stepb_divide: wrong data type ",type);
1365
static void do_step_mask(void *pA, void *pB, void *pC, Integer nelems, Integer type){
1367
Set vector C to vector B wherever vector A is nonzero,
1368
and to zero wherever vector A is zero.
1371
double aReal, aImag, bReal, bImag, cReal, cImag;
1377
for(i = 0; i<nelems; i++) {
1378
if(((double*)pA)[i]!=(double)0.0) {
1379
((double*)pC)[i]= ((double*)pB)[i];
1381
((double*)pC)[i]= (double)0.0;
1386
ga_error(" do_step_mask called with type C_DCPL",C_DCPL);
1389
ga_error(" do_step_mask called with type C_SCPL",C_SCPL);
1392
for(i = 0; i<nelems; i++){
1393
if(((int*)pA)[i]!=(int)0){
1394
((int*)pC)[i] = ((int*)pB)[i];
1396
((int*)pC)[i] = (int)0;
1401
for(i = 0; i<nelems; i++){
1402
if(((float*)pA)[i]!=(float)0.0) {
1403
((float*)pC)[i]= ((float*)pB)[i];
1405
((float*)pC)[i]= (float)0.0;
1410
for(i = 0; i<nelems; i++){
1411
if(((long *)pA)[i]!=(long)0){
1412
((long *)pC)[i]= ((long *)pB)[i];
1414
((long *)pC)[i]=(long)0;
1418
default: ga_error(" do_step_mask: wrong data type ",type);
1423
static void do_maximum(void *pA, void *pB, void *pC, Integer nelems, Integer type){
1425
This routine was modified by DJB to scale components
1426
so as not to unecessarily overflow.
1429
double aReal, aImag, bReal, bImag, cReal, cImag, temp1, temp2;
1434
for(i = 0; i<nelems; i++)
1435
((double*)pC)[i] = GA_MAX(((double*)pA)[i],((double*)pB)[i]);
1438
for(i = 0; i<nelems; i++) {
1439
aReal = ((DoubleComplex*)pA)[i].real;
1440
bReal = ((DoubleComplex*)pB)[i].real;
1441
aImag = ((DoubleComplex*)pA)[i].imag;
1442
bImag = ((DoubleComplex*)pB)[i].imag;
1443
x1 = GA_MAX(GA_ABS(aReal),GA_ABS(aImag));
1444
x2 = GA_MAX(GA_ABS(bReal),GA_ABS(bImag));
1446
if (x1 == (double)0.0) {
1447
((DoubleComplex*)pC)[i].real=((DoubleComplex*)pA)[i].real;
1448
((DoubleComplex*)pC)[i].imag=((DoubleComplex*)pA)[i].imag;
1450
x1 = ((double)1.0)/x1;
1455
temp1 = (aReal*aReal)+(aImag*aImag);
1456
temp2 = (bReal*bReal)+(bImag*bImag);
1458
((DoubleComplex*)pC)[i].real=((DoubleComplex*)pA)[i].real;
1459
((DoubleComplex*)pC)[i].imag=((DoubleComplex*)pA)[i].imag;
1462
((DoubleComplex*)pC)[i].real=((DoubleComplex*)pB)[i].real;
1463
((DoubleComplex*)pC)[i].imag=((DoubleComplex*)pB)[i].imag;
1469
for(i = 0; i<nelems; i++) {
1470
aReal = ((SingleComplex*)pA)[i].real;
1471
bReal = ((SingleComplex*)pB)[i].real;
1472
aImag = ((SingleComplex*)pA)[i].imag;
1473
bImag = ((SingleComplex*)pB)[i].imag;
1474
x1 = GA_MAX(GA_ABS(aReal),GA_ABS(aImag));
1475
x2 = GA_MAX(GA_ABS(bReal),GA_ABS(bImag));
1477
if (x1 == (double)0.0) {
1478
((SingleComplex*)pC)[i].real=((SingleComplex*)pA)[i].real;
1479
((SingleComplex*)pC)[i].imag=((SingleComplex*)pA)[i].imag;
1481
x1 = ((double)1.0)/x1;
1486
temp1 = (aReal*aReal)+(aImag*aImag);
1487
temp2 = (bReal*bReal)+(bImag*bImag);
1489
((SingleComplex*)pC)[i].real=((SingleComplex*)pA)[i].real;
1490
((SingleComplex*)pC)[i].imag=((SingleComplex*)pA)[i].imag;
1493
((SingleComplex*)pC)[i].real=((SingleComplex*)pB)[i].real;
1494
((SingleComplex*)pC)[i].imag=((SingleComplex*)pB)[i].imag;
1500
for(i = 0; i<nelems; i++)
1501
((int*)pC)[i] =GA_MAX(((int*)pA)[i],((int*)pB)[i]);
1504
for(i = 0; i<nelems; i++)
1505
((float*)pC)[i]=GA_MAX(((float*)pA)[i],((float*)pB)[i]);
1509
for(i = 0; i<nelems; i++)
1510
((long *)pC)[i]=GA_MAX(((long *)pA)[i],((long *)pB)[i]);
1513
default: ga_error(" wrong data type ",type);
1518
static void do_minimum(void *pA, void *pB, void *pC, Integer nelems, Integer type){
1520
This routine was modified by DJB to scale components
1521
so as not to unecessarily overflow.
1527
double aReal, aImag, bReal, bImag, cReal, cImag, temp1, temp2;
1530
for(i = 0; i<nelems; i++)
1531
((double*)pC)[i] = GA_MIN(((double*)pA)[i],((double*)pB)[i]);
1534
for(i = 0; i<nelems; i++) {
1535
aReal = ((DoubleComplex*)pA)[i].real;
1536
bReal = ((DoubleComplex*)pB)[i].real;
1537
aImag = ((DoubleComplex*)pA)[i].imag;
1538
bImag = ((DoubleComplex*)pB)[i].imag;
1539
x1 = GA_MAX(GA_ABS(aReal),GA_ABS(aImag));
1540
x2 = GA_MAX(GA_ABS(bReal),GA_ABS(bImag));
1542
if (x1 == (double)0.0) {
1543
((DoubleComplex*)pC)[i].real=((DoubleComplex*)pA)[i].real;
1544
((DoubleComplex*)pC)[i].imag=((DoubleComplex*)pA)[i].imag;
1546
x1 = ((double)1.0)/x1;
1551
temp1 = aReal*aReal+aImag*aImag;
1552
temp2 = bReal*bReal+bImag*bImag;
1554
((DoubleComplex*)pC)[i].real=((DoubleComplex*)pA)[i].real;
1555
((DoubleComplex*)pC)[i].imag=((DoubleComplex*)pA)[i].imag;
1558
((DoubleComplex*)pC)[i].real=((DoubleComplex*)pB)[i].real;
1559
((DoubleComplex*)pC)[i].imag=((DoubleComplex*)pB)[i].imag;
1565
for(i = 0; i<nelems; i++) {
1566
aReal = ((SingleComplex*)pA)[i].real;
1567
bReal = ((SingleComplex*)pB)[i].real;
1568
aImag = ((SingleComplex*)pA)[i].imag;
1569
bImag = ((SingleComplex*)pB)[i].imag;
1570
x1 = GA_MAX(GA_ABS(aReal),GA_ABS(aImag));
1571
x2 = GA_MAX(GA_ABS(bReal),GA_ABS(bImag));
1573
if (x1 == (double)0.0) {
1574
((SingleComplex*)pC)[i].real=((SingleComplex*)pA)[i].real;
1575
((SingleComplex*)pC)[i].imag=((SingleComplex*)pA)[i].imag;
1577
x1 = ((double)1.0)/x1;
1582
temp1 = aReal*aReal+aImag*aImag;
1583
temp2 = bReal*bReal+bImag*bImag;
1585
((SingleComplex*)pC)[i].real=((SingleComplex*)pA)[i].real;
1586
((SingleComplex*)pC)[i].imag=((SingleComplex*)pA)[i].imag;
1589
((SingleComplex*)pC)[i].real=((SingleComplex*)pB)[i].real;
1590
((SingleComplex*)pC)[i].imag=((SingleComplex*)pB)[i].imag;
1596
for(i = 0; i<nelems; i++)
1597
((int*)pC)[i] =GA_MIN(((int*)pA)[i],((int*)pB)[i]);
1600
for(i = 0; i<nelems; i++)
1601
((float*)pC)[i]=GA_MIN(((float*)pA)[i],((float*)pB)[i]);
1604
for(i = 0; i<nelems; i++)
1605
((long *)pC)[i]=GA_MIN(((long *)pA)[i],((long *)pB)[i]);
1608
default: ga_error(" wrong data type ",type);
1612
void ngai_do_elem2_oper(Integer atype, Integer cndim, Integer *loC, Integer *hiC,
1613
Integer *ldC, void *A_ptr, void *B_ptr, void *C_ptr, int op)
1616
Integer bvalue[MAXDIM], bunit[MAXDIM], baseldC[MAXDIM];
1617
void *tempA, *tempB, *tempC;
1619
/* compute "local" operation accoording to op */
1621
/* number of n-element of the first dimension */
1622
n1dim = 1; for(i=1; i<cndim; i++) n1dim *= (hiC[i] - loC[i] + 1);
1624
/* calculate the destination indices */
1625
bvalue[0] = 0; bvalue[1] = 0; bunit[0] = 1; bunit[1] = 1;
1626
/* baseld[0] = ld[0]
1627
* baseld[1] = ld[0] * ld[1]
1628
* baseld[2] = ld[0] * ld[1] * ld[2] .....
1630
baseldC[0] = ldC[0]; baseldC[1] = baseldC[0] *ldC[1];
1631
for(i=2; i<cndim; i++) {
1633
bunit[i] = bunit[i-1] * (hiC[i-1] - loC[i-1] + 1);
1634
baseldC[i] = baseldC[i-1] * ldC[i];
1638
for(i=0; i<n1dim; i++) {
1640
for(j=1; j<cndim; j++) {
1641
idx += bvalue[j] * baseldC[j-1];
1642
if(((i+1) % bunit[j]) == 0) bvalue[j]++;
1643
if(bvalue[j] > (hiC[j]-loC[j])) bvalue[j] = 0;
1648
tempA=((double*)A_ptr)+idx;
1649
tempB=((double*)B_ptr)+idx;
1650
tempC=((double*)C_ptr)+idx;
1653
tempA=((DoubleComplex*)A_ptr)+idx;
1654
tempB=((DoubleComplex*)B_ptr)+idx;
1655
tempC=((DoubleComplex*)C_ptr)+idx;
1658
tempA=((SingleComplex*)A_ptr)+idx;
1659
tempB=((SingleComplex*)B_ptr)+idx;
1660
tempC=((SingleComplex*)C_ptr)+idx;
1663
tempA=((int*)A_ptr)+idx;
1664
tempB=((int*)B_ptr)+idx;
1665
tempC=((int*)C_ptr)+idx;
1668
tempA=((float*)A_ptr)+idx;
1669
tempB=((float*)B_ptr)+idx;
1670
tempC=((float*)C_ptr)+idx;
1673
tempA=((long *)A_ptr)+idx;
1674
tempB=((long *)B_ptr)+idx;
1675
tempC=((long *)C_ptr)+idx;
1678
default: ga_error(" wrong data type ",atype);
1683
do_multiply(tempA,tempB,tempC,hiC[0]-loC[0]+1,atype);
1686
do_divide(tempA,tempB,tempC,hiC[0]-loC[0]+1,atype);
1689
do_step_divide(tempA,tempB,tempC,hiC[0]-loC[0]+1,atype);
1692
do_stepb_divide(tempA,tempB,tempC,hiC[0]-loC[0]+1,atype);
1695
do_step_mask(tempA,tempB,tempC,hiC[0]-loC[0]+1,atype);
1698
do_maximum(tempA,tempB,tempC,hiC[0]-loC[0]+1,atype);
1701
do_minimum(tempA,tempB,tempC,hiC[0]-loC[0]+1,atype);
1704
printf("op : OP_ELEM_MULT = %d:%d\n", op, OP_ELEM_MULT);
1705
ga_error(" wrong operation ",op);
1710
/*\ generic operation of two patches
1712
static void FATR ngai_elem2_patch_(g_a, alo, ahi, g_b, blo, bhi,
1714
Integer *g_a, *alo, *ahi; /* patch of g_a */
1715
Integer *g_b, *blo, *bhi; /* patch of g_b */
1716
Integer *g_c, *clo, *chi; /* patch of g_c */
1717
int op; /* operation to be perform between g_a and g_b */
1721
Integer atype, btype, ctype;
1722
Integer andim, adims[MAXDIM], bndim, bdims[MAXDIM], cndim, cdims[MAXDIM];
1723
Integer loA[MAXDIM], hiA[MAXDIM], ldA[MAXDIM];
1724
Integer loB[MAXDIM], hiB[MAXDIM], ldB[MAXDIM];
1725
Integer loC[MAXDIM], hiC[MAXDIM], ldC[MAXDIM];
1726
void *A_ptr, *B_ptr, *C_ptr;
1728
Integer atotal, btotal;
1729
Integer g_A = *g_a, g_B = *g_b;
1730
Integer num_blocks_a, num_blocks_b, num_blocks_c;
1731
Integer me= ga_nodeid_(), A_created=0, B_created=0;
1732
char *tempname = "temp", notrans='n';
1733
int local_sync_begin,local_sync_end;
1735
local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
1736
_ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
1737
if(local_sync_begin)ga_sync_();
1738
ga_check_handle(g_a, "gai_elem2_patch_");
1739
GA_PUSH_NAME("ngai_elem2_patch_");
1741
nga_inquire_internal_(g_a, &atype, &andim, adims);
1742
nga_inquire_internal_(g_b, &btype, &bndim, bdims);
1743
nga_inquire_internal_(g_c, &ctype, &cndim, cdims);
1745
if(atype != btype || atype != ctype ) ga_error(" types mismatch ", 0L);
1747
/* check if patch indices and dims match */
1748
for(i=0; i<andim; i++)
1749
if(alo[i] <= 0 || ahi[i] > adims[i])
1750
ga_error("g_a indices out of range ", *g_a);
1751
for(i=0; i<bndim; i++)
1752
if(blo[i] <= 0 || bhi[i] > bdims[i])
1753
ga_error("g_b indices out of range ", *g_b);
1754
for(i=0; i<cndim; i++)
1755
if(clo[i] <= 0 || chi[i] > cdims[i])
1756
ga_error("g_c indices out of range ", *g_c);
1758
/* check if numbers of elements in patches match each other */
1759
n1dim = 1; for(i=0; i<cndim; i++) n1dim *= (chi[i] - clo[i] + 1);
1760
atotal = 1; for(i=0; i<andim; i++) atotal *= (ahi[i] - alo[i] + 1);
1761
btotal = 1; for(i=0; i<bndim; i++) btotal *= (bhi[i] - blo[i] + 1);
1763
if((atotal != n1dim) || (btotal != n1dim))
1764
ga_error(" capacities of patches do not match ", 0L);
1766
num_blocks_a = ga_total_blocks_(g_a);
1767
num_blocks_b = ga_total_blocks_(g_b);
1768
num_blocks_c = ga_total_blocks_(g_c);
1770
if (num_blocks_a < 0 && num_blocks_b < 0 && num_blocks_c < 0) {
1771
/* find out coordinates of patches of g_a, g_b and g_c that I own */
1772
nga_distribution_(&g_A, &me, loA, hiA);
1773
nga_distribution_(&g_B, &me, loB, hiB);
1774
nga_distribution_( g_c, &me, loC, hiC);
1776
/* test if the local portion of patches matches */
1777
if(ngai_comp_patch(andim, loA, hiA, cndim, loC, hiC) &&
1778
ngai_comp_patch(andim, alo, ahi, cndim, clo, chi)) compatible = 1;
1779
else compatible = 0;
1780
ga_igop(GA_TYPE_GSM, &compatible, 1, "*");
1782
/* either patches or distributions do not match:
1783
* - create a temp array that matches distribution of g_c
1787
nga_copy_patch(¬rans, g_a, alo, ahi, g_c, clo, chi);
1790
nga_distribution_(&g_A, &me, loA, hiA);
1793
if (!ga_duplicate(g_c, &g_A, tempname))
1794
ga_error("ga_dadd_patch: dup failed", 0L);
1795
nga_copy_patch(¬rans, g_a, alo, ahi, &g_A, clo, chi);
1798
nga_distribution_(&g_A, &me, loA, hiA);
1802
/* test if the local portion of patches matches */
1803
if(ngai_comp_patch(bndim, loB, hiB, cndim, loC, hiC) &&
1804
ngai_comp_patch(bndim, blo, bhi, cndim, clo, chi)) compatible = 1;
1805
else compatible = 0;
1806
ga_igop(GA_TYPE_GSM, &compatible, 1, "*");
1808
/* either patches or distributions do not match:
1809
* - create a temp array that matches distribution of g_c
1810
* - copy & reshape patch of g_b into g_B
1812
if (!ga_duplicate(g_c, &g_B, tempname))
1813
ga_error("ga_dadd_patch: dup failed", 0L);
1814
nga_copy_patch(¬rans, g_b, blo, bhi, &g_B, clo, chi);
1817
nga_distribution_(&g_B, &me, loB, hiB);
1820
if(andim > bndim) cndim = bndim;
1821
if(andim < bndim) cndim = andim;
1823
if(!ngai_comp_patch(andim, loA, hiA, cndim, loC, hiC))
1824
ga_error(" A patch mismatch ", g_A);
1825
if(!ngai_comp_patch(bndim, loB, hiB, cndim, loC, hiC))
1826
ga_error(" B patch mismatch ", g_B);
1828
/* determine subsets of my patches to access */
1829
if (ngai_patch_intersect(clo, chi, loC, hiC, cndim)){
1830
nga_access_ptr(&g_A, loC, hiC, &A_ptr, ldA);
1831
nga_access_ptr(&g_B, loC, hiC, &B_ptr, ldB);
1832
nga_access_ptr( g_c, loC, hiC, &C_ptr, ldC);
1834
/* compute "local" operation accoording to op */
1835
ngai_do_elem2_oper(atype, cndim, loC, hiC, ldC, A_ptr, B_ptr, C_ptr, op);
1837
/* release access to the data */
1838
nga_release_ (&g_A, loC, hiC);
1839
nga_release_ (&g_B, loC, hiC);
1840
nga_release_update_( g_c, loC, hiC);
1844
/* create copies of arrays A and B that are identically distributed
1846
if (!ga_duplicate(g_c, &g_A, tempname))
1847
ga_error("ga_dadd_patch: dup failed", 0L);
1848
nga_copy_patch(¬rans, g_a, alo, ahi, &g_A, clo, chi);
1852
if (!ga_duplicate(g_c, &g_B, tempname))
1853
ga_error("ga_dadd_patch: dup failed", 0L);
1854
nga_copy_patch(¬rans, g_b, blo, bhi, &g_B, clo, chi);
1858
/* C is normally distributed so just add copies together for regular
1860
if (num_blocks_c < 0) {
1861
nga_distribution_( g_c, &me, loC, hiC);
1862
if(andim > bndim) cndim = bndim;
1863
if(andim < bndim) cndim = andim;
1864
if (ngai_patch_intersect(clo, chi, loC, hiC, cndim)){
1865
nga_access_ptr(&g_A, loC, hiC, &A_ptr, ldA);
1866
nga_access_ptr(&g_B, loC, hiC, &B_ptr, ldB);
1867
nga_access_ptr( g_c, loC, hiC, &C_ptr, ldC);
1869
/* compute "local" operation accoording to op */
1870
ngai_do_elem2_oper(atype, cndim, loC, hiC, ldC, A_ptr, B_ptr, C_ptr, op);
1872
/* release access to the data */
1873
nga_release_ (&g_A, loC, hiC);
1874
nga_release_ (&g_B, loC, hiC);
1875
nga_release_update_( g_c, loC, hiC);
1878
Integer lod[MAXDIM], hid[MAXDIM], chk;
1879
Integer offset, last, jtot;
1880
if (!ga_uses_proc_grid_(g_c)) {
1881
Integer nproc = ga_nnodes_();
1882
for (idx = me; idx < num_blocks_c; idx += nproc) {
1884
nga_distribution_(g_c, &idx, loC, hiC);
1885
/* make temporary copies of loC and hiC since ngai_patch_intersect
1886
destroys original versions */
1887
for (j=0; j<cndim; j++) {
1892
if (ngai_patch_intersect(clo, chi, loC, hiC, cndim)) {
1893
nga_access_block_ptr(&g_A, &idx, &A_ptr, ldA);
1894
nga_access_block_ptr(&g_B, &idx, &B_ptr, ldB);
1895
nga_access_block_ptr( g_c, &idx, &C_ptr, ldC);
1897
/* evaluate offsets for system */
1901
for (j=0; j<last; j++) {
1902
offset += (loC[j] - lod[j])*jtot;
1905
offset += (loC[last]-lod[last])*jtot;
1908
A_ptr = (void*)((double*)(A_ptr) + offset);
1909
B_ptr = (void*)((double*)(B_ptr) + offset);
1910
C_ptr = (void*)((double*)(C_ptr) + offset);
1913
A_ptr = (void*)((int*)(A_ptr) + offset);
1914
B_ptr = (void*)((int*)(B_ptr) + offset);
1915
C_ptr = (void*)((int*)(C_ptr) + offset);
1918
A_ptr = (void*)((DoubleComplex*)(A_ptr) + offset);
1919
B_ptr = (void*)((DoubleComplex*)(B_ptr) + offset);
1920
C_ptr = (void*)((DoubleComplex*)(C_ptr) + offset);
1923
A_ptr = (void*)((SingleComplex*)(A_ptr) + offset);
1924
B_ptr = (void*)((SingleComplex*)(B_ptr) + offset);
1925
C_ptr = (void*)((SingleComplex*)(C_ptr) + offset);
1928
A_ptr = (void*)((float*)(A_ptr) + offset);
1929
B_ptr = (void*)((float*)(B_ptr) + offset);
1930
C_ptr = (void*)((float*)(C_ptr) + offset);
1933
A_ptr = (void*)((long*)(A_ptr) + offset);
1934
B_ptr = (void*)((long*)(B_ptr) + offset);
1935
C_ptr = (void*)((long*)(C_ptr) + offset);
1941
/* compute "local" operation accoording to op */
1942
ngai_do_elem2_oper(atype, cndim, loC, hiC, ldC, A_ptr, B_ptr, C_ptr, op);
1944
/* release access to the data */
1945
nga_release_block_ (&g_A, &idx);
1946
nga_release_block_ (&g_B, &idx);
1947
nga_release_update_block_( g_c, &idx);
1951
/* Uses scalapack block-cyclic data distribution */
1952
Integer proc_index[MAXDIM], index[MAXDIM];
1953
Integer topology[MAXDIM];
1954
Integer blocks[MAXDIM], block_dims[MAXDIM];
1955
ga_get_proc_index_(g_c, &me, proc_index);
1956
ga_get_proc_index_(g_c, &me, index);
1957
ga_get_block_info_(g_c, blocks, block_dims);
1958
ga_get_proc_grid_(g_c, topology);
1959
while (index[cndim-1] < blocks[cndim-1]) {
1960
/* find bounding coordinates of block */
1962
for (i = 0; i < cndim; i++) {
1963
loC[i] = index[i]*block_dims[i]+1;
1964
hiC[i] = (index[i] + 1)*block_dims[i];
1965
if (hiC[i] > cdims[i]) hiC[i] = cdims[i];
1966
if (hiC[i] < loC[i]) chk = 0;
1968
/* make temporary copies of loC and hiC since ngai_patch_intersect
1969
destroys original versions */
1970
for (j=0; j<cndim; j++) {
1975
if (ngai_patch_intersect(clo, chi, loC, hiC, cndim)) {
1976
nga_access_block_grid_ptr(&g_A, index, &A_ptr, ldA);
1977
nga_access_block_grid_ptr(&g_B, index, &B_ptr, ldB);
1978
nga_access_block_grid_ptr( g_c, index, &C_ptr, ldC);
1980
/* evaluate offsets for system */
1984
for (j=0; j<last; j++) {
1985
offset += (loC[j] - lod[j])*jtot;
1988
offset += (loC[last]-lod[last])*jtot;
1991
A_ptr = (void*)((double*)(A_ptr) + offset);
1992
B_ptr = (void*)((double*)(B_ptr) + offset);
1993
C_ptr = (void*)((double*)(C_ptr) + offset);
1996
A_ptr = (void*)((int*)(A_ptr) + offset);
1997
B_ptr = (void*)((int*)(B_ptr) + offset);
1998
C_ptr = (void*)((int*)(C_ptr) + offset);
2001
A_ptr = (void*)((DoubleComplex*)(A_ptr) + offset);
2002
B_ptr = (void*)((DoubleComplex*)(B_ptr) + offset);
2003
C_ptr = (void*)((DoubleComplex*)(C_ptr) + offset);
2006
A_ptr = (void*)((SingleComplex*)(A_ptr) + offset);
2007
B_ptr = (void*)((SingleComplex*)(B_ptr) + offset);
2008
C_ptr = (void*)((SingleComplex*)(C_ptr) + offset);
2011
A_ptr = (void*)((float*)(A_ptr) + offset);
2012
B_ptr = (void*)((float*)(B_ptr) + offset);
2013
C_ptr = (void*)((float*)(C_ptr) + offset);
2016
A_ptr = (void*)((long*)(A_ptr) + offset);
2017
B_ptr = (void*)((long*)(B_ptr) + offset);
2018
C_ptr = (void*)((long*)(C_ptr) + offset);
2024
/* compute "local" operation accoording to op */
2025
ngai_do_elem2_oper(atype, cndim, loC, hiC, ldC, A_ptr, B_ptr, C_ptr, op);
2027
/* release access to the data */
2028
nga_release_block_grid_ (&g_A, index);
2029
nga_release_block_grid_ (&g_B, index);
2030
nga_release_update_block_grid_( g_c, index);
2032
/* increment index to get next block on processor */
2033
index[0] += topology[0];
2034
for (i = 0; i < cndim; i++) {
2035
if (index[i] >= blocks[i] && i<cndim-1) {
2036
index[i] = proc_index[i];
2037
index[i+1] += topology[i+1];
2045
if(A_created) ga_destroy_(&g_A);
2046
if(B_created) ga_destroy_(&g_B);
2049
if(local_sync_end)ga_sync_();
2052
void FATR ga_elem_multiply_(Integer *g_a, Integer *g_b, Integer *g_c){
2054
Integer atype, andim;
2055
Integer btype, bndim;
2056
Integer ctype, cndim;
2057
Integer alo[MAXDIM],ahi[MAXDIM];
2058
Integer blo[MAXDIM],bhi[MAXDIM];
2059
Integer clo[MAXDIM],chi[MAXDIM];
2061
nga_inquire_internal_(g_a, &atype, &andim, ahi);
2062
nga_inquire_internal_(g_b, &btype, &bndim, bhi);
2063
nga_inquire_internal_(g_c, &ctype, &cndim, chi);
2064
if((andim!=bndim)||(andim!=cndim))
2065
ga_error("global arrays have different dimmensions.", andim);
2074
_ga_sync_begin = 1; /*just to be on the safe side*/
2075
ngai_elem2_patch_(g_a, alo, ahi, g_b, blo, bhi,g_c,clo,chi,OP_ELEM_MULT);
2080
void FATR ga_elem_divide_(Integer *g_a, Integer *g_b, Integer *g_c){
2082
Integer atype, andim;
2083
Integer btype, bndim;
2084
Integer ctype, cndim;
2085
Integer alo[MAXDIM],ahi[MAXDIM];
2086
Integer blo[MAXDIM],bhi[MAXDIM];
2087
Integer clo[MAXDIM],chi[MAXDIM];
2089
nga_inquire_internal_(g_a, &atype, &andim, ahi);
2090
nga_inquire_internal_(g_b, &btype, &bndim, bhi);
2091
nga_inquire_internal_(g_c, &ctype, &cndim, chi);
2092
if((andim!=bndim)||(andim!=cndim))
2093
ga_error("global arrays have different dimmensions.", andim);
2103
_ga_sync_begin = 1; /*just to be on the safe side*/
2104
ngai_elem2_patch_(g_a, alo, ahi, g_b, blo, bhi,g_c,clo,chi,OP_ELEM_DIV);
2111
void FATR ga_elem_maximum_(Integer *g_a, Integer *g_b, Integer *g_c){
2113
Integer atype, andim;
2114
Integer btype, bndim;
2115
Integer ctype, cndim;
2116
Integer alo[MAXDIM],ahi[MAXDIM];
2117
Integer blo[MAXDIM],bhi[MAXDIM];
2118
Integer clo[MAXDIM],chi[MAXDIM];
2120
nga_inquire_internal_(g_a, &atype, &andim, ahi);
2121
nga_inquire_internal_(g_b, &btype, &bndim, bhi);
2122
nga_inquire_internal_(g_c, &ctype, &cndim, chi);
2123
if((andim!=bndim)||(andim!=cndim))
2124
ga_error("global arrays have different dimmensions.", andim);
2134
_ga_sync_begin = 1; /*just to be on the safe side*/
2135
ngai_elem2_patch_(g_a, alo, ahi, g_b, blo, bhi,g_c,clo,chi,OP_ELEM_MAX);
2140
void FATR ga_elem_minimum_(Integer *g_a, Integer *g_b, Integer *g_c){
2142
Integer atype, andim;
2143
Integer btype, bndim;
2144
Integer ctype, cndim;
2145
Integer alo[MAXDIM],ahi[MAXDIM];
2146
Integer blo[MAXDIM],bhi[MAXDIM];
2147
Integer clo[MAXDIM],chi[MAXDIM];
2149
nga_inquire_internal_(g_a, &atype, &andim, ahi);
2150
nga_inquire_internal_(g_b, &btype, &bndim, bhi);
2151
nga_inquire_internal_(g_c, &ctype, &cndim, chi);
2152
if((andim!=bndim)||(andim!=cndim))
2153
ga_error("global arrays have different dimmensions.", andim);
2163
_ga_sync_begin = 1; /*just to be on the safe side*/
2164
ngai_elem2_patch_(g_a, alo, ahi, g_b, blo, bhi,g_c,clo,chi,OP_ELEM_MIN);
2168
void FATR ga_elem_multiply_patch_(Integer *g_a,Integer *alo,Integer *ahi,Integer *g_b,Integer *blo,Integer *bhi,Integer *g_c,Integer *clo,Integer *chi){
2170
ngai_elem2_patch_(g_a, alo, ahi, g_b, blo, bhi,g_c,clo,chi,OP_ELEM_MULT);
2174
void FATR ga_elem_divide_patch_(Integer *g_a,Integer *alo,Integer *ahi,
2175
Integer *g_b,Integer *blo,Integer *bhi,Integer *g_c, Integer *clo,Integer *chi){
2177
ngai_elem2_patch_(g_a, alo, ahi, g_b, blo, bhi,g_c,clo,chi,OP_ELEM_DIV);
2181
void FATR ga_elem_step_divide_patch_(Integer *g_a,Integer *alo,Integer *ahi,
2182
Integer *g_b,Integer *blo,Integer *bhi,Integer *g_c, Integer *clo,Integer *chi){
2184
ngai_elem2_patch_(g_a, alo, ahi, g_b, blo, bhi,g_c,clo,chi,OP_ELEM_SDIV);
2188
void FATR ga_elem_stepb_divide_patch_(Integer *g_a,Integer *alo,Integer *ahi,
2189
Integer *g_b,Integer *blo,Integer *bhi,Integer *g_c, Integer *clo,Integer *chi){
2191
ngai_elem2_patch_(g_a, alo, ahi, g_b, blo, bhi,g_c,clo,chi,OP_ELEM_SDIV2);
2194
void FATR ga_step_mask_patch_(Integer *g_a,Integer *alo,Integer *ahi,
2195
Integer *g_b,Integer *blo,Integer *bhi,Integer *g_c, Integer *clo,Integer *chi){
2197
ngai_elem2_patch_(g_a, alo, ahi, g_b, blo, bhi,g_c,clo,chi,OP_STEP_MASK);
2200
void FATR ga_elem_maximum_patch_(Integer *g_a,Integer *alo,Integer *ahi,
2201
Integer *g_b,Integer *blo,Integer *bhi,Integer *g_c,Integer *clo,Integer *chi){
2203
ngai_elem2_patch_(g_a, alo, ahi, g_b, blo, bhi,g_c,clo,chi,OP_ELEM_MAX);
2207
void FATR ga_elem_minimum_patch_(Integer *g_a,Integer *alo,Integer *ahi,
2208
Integer *g_b,Integer *blo,Integer *bhi,Integer *g_c,Integer *clo,Integer *chi){
2210
ngai_elem2_patch_(g_a, alo, ahi, g_b, blo, bhi,g_c,clo,chi,OP_ELEM_MIN);
2214
void ngai_do_elem3_patch(Integer atype, Integer andim, Integer *loA, Integer *hiA,
2215
Integer *ldA, void *A_ptr, Integer op)
2219
Integer bvalue[MAXDIM], bunit[MAXDIM], baseldA[MAXDIM];
2222
/* number of n-element of the first dimension */
2223
n1dim = 1; for(i=1; i<andim; i++) n1dim *= (hiA[i] - loA[i] + 1);
2225
/* calculate the destination indices */
2226
bvalue[0] = 0; bvalue[1] = 0; bunit[0] = 1; bunit[1] = 1;
2227
/* baseld[0] = ld[0]
2228
* baseld[1] = ld[0] * ld[1]
2229
* baseld[2] = ld[0] * ld[1] * ld[2] .....
2231
baseldA[0] = ldA[0]; baseldA[1] = baseldA[0] *ldA[1];
2232
for(i=2; i<andim; i++) {
2234
bunit[i] = bunit[i-1] * (hiA[i-1] - loA[i-1] + 1);
2235
baseldA[i] = baseldA[i-1] * ldA[i];
2238
for(i=0; i<n1dim; i++) {
2240
for(j=1; j<andim; j++) {
2241
idx += bvalue[j] * baseldA[j-1];
2242
if(((i+1) % bunit[j]) == 0) bvalue[j]++;
2243
if(bvalue[j] > (hiA[j]-loA[j])) bvalue[j] = 0;
2248
tempA=((double*)A_ptr)+idx;
2252
ga_error(" ngai_elem3_patch_: wrong data type ",atype);
2255
tempA=((int*)A_ptr)+idx;
2258
tempA=((float*)A_ptr)+idx;
2261
tempA=((long *)A_ptr)+idx;
2264
default: ga_error(" ngai_elem3_patch_: wrong data type ",atype);
2269
do_stepmax(tempA,hiA[0]-loA[0]+1, atype);
2271
case OP_STEPBOUNDINFO:
2272
do_stepboundinfo(tempA,hiA[0]-loA[0]+1, atype);
2274
default: ga_error(" wrong operation ",op);
2279
static void ngai_elem3_patch_(Integer *g_a, Integer *alo, Integer *ahi, int op)
2280
/*do some preprocess jobs for stepMax and stepMax2*/
2284
Integer andim, adims[MAXDIM];
2285
Integer loA[MAXDIM], hiA[MAXDIM], ldA[MAXDIM];
2286
void *A_ptr, *tempA;
2287
Integer bvalue[MAXDIM], bunit[MAXDIM], baseldA[MAXDIM];
2290
Integer me= ga_nodeid_();
2292
int local_sync_begin,local_sync_end;
2294
local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
2295
_ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
2296
if(local_sync_begin)ga_sync_();
2298
ga_check_handle(g_a, "gai_elem3_patch_");
2299
GA_PUSH_NAME("ngai_elem3_patch_");
2301
nga_inquire_internal_(g_a, &atype, &andim, adims);
2302
num_blocks = ga_total_blocks_(g_a);
2304
/* check if patch indices and dims match */
2305
for(i=0; i<andim; i++)
2306
if(alo[i] <= 0 || ahi[i] > adims[i])
2307
ga_error("g_a indices out of range ", *g_a);
2309
if (num_blocks < 0) {
2310
/* find out coordinates of patches of g_a, g_b and g_c that I own */
2311
nga_distribution_(g_a, &me, loA, hiA);
2313
/* determine subsets of my patches to access */
2314
if (ngai_patch_intersect(alo, ahi, loA, hiA, andim)){
2315
nga_access_ptr(g_a, loA, hiA, &A_ptr, ldA);
2317
/* compute "local" operation accoording to op */
2318
ngai_do_elem3_patch(atype, andim, loA, hiA, ldA, A_ptr, op);
2320
/* release access to the data */
2321
nga_release_ (g_a, loA, hiA);
2324
Integer offset, j, jtmp, chk;
2325
Integer loS[MAXDIM], nproc;
2326
nproc = ga_nnodes_();
2327
/* using simple block-cyclic data distribution */
2328
if (!ga_uses_proc_grid_(g_a)){
2329
for (i=me; i<num_blocks; i += nproc) {
2330
/* get limits of patch */
2331
nga_distribution_(g_a, &i, loA, hiA);
2333
/* loA is changed by ngai_patch_intersect, so
2335
for (j=0; j<andim; j++) {
2339
/* determine subset of my local patch to access */
2340
/* Output is in loA and hiA */
2341
if(ngai_patch_intersect(alo, ahi, loA, hiA, andim)){
2343
/* get data_ptr to corner of patch */
2344
/* ld are leading dimensions for block */
2345
nga_access_block_ptr(g_a, &i, &A_ptr, ldA);
2347
/* Check for partial overlap */
2349
for (j=0; j<andim; j++) {
2350
if (loS[j] < loA[j]) {
2356
/* Evaluate additional offset for pointer */
2359
for (j=0; j<andim-1; j++) {
2360
offset += (loA[j]-loS[j])*jtmp;
2363
offset += (loA[andim-1]-loS[andim-1])*jtmp;
2366
A_ptr = (void*)((int*)A_ptr + offset);
2369
A_ptr = (void*)((double*)A_ptr + 2*offset);
2372
A_ptr = (void*)((float*)A_ptr + 2*offset);
2375
A_ptr = (void*)((double*)A_ptr + offset);
2378
A_ptr = (void*)((float*)A_ptr + offset);
2381
A_ptr = (void*)((long*)A_ptr + offset);
2383
default: ga_error(" wrong data type ",atype);
2387
/* compute "local" operation accoording to op */
2388
ngai_do_elem3_patch(atype, andim, loA, hiA, ldA, A_ptr, op);
2390
/* release access to the data */
2391
nga_release_update_block_(g_a, &i);
2395
/* using scalapack block-cyclic data distribution */
2396
Integer proc_index[MAXDIM], index[MAXDIM];
2397
Integer topology[MAXDIM];
2398
Integer blocks[MAXDIM], block_dims[MAXDIM];
2399
ga_get_proc_index_(g_a, &me, proc_index);
2400
ga_get_proc_index_(g_a, &me, index);
2401
ga_get_block_info_(g_a, blocks, block_dims);
2402
ga_get_proc_grid_(g_a, topology);
2403
while (index[andim-1] < blocks[andim-1]) {
2404
/* find bounding coordinates of block */
2405
for (i = 0; i < andim; i++) {
2406
loA[i] = index[i]*block_dims[i]+1;
2407
hiA[i] = (index[i] + 1)*block_dims[i];
2408
if (hiA[i] > adims[i]) hiA[i] = adims[i];
2410
/* loA is changed by ngai_patch_intersect, so
2412
for (j=0; j<andim; j++) {
2416
/* determine subset of my local patch to access */
2417
/* Output is in loA and hiA */
2418
if(ngai_patch_intersect(alo, ahi, loA, hiA, andim)){
2420
/* get data_ptr to corner of patch */
2421
/* ld are leading dimensions for block */
2422
nga_access_block_grid_ptr(g_a, index, &A_ptr, ldA);
2424
/* Check for partial overlap */
2426
for (j=0; j<andim; j++) {
2427
if (loS[j] < loA[j]) {
2433
/* Evaluate additional offset for pointer */
2436
for (j=0; j<andim-1; j++) {
2437
offset += (loA[j]-loS[j])*jtmp;
2440
offset += (loA[andim-1]-loS[andim-1])*jtmp;
2443
A_ptr = (void*)((int*)A_ptr + offset);
2446
A_ptr = (void*)((double*)A_ptr + 2*offset);
2449
A_ptr = (void*)((float*)A_ptr + 2*offset);
2452
A_ptr = (void*)((double*)A_ptr + offset);
2455
A_ptr = (void*)((float*)A_ptr + offset);
2458
A_ptr = (void*)((long*)A_ptr + offset);
2460
default: ga_error(" wrong data type ",atype);
2464
/* compute "local" operation accoording to op */
2465
ngai_do_elem3_patch(atype, andim, loA, hiA, ldA, A_ptr, op);
2467
/* release access to the data */
2468
nga_release_update_block_grid_(g_a, index);
2470
/* increment index to get next block on processor */
2471
index[0] += topology[0];
2472
for (i = 0; i < andim; i++) {
2473
if (index[i] >= blocks[i] && i<andim-1) {
2474
index[i] = proc_index[i];
2475
index[i+1] += topology[i+1];
2483
if(local_sync_end)ga_sync_();
2486
void ngai_has_negative_element(Integer atype, Integer andim, Integer *loA, Integer *hiA,
2487
Integer *ldA, void *A_ptr, Integer *iretval)
2490
Integer bvalue[MAXDIM], bunit[MAXDIM], baseldA[MAXDIM];
2497
/* number of n-element of the first dimension */
2498
n1dim = 1; for(i=1; i<andim; i++) n1dim *= (hiA[i] - loA[i] + 1);
2500
/* calculate the destination indices */
2501
bvalue[0] = 0; bvalue[1] = 0; bunit[0] = 1; bunit[1] = 1;
2502
/* baseld[0] = ld[0]
2503
* baseld[1] = ld[0] * ld[1]
2504
* baseld[2] = ld[0] * ld[1] * ld[2] .....
2506
baseldA[0] = ldA[0]; baseldA[1] = baseldA[0] *ldA[1];
2507
for(i=2; i<andim; i++) {
2509
bunit[i] = bunit[i-1] * (hiA[i-1] - loA[i-1] + 1);
2510
baseldA[i] = baseldA[i-1] * ldA[i];
2513
for(i=0; i<n1dim; i++) {
2515
for(j=1; j<andim; j++) {
2516
idx += bvalue[j] * baseldA[j-1];
2517
if(((i+1) % bunit[j]) == 0) bvalue[j]++;
2518
if(bvalue[j] > (hiA[j]-loA[j])) bvalue[j] = 0;
2523
/*double is the only type that is handled for Tao/GA project*/
2525
DJB modification to add types int, float and long.
2526
This operation does not make sense for complex.
2528
tempA=((double*)A_ptr)+idx;
2529
for(j=0;j<hiA[0]-loA[0]+1;j++)
2530
if(tempA[j]<(double)0.0) *iretval=1;
2534
ga_error(" has_negative_elem: wrong data type ",
2538
itempA=((int*)A_ptr)+idx;
2539
for(j=0;j<hiA[0]-loA[0]+1;j++)
2540
if(itempA[j]<(int)0) *iretval=1;
2543
ftempA=((float*)A_ptr)+idx;
2544
for(j=0;j<hiA[0]-loA[0]+1;j++)
2545
if(ftempA[j]<(float)0.0) *iretval=1;
2548
ltempA=((long*)A_ptr)+idx;
2549
for(j=0;j<hiA[0]-loA[0]+1;j++)
2550
if(ltempA[j]<(long)0) *iretval=1;
2553
default: ga_error(" has_negative_elem: wrong data type ",
2560
static Integer has_negative_elem(g_a, alo, ahi)
2561
Integer *g_a, *alo, *ahi; /* patch of g_a */
2562
/*returned value: 1=found; 0 = not found*/
2566
Integer andim, adims[MAXDIM];
2567
Integer loA[MAXDIM], hiA[MAXDIM], ldA[MAXDIM];
2572
Integer me= ga_nodeid_();
2576
ga_check_handle(g_a, "has_negative_elem");
2577
GA_PUSH_NAME("has_negative_elem");
2579
nga_inquire_internal_(g_a, &atype, &andim, adims);
2580
num_blocks = ga_total_blocks_(g_a);
2582
/* check if patch indices and dims match */
2583
for(i=0; i<andim; i++)
2584
if(alo[i] <= 0 || ahi[i] > adims[i])
2585
ga_error("g_a indices out of range ", *g_a);
2587
if (num_blocks < 0) {
2588
/* find out coordinates of patches of g_a, g_b and g_c that I own */
2589
nga_distribution_(g_a, &me, loA, hiA);
2591
/* determine subsets of my patches to access */
2592
if (ngai_patch_intersect(alo, ahi, loA, hiA, andim)){
2593
nga_access_ptr(g_a, loA, hiA, &A_ptr, ldA);
2595
ngai_has_negative_element(atype, andim, loA, hiA, ldA, A_ptr, &iretval);
2597
/* release access to the data */
2598
nga_release_ (g_a, loA, hiA);
2601
Integer offset, j, jtmp, chk;
2602
Integer loS[MAXDIM], nproc;
2603
nproc = ga_nnodes_();
2604
/* using simple block-cyclic data distribution */
2605
if (!ga_uses_proc_grid_(g_a)){
2606
for (i=me; i<num_blocks; i += nproc) {
2607
/* get limits of patch */
2608
nga_distribution_(g_a, &i, loA, hiA);
2610
/* loA is changed by ngai_patch_intersect, so
2612
for (j=0; j<andim; j++) {
2616
/* determine subset of my local patch to access */
2617
/* Output is in loA and hiA */
2618
if(ngai_patch_intersect(alo, ahi, loA, hiA, andim)){
2620
/* get data_ptr to corner of patch */
2621
/* ld are leading dimensions for block */
2622
nga_access_block_ptr(g_a, &i, &A_ptr, ldA);
2624
/* Check for partial overlap */
2626
for (j=0; j<andim; j++) {
2627
if (loS[j] < loA[j]) {
2633
/* Evaluate additional offset for pointer */
2636
for (j=0; j<andim-1; j++) {
2637
offset += (loA[j]-loS[j])*jtmp;
2640
offset += (loA[andim-1]-loS[andim-1])*jtmp;
2643
A_ptr = (void*)((int*)A_ptr + offset);
2646
A_ptr = (void*)((double*)A_ptr + 2*offset);
2649
A_ptr = (void*)((float*)A_ptr + 2*offset);
2652
A_ptr = (void*)((double*)A_ptr + offset);
2655
A_ptr = (void*)((float*)A_ptr + offset);
2658
A_ptr = (void*)((long*)A_ptr + offset);
2660
default: ga_error(" wrong data type ",atype);
2664
/* check all values in patch */
2665
ngai_has_negative_element(atype, andim, loA, hiA, ldA, A_ptr, &iretval);
2667
/* release access to the data */
2668
nga_release_update_block_(g_a, &i);
2672
/* using scalapack block-cyclic data distribution */
2673
Integer proc_index[MAXDIM], index[MAXDIM];
2674
Integer topology[MAXDIM];
2675
Integer blocks[MAXDIM], block_dims[MAXDIM];
2676
ga_get_proc_index_(g_a, &me, proc_index);
2677
ga_get_proc_index_(g_a, &me, index);
2678
ga_get_block_info_(g_a, blocks, block_dims);
2679
ga_get_proc_grid_(g_a, topology);
2680
while (index[andim-1] < blocks[andim-1]) {
2681
/* find bounding coordinates of block */
2682
for (i = 0; i < andim; i++) {
2683
loA[i] = index[i]*block_dims[i]+1;
2684
hiA[i] = (index[i] + 1)*block_dims[i];
2685
if (hiA[i] > adims[i]) hiA[i] = adims[i];
2687
/* loA is changed by ngai_patch_intersect, so
2689
for (j=0; j<andim; j++) {
2693
/* determine subset of my local patch to access */
2694
/* Output is in loA and hiA */
2695
if(ngai_patch_intersect(alo, ahi, loA, hiA, andim)){
2697
/* get data_ptr to corner of patch */
2698
/* ld are leading dimensions for block */
2699
nga_access_block_grid_ptr(g_a, index, &A_ptr, ldA);
2701
/* Check for partial overlap */
2703
for (j=0; j<andim; j++) {
2704
if (loS[j] < loA[j]) {
2710
/* Evaluate additional offset for pointer */
2713
for (j=0; j<andim-1; j++) {
2714
offset += (loA[j]-loS[j])*jtmp;
2717
offset += (loA[andim-1]-loS[andim-1])*jtmp;
2720
A_ptr = (void*)((int*)A_ptr + offset);
2723
A_ptr = (void*)((double*)A_ptr + 2*offset);
2726
A_ptr = (void*)((float*)A_ptr + 2*offset);
2729
A_ptr = (void*)((double*)A_ptr + offset);
2732
A_ptr = (void*)((float*)A_ptr + offset);
2735
A_ptr = (void*)((long*)A_ptr + offset);
2737
default: ga_error(" wrong data type ",atype);
2741
/* check all values in patch */
2742
ngai_has_negative_element(atype, andim, loA, hiA, ldA, A_ptr, &iretval);
2744
/* release access to the data */
2745
nga_release_update_block_grid_(g_a, index);
2748
/* increment index to get next block on processor */
2749
index[0] += topology[0];
2750
for (i = 0; i < andim; i++) {
2751
if (index[i] >= blocks[i] && i<andim-1) {
2752
index[i] = proc_index[i];
2753
index[i+1] += topology[i+1];
2762
return iretval; /*negative element is not found in g_a*/
2766
void FATR ga_step_bound_info_patch_(
2767
Integer *g_xx, Integer *xxlo, Integer *xxhi, /* patch of g_xx */
2768
Integer *g_vv, Integer *vvlo, Integer *vvhi, /* patch of g_vv */
2769
Integer *g_xxll, Integer *xxlllo, Integer *xxllhi, /* patch of g_xxll */
2770
Integer *g_xxuu, Integer *xxuulo, Integer *xxuuhi, /* patch of g_xxuu */
2771
void *boundmin, void* wolfemin, void *boundmax)
2773
double result1,result2;
2774
double dresult,dresult2;
2775
long lresult,lresult2;
2777
Integer index[MAXDIM];
2779
Integer xxndim, xxdims[MAXDIM];
2780
Integer loXX[MAXDIM], hiXX[MAXDIM], ldXX[MAXDIM];
2782
Integer vvndim, vvdims[MAXDIM];
2783
Integer loVV[MAXDIM], hiVV[MAXDIM], ldVV[MAXDIM];
2784
Integer g_XX = *g_xx, g_VV = *g_vv;
2785
Integer xxtotal,vvtotal;
2787
Integer xlndim, xldims[MAXDIM];
2788
Integer loXL[MAXDIM], hiXL[MAXDIM], ldXL[MAXDIM];
2790
Integer xundim, xudims[MAXDIM];
2791
Integer loXU[MAXDIM], hiXU[MAXDIM], ldXU[MAXDIM];
2792
Integer g_XL = *g_xxll, g_XU = *g_xxuu;
2793
Integer xltotal,xutotal;
2794
Integer me= ga_nodeid_();
2796
Integer *g_q = &g_Q;
2798
Integer *g_r = &g_R;
2800
Integer *g_s = &g_S;
2802
Integer *g_t = &g_T;
2803
double dalpha = (double)1.0, dbeta = (double)(-1.0);
2804
long lalpha = (long)1, lbeta = (long)(-1);
2805
int ialpha = (int)1, ibeta = (int)(-1);
2806
float falpha = (float)1.0, fbeta = (float)(-1.0);
2807
int iresult,iresult2;
2808
float fresult,fresult2;
2810
Integer compatible2;
2811
Integer compatible3;
2815
int local_sync_begin,local_sync_end;
2818
local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
2819
_ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
2820
if(local_sync_begin)ga_sync_();
2822
/* Check for valid ga handles. */
2824
ga_check_handle(g_xx, "ga_step_bound_info_patch_");
2825
ga_check_handle(g_vv, "ga_step_bound_info_patch_");
2826
ga_check_handle(g_xxll, "ga_step_bound_info_patch_");
2827
ga_check_handle(g_xxuu, "ga_step_bound_info_patch_");
2829
GA_PUSH_NAME("ga_step_bound_info_patch_");
2831
/* get chaacteristics of the input ga patches */
2833
nga_inquire_internal_(g_xx, &xxtype, &xxndim, xxdims);
2834
nga_inquire_internal_(g_vv, &vvtype, &vvndim, vvdims);
2835
nga_inquire_internal_(g_xxll, &xltype, &xlndim, xldims);
2836
nga_inquire_internal_(g_xxuu, &xutype, &xundim, xudims);
2838
/* Check for matching types. */
2840
if(xxtype != vvtype) ga_error(" ga_step_bound_info_patch_: types mismatch ", 0L);
2841
if(xxtype != xltype) ga_error(" ga_step_bound_info_patch_: types mismatch ", 0L);
2842
if(xxtype != xutype) ga_error(" ga_step_bound_info_patch_: types mismatch ", 0L);
2844
/* check if patch indices and dims match */
2845
for(i=0; i<xxndim; i++)
2846
if(xxlo[i] <= 0 || xxhi[i] > xxdims[i])
2847
ga_error("ga_elem_step_bound_info_patch: g_a indices out of range ", *g_xx);
2849
for(i=0; i<vvndim; i++)
2850
if(vvlo[i] <= 0 || vvhi[i] > vvdims[i])
2851
ga_error("ga_elem_step_bound_info_patch: g_a indices out of range ", *g_vv);
2853
for(i=0; i<xlndim; i++)
2854
if(xxlllo[i] <= 0 || xxllhi[i] > xldims[i])
2855
ga_error("ga_elem_step_bound_info_patch: g_a indices out of range ", *g_xxll);
2856
for(i=0; i<xundim; i++)
2857
if(xxuulo[i] <= 0 || xxuuhi[i] > xudims[i])
2858
ga_error("ga_elem_step_bound_info_patch: g_a indices out of range ", *g_xxuu);
2860
/* check if numbers of elements in patches match each other */
2861
xxtotal = 1; for(i=0; i<xxndim; i++) xxtotal *= (xxhi[i] - xxlo[i] + 1);
2862
vvtotal = 1; for(i=0; i<vvndim; i++) vvtotal *= (vvhi[i] - vvlo[i] + 1);
2863
xltotal = 1; for(i=0; i<xlndim; i++) xltotal *= (xxllhi[i] - xxlllo[i] + 1);
2864
xutotal = 1; for(i=0; i<xundim; i++) xutotal *= (xxuuhi[i] - xxuulo[i] + 1);
2866
if(xxtotal != vvtotal)
2867
ga_error(" ga_step_bound_info_patch_ capacities of patches do not match ", 0L);
2868
if(xxtotal != xltotal)
2869
ga_error(" ga_step_bound_info_patch_ capacities of patches do not match ", 0L);
2870
if(xxtotal != xutotal)
2871
ga_error(" ga_step_bound_info_patch_ capacities of patches do not match ", 0L);
2872
/* find out coordinates of patches of g_a, and g_b that I own */
2873
nga_distribution_(&g_XX, &me, loXX, hiXX);
2874
nga_distribution_(&g_VV, &me, loVV, hiVV);
2875
nga_distribution_(&g_XL, &me, loXL, hiXL);
2876
nga_distribution_(&g_XU, &me, loXU, hiXU);
2879
/* test if the local portion of patches matches */
2880
if(ngai_comp_patch(xxndim, loXX, hiXX, vvndim, loVV, hiVV) &&
2881
ngai_comp_patch(xxndim, xxlo, xxhi, vvndim, vvlo, vvhi)) {
2887
if(ngai_comp_patch(xxndim, loXX, hiXX, xlndim, loXL, hiXL) &&
2888
ngai_comp_patch(xxndim, xxlo, xxhi, xlndim, xxlllo, xxllhi)) {
2894
if(ngai_comp_patch(xxndim, loXX, hiXX, xundim, loXU, hiXU) &&
2895
ngai_comp_patch(xxndim, xxlo, xxhi, xundim, xxuulo, xxuuhi)) {
2901
compatible = compatible * compatible2 * compatible3;
2902
ga_igop(GA_TYPE_GSM, &compatible, 1, "*");
2904
ga_error(" ga_step_bound_info_patch_ mismatched patchs ",0);
2909
/* This should point to iresult but we use lresult
2910
due to the strange implementation if nga_select_elem_.
2913
sresult2 = &iresult2;
2919
ga_error ("Ga_step_bound_info_patch_: unavalable for complex datatype.",
2924
sresult2 = &dresult2;
2930
sresult2 = &fresult2;
2936
sresult2 = &lresult2;
2941
ga_error ("Ga_step_max_patch_: alpha/beta set wrong data type.", xxtype);
2944
/*duplicatecate an array Q to hold the temparary result */
2945
ga_duplicate(g_xx, &g_Q, "TempQ");
2947
ga_error("ga_step_bound_info_patch_:fail to duplicate array Q", g_Q);
2949
/*duplicatecate an array R to hold the temparary result */
2950
ga_duplicate(g_xx, &g_R, "TempR");
2952
ga_error("ga_step_bound_info_patch_:fail to duplicate array R", g_R);
2954
/*duplicatecate an array s to hold the temparary result */
2955
ga_duplicate(g_xx, &g_S, "TempS");
2957
ga_error("ga_step_bound_info_patch_:fail to duplicate array S", g_S);
2959
/*duplicatecate an array T to hold the temparary result */
2960
ga_duplicate(g_xx, &g_T, "TempT");
2962
ga_error("ga_step_bound_info_patch_:fail to duplicate array T", g_T);
2964
/*First, compute xu - xx */
2965
nga_add_patch_(alpha, g_xxuu, xxuulo, xxuuhi, beta, g_xx, xxlo, xxhi,&g_S, xxlo, xxhi);
2967
/*Check for negative elements in g_s, if it has any then xxuu was
2968
not an upper bound, exit with error message.
2970
if(has_negative_elem(&g_S, xxlo, xxhi) == 1)
2971
ga_error("ga_step_bound_info_patch_: Upper bound is not > xx.", -1);
2973
/* Then compute t = positve elements of vv */
2975
ga_elem_maximum_(g_vv,&g_T,&g_T);
2977
/* Then, compute (xu-xx)/vv */
2978
ga_elem_stepb_divide_patch_(&g_S, xxlo, xxhi, &g_T, vvlo, vvhi, &g_T, xxlo, xxhi);
2980
/* Then, we will select the minimum of the array g_t*/
2981
nga_select_elem_(&g_T, "min", sresult, &index[0]);
2986
/* This should be iresult but is lresult because of
2987
the strange implementation of nga_select_elem.
2989
result1 = (double)(iresult);
2993
ga_error ("Ga_step_bound_info_patch_: unavalable for complex datatype.",
3000
result1 = (double)fresult;
3003
result1 = (double)lresult;
3006
ga_error ("Ga_step_bound_info_patch_: result set: wrong data type.", xxtype);
3009
/*Now doing the same thing to get (xx-xxll)/dv */
3010
/*First, compute xl - xx */
3011
nga_add_patch_(alpha, g_xx, xxlo, xxhi, beta, g_xxll, xxlllo, xxllhi, &g_Q, xxlo, xxhi);
3012
/*Check for negative elements in g_s, if it has any then xxll was
3013
not a lower bound, exit with error message.
3015
if(has_negative_elem(&g_Q, xxlo, xxhi) == 1)
3016
ga_error("ga_step_bound_info_patch_: Lower bound is not < xx.", -1);
3018
/* Then compute r = negative elements of vv */
3020
ga_elem_minimum_(g_vv,&g_R,&g_R);
3021
ga_abs_value_(&g_R);
3023
/* Then, compute (xx-xl)/vv */
3024
ga_elem_stepb_divide_patch_(&g_Q, xxlo, xxhi, &g_R, vvlo, vvhi, &g_R, xxlo, xxhi);
3025
/* Then, we will select the minimum of the array g_t*/
3026
nga_select_elem_(&g_R, "min", sresult2, &index[0]);
3030
*(int*)wolfemin = GA_ABS(GA_MIN(iresult,iresult2));
3034
ga_error ("Ga_step_bound_info_patch_: unavalable for complex datatype.",
3038
*(double*)wolfemin = GA_ABS(GA_MIN(dresult,dresult2));
3041
*(float*)wolfemin = GA_ABS(GA_MIN(fresult,fresult2));
3044
*(long*)wolfemin = GA_ABS(GA_MIN(lresult,lresult2));
3047
ga_error ("Ga_step_bound_info_patch_: result2 set: wrong data type.", xxtype);
3050
Now set T to be the elementwise minimum of R and T.
3051
So, T is infinity only where ever g_vv is zero.
3053
ga_elem_minimum_(&g_R,&g_T,&g_T);
3055
Now we want to set T to be zero whenever g_vv was zero
3056
and gxx coincides with either boundary vector.
3057
Set S to be the element-wise product of S and Q.
3058
It will be zero when either of them is zero.
3060
ga_elem_multiply_(&g_Q,&g_S,&g_S);
3064
ga_copy_(g_vv,&g_Q);
3065
ga_abs_value_(&g_Q);
3067
Now add q and s to get a vector that is zero only
3068
where g_vv was zero and g_xx meets one of the
3071
nga_add_patch_(alpha, &g_Q, xxlo, xxhi, alpha, &g_S, xxlo, xxhi, &g_S, xxlo, xxhi);
3073
Then use that vector as a mask to set certain
3074
elements of T to be zero (so we have a collection
3075
of the a_i and c_i elements as per the TAO StepBoundInfo
3078
ga_step_mask_patch_(&g_S,xxlo,xxhi,&g_T,xxlo,xxhi,&g_T,xxlo,xxhi);
3081
Then, we will select the minimum of the array g_t, that will
3084
nga_select_elem_(&g_T, "min", sresult, &index[0]);
3088
/* This should be iresult but is lresult because of
3089
the strange implementation of nga_select_elem.
3091
*(int*)boundmin = iresult;
3095
ga_error ("Ga_step_bound_info_patch_: unavalable for complex datatype.",
3099
*(double*)boundmin = dresult;
3102
*(float*)boundmin = fresult;
3105
*(long*)boundmin = lresult;
3108
ga_error ("Ga_step_bound_info_patch_: result set: wrong data type.", xxtype);
3111
Then, we will select the maximum of the array g_t, that will
3114
nga_select_elem_(&g_T, "max", sresult, &index[0]);
3118
/* This should be iresult but is lresult because of
3119
the strange implementation of nga_select_elem.
3121
*(int*)boundmax = iresult;
3125
ga_error ("Ga_step_bound_info_patch_: unavalable for complex datatype.",
3129
*(double*)boundmax = dresult;
3132
*(float*)boundmax = fresult;
3135
*(long*)boundmax = lresult;
3138
ga_error ("Ga_step_bound_info_patch_: result set: wrong data type.", xxtype);
3145
if(local_sync_end)ga_sync_();
3148
/*\ generic routine for element wise operation between two array
3150
#if 0 /* I want to delete op parameter */
3151
void ga_step_max_patch_(g_a, alo, ahi, g_b, blo, bhi, result, op)
3155
void FATR ga_step_max_patch_(g_a, alo, ahi, g_b, blo, bhi, result)
3156
Integer *g_a, *alo, *ahi; /* patch of g_a */
3157
Integer *g_b, *blo, *bhi; /* patch of g_b */
3160
Integer op; /* operations */
3167
Integer andim, adims[MAXDIM];
3168
Integer loA[MAXDIM], hiA[MAXDIM], ldA[MAXDIM];
3170
Integer bndim, bdims[MAXDIM];
3171
Integer loB[MAXDIM], hiB[MAXDIM], ldB[MAXDIM];
3172
Integer index[MAXDIM];
3173
Integer num_blocks_a, num_blocks_b;
3174
/* double result = -1; */
3177
Integer g_A = *g_a, g_B = *g_b;
3179
Integer atotal,btotal;
3180
Integer me= ga_nodeid_();
3182
int local_sync_begin,local_sync_end;
3187
local_sync_begin = _ga_sync_begin; local_sync_end = _ga_sync_end;
3188
_ga_sync_begin = 1; _ga_sync_end=1; /*remove any previous masking*/
3189
if(local_sync_begin)ga_sync_();
3191
/* Check for valid ga handles. */
3193
ga_check_handle(g_a, "ga_step_max_patch_");
3194
ga_check_handle(g_b, "ga_step_max_patch_");
3196
GA_PUSH_NAME("ga_step_max_patch_");
3198
/* get chacteristics of the input ga patches */
3200
nga_inquire_internal_(g_a, &atype, &andim, adims);
3201
nga_inquire_internal_(g_b, &btype, &bndim, bdims);
3202
num_blocks_a = ga_total_blocks_(g_a);
3203
num_blocks_b = ga_total_blocks_(g_b);
3205
/* Check for matching types. */
3206
if(atype != btype) ga_error(" ga_step_max_patch_: types mismatch ", 0L);
3208
/* check if patch indices and dims match */
3209
for(i=0; i<andim; i++)
3210
if(alo[i] <= 0 || ahi[i] > adims[i])
3211
ga_error("g_a indices out of range ", *g_a);
3212
for(i=0; i<bndim; i++)
3213
if(blo[i] <= 0 || bhi[i] > bdims[i])
3214
ga_error("g_b indices out of range ", *g_b);
3216
/* check if numbers of elements in patches match each other */
3217
atotal = 1; for(i=0; i<andim; i++) atotal *= (ahi[i] - alo[i] + 1);
3218
btotal = 1; for(i=0; i<bndim; i++) btotal *= (bhi[i] - blo[i] + 1);
3220
if(btotal != atotal)
3221
ga_error(" ga_step_max_patch_ capacities of patches do not match ", 0L);
3223
/* test if patches match */
3224
if(ngai_comp_patch(andim, alo, ahi, bndim, blo, bhi)) compatible = 1;
3225
else compatible = 0;
3226
ga_igop(GA_TYPE_GSM, &compatible, 1, "*");
3228
ga_error(" ga_step_max_patch_ mismatched patchs ",0);
3238
ga_error ("Ga_step_max_patch_: unavalable for complex datatype.",
3251
ga_error ("Ga_step_max_patch_: wrong data type.", atype);
3255
/* It used to say 1, but if ga and gb are the same, and
3256
ga is nonnegative then any number of multiples of gb
3257
can be added to ga still leaving it nonnegative.
3258
*result = (double)1.0;
3263
*(int*)result = GA_INFINITY_I;
3267
ga_error ("Ga_step_max_patch_: unavailable for complex datatype.",
3271
*(double*)result = GA_INFINITY_D;
3274
*(float*)result = GA_INFINITY_F;
3277
*(long*)result = GA_INFINITY_L;
3280
ga_error ("Ga_step_max_patch_: wrong data type.", atype);
3283
/*Now look at each element of the array g_a.
3284
If an element of g_a is negative, then simply return */
3285
if(has_negative_elem(g_a, alo, ahi) == 1)
3286
ga_error("ga_step_max_patch_: g_a has negative element.", -1);
3288
/*duplicate an array c to hold the temparate result = g_a/g_b; */
3289
ga_duplicate(g_a, &g_C, "Temp");
3291
ga_error("ga_step_max_patch_:fail to duplicate array c", *g_a);
3295
ga_elem_divide_patch_(g_a, alo, ahi, g_b, blo, bhi, g_c, alo, ahi);
3297
ga_elem_step_divide_patch_(g_a, alo, ahi, g_b, blo, bhi,
3300
/*Now look at each element of the array g_c. If an element of g_c is positive,
3301
then replace it with -GA_INFINITY */
3302
ngai_elem3_patch_(g_c, alo, ahi, OP_STEPMAX);
3303
/*Then, we will select the maximum of the array g_c*/
3304
nga_select_elem_(g_c, "max", sresult, index);
3308
*(int*)result = GA_ABS(iresult);
3312
ga_error ("Ga_step_max_patch_: unavailable for complex datatype.",
3316
*(double*)result = GA_ABS(dresult);
3319
*(float*)result = GA_ABS(fresult);
3322
*(long*)result = GA_ABS(lresult);
3325
ga_error ("Ga_step_max_patch_: wrong data type.", atype);
3330
if(local_sync_end)ga_sync_();
3334
void FATR ga_step_max_(Integer *g_a, Integer *g_b, void *retval)
3336
Integer atype, andim;
3337
Integer btype, bndim;
3338
Integer alo[MAXDIM],ahi[MAXDIM];
3339
Integer blo[MAXDIM],bhi[MAXDIM];
3341
nga_inquire_internal_(g_a, &atype, &andim, ahi);
3342
nga_inquire_internal_(g_b, &btype, &bndim, bhi);
3351
ga_step_max_patch_(g_a, alo, ahi, g_b, blo, bhi, retval, OP_STEPMAX);
3353
ga_step_max_patch_(g_a, alo, ahi, g_b, blo, bhi, retval);
3357
void FATR ga_step_bound_info_(Integer *g_xx, Integer *g_vv, Integer *g_xxll,
3358
Integer *g_xxuu, void *boundmin, void *wolfemin,
3361
Integer xxtype, xxndim;
3362
Integer vvtype, vvndim;
3363
Integer xxlltype, xxllndim;
3364
Integer xxuutype, xxuundim;
3365
Integer xxlo[MAXDIM],xxhi[MAXDIM];
3366
Integer vvlo[MAXDIM],vvhi[MAXDIM];
3367
Integer xxlllo[MAXDIM],xxllhi[MAXDIM];
3368
Integer xxuulo[MAXDIM],xxuuhi[MAXDIM];
3370
nga_inquire_internal_(g_xx, &xxtype, &xxndim, xxhi);
3371
nga_inquire_internal_(g_vv, &vvtype, &vvndim, vvhi);
3372
nga_inquire_internal_(g_xxll, &xxlltype, &xxllndim, xxllhi);
3373
nga_inquire_internal_(g_xxuu, &xxuutype, &xxuundim, xxuuhi);
3379
xxlllo[xxllndim-1]=1;
3381
xxuulo[xxuundim-1]=1;
3385
ga_step_bound_info_patch_(g_xx,xxlo,xxhi, g_vv,vvlo,vvhi, g_xxll,xxlllo,xxllhi, g_xxuu,xxuulo,xxuuhi, boundmin, wolfemin, boundmax);