/***************************************************************************
 The following is a notice of limited availability of the code, and disclaimer
 which must be included in the prologue of the code and in all source listings
 of the code.

 Copyright (C) 2009 University of Chicago

 Permission is hereby granted to use, reproduce, prepare derivative works, and
 to redistribute to others. This software was authored by:

     Leadership Computing Facility
     Argonne National Laboratory
     e-mail: jhammond@anl.gov

 Portions of this material resulted from work developed under a U.S.
 Government Contract and are subject to the following license: the Government
 is granted for itself and others acting on its behalf a paid-up, nonexclusive,
 irrevocable worldwide license in this computer software to reproduce, prepare
 derivative works, and perform publicly and display publicly.

 This computer code material was prepared, in part, as an account of work
 sponsored by an agency of the United States Government. Neither the United
 States, nor the University of Chicago, nor any of their employees, makes any
 warranty express or implied, or assumes any legal liability or responsibility
 for the accuracy, completeness, or usefulness of any information, apparatus,
 product, or process disclosed, or represents that its use would not infringe
 privately owned rights.
 ***************************************************************************/
42
/***********************************************************************
 *  Accumulate operation for the following datatypes:
 *      real, double precision, complex, double complex, integer
 *
 *  WARNING: This file must be compiled WITH optimization under AIX.
 *           IBM fortran compilers generate bad code with -g option.
 *
 *  Two versions of each routine are provided:
 *      original and unrolled loops.
 ***********************************************************************/
60
subroutine d_accumulate_1d(alpha, A, B, rows)
62
double precision A(*), B(*), alpha
63
ccdir$ no_cache_alloc a,b
65
A(r) = A(r)+ alpha*B(r)
70
/* Double precision 1-D accumulate: A[i] += alpha * B[i] for i in [0,*rows).
   All arguments arrive by pointer (Fortran-style calling convention). */
void c_d_accumulate_1d_(const double* const restrict alpha,
                        double* const restrict A,
                        const double* const restrict B,
                        const int* const restrict rows)
{
    int i;
    for ( i = 0 ; i < (*rows) ; i++ ){
        A[i] += (*alpha) * B[i];
    }
    return;
}
84
subroutine d_accumulate_2d(alpha, rows, cols, A, ald, B, bld)
86
integer c, r, ald, bld
87
double precision A(ald,*), B(bld,*), alpha
88
ccdir$ no_cache_alloc a,b
91
A(r,c) = A(r,c)+ alpha*B(r,c)
97
/* Double precision 2-D accumulate: A(r,c) += alpha * B(r,c) over a
   rows-by-cols patch of column-major matrices with leading dims ald/bld. */
void c_d_accumulate_2d_(const double* const alpha,
                        const int* const rows,
                        const int* const cols,
                        double* restrict A,
                        const int* const ald,
                        const double* const B,
                        const int* const bld)
{
    int r, c;
    for ( c = 0 ; c < (*cols) ; c++ ){
        for ( r = 0 ; r < (*rows) ; r++ ){
            A[ c * (*ald) + r ] += (*alpha) * B[ c * (*bld) + r ];
        }
    }
    return;
}
115
subroutine f_accumulate_1d(alpha, A, B, rows)
117
real A(*), B(*), alpha
119
A(r) = A(r)+ alpha*B(r)
124
/* Single precision 1-D accumulate: A[i] += alpha * B[i] for i in [0,*rows). */
void c_f_accumulate_1d_(const float* const restrict alpha,
                        float* const restrict A,
                        const float* const restrict B,
                        const int* const restrict rows)
{
    int i;
    for ( i = 0 ; i < (*rows) ; i++ ){
        A[i] += (*alpha) * B[i];
    }
    return;
}
137
subroutine f_accumulate_2d(alpha, rows, cols, A, ald, B, bld)
139
integer c, r, ald, bld
140
real A(ald,*), B(bld,*), alpha
143
A(r,c) = A(r,c)+ alpha*B(r,c)
149
/* Single precision 2-D accumulate: A(r,c) += alpha * B(r,c), column-major
   with leading dimensions ald/bld. */
void c_f_accumulate_2d_(const float* const alpha,
                        const int* const rows,
                        const int* const cols,
                        float* restrict A,
                        const int* const ald,
                        const float* const B,
                        const int* const bld)
{
    int r, c;
    for ( c = 0 ; c < (*cols) ; c++ ){
        for ( r = 0 ; r < (*rows) ; r++ ){
            A[ c * (*ald) + r ] += (*alpha) * B[ c * (*bld) + r ];
        }
    }
    return;
}
167
subroutine z_accumulate_1d(alpha, A, B, rows)
169
double complex A(*), B(*), alpha
171
A(r) = A(r)+ alpha*B(r)
176
void c_c_accumulate_1d_(const complex_t* const restrict alpha,
177
complex_t* const restrict A,
178
const complex_t* const restrict B,
179
const int* const restrict rows)
182
for ( i = 0 ; i < (*rows) ; i++ ){
183
A[i].real += (*alpha).real * B[i].real - (*alpha).imag * B[i].imag;
184
A[i].imag += (*alpha).imag * B[i].real + (*alpha).real * B[i].imag;
190
subroutine z_accumulate_2d(alpha, rows, cols, A, ald, B, bld)
192
integer c, r, ald, bld
193
double complex A(ald,*), B(bld,*), alpha
196
A(r,c) = A(r,c)+ alpha*B(r,c)
202
void c_c_accumulate_2d_(const complex_t* const alpha,
203
const int* const rows,
204
const int* const cols,
205
complex_t* restrict A,
206
const int* const ald,
207
const complex_t* const B,
208
const int* const bld)
211
for ( c = 0 ; c < (*cols) ; c++ ) {
212
for ( r = 0 ; r < (*rows) ; r++ ) {
213
A[ c * (*ald) + r ].real += (*alpha).real * B[ c * (*bld) + r ].real - (*alpha).imag * B[ c * (*bld) + r ].imag;
214
A[ c * (*ald) + r ].imag += (*alpha).imag * B[ c * (*bld) + r ].real + (*alpha).real * B[ c * (*bld) + r ].imag;
221
subroutine c_accumulate_1d(alpha, A, B, rows)
223
complex A(*), B(*), alpha
225
A(r) = A(r)+ alpha*B(r)
230
void c_z_accumulate_1d_(const dcomplex_t* const restrict alpha,
231
dcomplex_t* const restrict A,
232
const dcomplex_t* const restrict B,
233
const int* const restrict rows)
236
for ( i = 0 ; i < (*rows) ; i++ ){
237
A[i].real += (*alpha).real * B[i].real - (*alpha).imag * B[i].imag;
238
A[i].imag += (*alpha).imag * B[i].real + (*alpha).real * B[i].imag;
244
subroutine c_accumulate_2d(alpha, rows, cols, A, ald, B, bld)
246
integer c, r, ald, bld
247
complex A(ald,*), B(bld,*), alpha
250
A(r,c) = A(r,c)+ alpha*B(r,c)
256
void c_z_accumulate_2d_(const dcomplex_t* const alpha,
257
const int* const rows,
258
const int* const cols,
259
dcomplex_t* restrict A,
260
const int* const ald,
261
const dcomplex_t* const B,
262
const int* const bld)
265
for ( c = 0 ; c < (*cols) ; c++ ) {
266
for ( r = 0 ; r < (*rows) ; r++ ) {
267
A[ c * (*ald) + r ].real += (*alpha).real * B[ c * (*bld) + r ].real - (*alpha).imag * B[ c * (*bld) + r ].imag;
268
A[ c * (*ald) + r ].imag += (*alpha).imag * B[ c * (*bld) + r ].real + (*alpha).real * B[ c * (*bld) + r ].imag;
275
subroutine i_accumulate_2d(alpha, rows, cols, A, ald, B, bld)
277
integer c, r, ald, bld
278
integer A(ald,*), B(bld,*), alpha
281
A(r,c) = A(r,c)+ alpha*B(r,c)
287
/* Integer 1-D accumulate: A[i] += alpha * B[i] for i in [0,*rows). */
void c_i_accumulate_1d_(const int* const restrict alpha,
                        int* const restrict A,
                        const int* const restrict B,
                        const int* const restrict rows)
{
    int i;
    for ( i = 0 ; i < (*rows) ; i++ ){
        A[i] += (*alpha) * B[i];
    }
    return;
}
299
/* Long integer 1-D accumulate: A[i] += alpha * B[i] for i in [0,*rows). */
void c_l_accumulate_1d_(const long* const restrict alpha,
                        long* const restrict A,
                        const long* const restrict B,
                        const int* const restrict rows)
{
    int i;
    for ( i = 0 ; i < (*rows) ; i++ ){
        A[i] += (*alpha) * B[i];
    }
    return;
}
311
/* Long long integer 1-D accumulate: A[i] += alpha * B[i] for i in [0,*rows). */
void c_ll_accumulate_1d_(const long long* const restrict alpha,
                         long long* const restrict A,
                         const long long* const restrict B,
                         const int* const restrict rows)
{
    int i;
    for ( i = 0 ; i < (*rows) ; i++ ){
        A[i] += (*alpha) * B[i];
    }
    return;
}
324
subroutine i_accumulate_1d(alpha, A, B, rows)
326
integer A(*), B(*), alpha
328
A(r) = A(r)+ alpha*B(r)
333
/* Integer 2-D accumulate: A(r,c) += alpha * B(r,c), column-major with
   leading dimensions ald/bld. */
void c_i_accumulate_2d_(const int* const alpha,
                        const int* const rows,
                        const int* const cols,
                        int* restrict A,
                        const int* const ald,
                        const int* const B,
                        const int* const bld)
{
    int r, c;
    for ( c = 0 ; c < (*cols) ; c++ ){
        for ( r = 0 ; r < (*rows) ; r++ ){
            A[ c * (*ald) + r ] += (*alpha) * B[ c * (*bld) + r ];
        }
    }
    return;
}
350
/* Long integer 2-D accumulate: A(r,c) += alpha * B(r,c), column-major with
   leading dimensions ald/bld. */
void c_l_accumulate_2d_(const long* const alpha,
                        const int* const rows,
                        const int* const cols,
                        long* restrict A,
                        const int* const ald,
                        const long* const B,
                        const int* const bld)
{
    int r, c;
    for ( c = 0 ; c < (*cols) ; c++ ){
        for ( r = 0 ; r < (*rows) ; r++ ){
            A[ c * (*ald) + r ] += (*alpha) * B[ c * (*bld) + r ];
        }
    }
    return;
}
367
/* Long long integer 2-D accumulate: A(r,c) += alpha * B(r,c), column-major
   with leading dimensions ald/bld. */
void c_ll_accumulate_2d_(const long long* const alpha,
                         const int* const rows,
                         const int* const cols,
                         long long* restrict A,
                         const int* const ald,
                         const long long* const B,
                         const int* const bld)
{
    int r, c;
    for ( c = 0 ; c < (*cols) ; c++ ){
        for ( r = 0 ; r < (*rows) ; r++ ){
            A[ c * (*ald) + r ] += (*alpha) * B[ c * (*bld) + r ];
        }
    }
    return;
}
385
subroutine d_accumulate_2d_u(alpha, rows, cols, A, ald, B, bld)
387
integer c, r, ald, bld
388
double precision A(ald,*), B(bld,*), alpha
390
doubleprecision d1, d2, d3, d4
392
r1 = iand(max0(rows,0),3)
394
a(r,c) = a(r,c) + alpha*b(r,c)
396
do r = r1 + 1, rows, 4
397
d1 = a(r,c) + alpha*b(r,c)
398
d2 = a(r+1,c) + alpha*b(r+1,c)
399
d3 = a(r+2,c) + alpha*b(r+2,c)
400
d4 = a(r+3,c) + alpha*b(r+3,c)
410
/* Double precision 2-D accumulate, unrolled by 4: m is the largest
   multiple of 4 <= rows; the tail loop handles the remainder. */
void c_d_accumulate_2d_u_(const double* const alpha,
                          const int* const rows,
                          const int* const cols,
                          double* restrict A,
                          const int* const ald,
                          const double* const B,
                          const int* const bld)
{
    int r, c;
    int m = (*rows) - ((*rows)%4);
    for ( c = 0 ; c < (*cols) ; c++ ){
        for ( r = 0 ; r < m ; r+=4 ){
            A[ c * (*ald) + r   ] += (*alpha) * B[ c * (*bld) + r   ];
            A[ c * (*ald) + r+1 ] += (*alpha) * B[ c * (*bld) + r+1 ];
            A[ c * (*ald) + r+2 ] += (*alpha) * B[ c * (*bld) + r+2 ];
            A[ c * (*ald) + r+3 ] += (*alpha) * B[ c * (*bld) + r+3 ];
        }
        for ( r = m ; r < (*rows) ; r++ ){
            A[ c * (*ald) + r ] += (*alpha) * B[ c * (*bld) + r ];
        }
    }
    return;
}
435
subroutine f_accumulate_2d_u(alpha, rows, cols, A, ald, B, bld)
437
integer c, r, ald, bld
438
real A(ald,*), B(bld,*), alpha
442
r1 = iand(max0(rows,0),3)
444
a(r,c) = a(r,c) + alpha*b(r,c)
446
do r = r1 + 1, rows, 4
447
d1 = a(r,c) + alpha*b(r,c)
448
d2 = a(r+1,c) + alpha*b(r+1,c)
449
d3 = a(r+2,c) + alpha*b(r+2,c)
450
d4 = a(r+3,c) + alpha*b(r+3,c)
460
/* Single precision 2-D accumulate, unrolled by 4 with remainder loop. */
void c_f_accumulate_2d_u_(const float* const alpha,
                          const int* const rows,
                          const int* const cols,
                          float* restrict A,
                          const int* const ald,
                          const float* const B,
                          const int* const bld)
{
    int r, c;
    int m = (*rows) - ((*rows)%4);
    for ( c = 0 ; c < (*cols) ; c++ ){
        for ( r = 0 ; r < m ; r+=4 ){
            A[ c * (*ald) + r   ] += (*alpha) * B[ c * (*bld) + r   ];
            A[ c * (*ald) + r+1 ] += (*alpha) * B[ c * (*bld) + r+1 ];
            A[ c * (*ald) + r+2 ] += (*alpha) * B[ c * (*bld) + r+2 ];
            A[ c * (*ald) + r+3 ] += (*alpha) * B[ c * (*bld) + r+3 ];
        }
        for ( r = m ; r < (*rows) ; r++ ){
            A[ c * (*ald) + r ] += (*alpha) * B[ c * (*bld) + r ];
        }
    }
    return;
}
485
subroutine z_accumulate_2d_u(alpha, rows, cols, A, ald, B, bld)
487
integer c, r, ald, bld
488
double complex A(ald,*), B(bld,*), alpha
490
double complex x1, x2, x3, x4
492
r1 = iand(max0(rows,0),3)
494
a(r,c) = a(r,c) + alpha*b(r,c)
496
do r = r1 + 1, rows, 4
497
x1 = a(r,c) + alpha*b(r,c)
498
x2 = a(r+1,c) + alpha*b(r+1,c)
499
x3 = a(r+2,c) + alpha*b(r+2,c)
500
x4 = a(r+3,c) + alpha*b(r+3,c)
510
void c_c_accumulate_2d_u_(const complex_t* const alpha,
511
const int* const rows,
512
const int* const cols,
513
complex_t* restrict A,
514
const int* const ald,
515
const complex_t* const B,
516
const int* const bld)
520
int m = (*rows) - ((*rows)%4);
521
for ( c = 0 ; c < (*cols) ; c++ ){
522
for ( r = 0 ; r < m ; r+=4 ){
525
A[ jA ].real += (*alpha).real * B[ jB ].real - (*alpha).imag * B[ jB ].imag;
526
A[ jA ].imag += (*alpha).imag * B[ jB ].real + (*alpha).real * B[ jB ].imag;
527
A[ jA+1 ].real += (*alpha).real * B[ jB+1 ].real - (*alpha).imag * B[ jB+1 ].imag;
528
A[ jA+1 ].imag += (*alpha).imag * B[ jB+1 ].real + (*alpha).real * B[ jB+1 ].imag;
529
A[ jA+2 ].real += (*alpha).real * B[ jB+2 ].real - (*alpha).imag * B[ jB+2 ].imag;
530
A[ jA+2 ].imag += (*alpha).imag * B[ jB+2 ].real + (*alpha).real * B[ jB+2 ].imag;
531
A[ jA+3 ].real += (*alpha).real * B[ jB+3 ].real - (*alpha).imag * B[ jB+3 ].imag;
532
A[ jA+3 ].imag += (*alpha).imag * B[ jB+3 ].real + (*alpha).real * B[ jB+3 ].imag;
534
for ( r = m ; r < (*rows) ; r++ ){
535
A[ c * (*ald) + r ].real += (*alpha).real * B[ c * (*bld) + r ].real - (*alpha).imag * B[ c * (*bld) + r ].imag;
536
A[ c * (*ald) + r ].imag += (*alpha).imag * B[ c * (*bld) + r ].real + (*alpha).real * B[ c * (*bld) + r ].imag;
543
subroutine c_accumulate_2d_u(alpha, rows, cols, A, ald, B, bld)
545
integer c, r, ald, bld
546
complex A(ald,*), B(bld,*), alpha
548
complex x1, x2, x3, x4
550
r1 = iand(max0(rows,0),3)
552
a(r,c) = a(r,c) + alpha*b(r,c)
554
do r = r1 + 1, rows, 4
555
x1 = a(r,c) + alpha*b(r,c)
556
x2 = a(r+1,c) + alpha*b(r+1,c)
557
x3 = a(r+2,c) + alpha*b(r+2,c)
558
x4 = a(r+3,c) + alpha*b(r+3,c)
568
void c_z_accumulate_2d_u_(const dcomplex_t* const alpha,
569
const int* const rows,
570
const int* const cols,
571
dcomplex_t* restrict A,
572
const int* const ald,
573
const dcomplex_t* const B,
574
const int* const bld)
578
int m = (*rows) - ((*rows)%4);
579
for ( c = 0 ; c < (*cols) ; c++ ){
580
for ( r = 0 ; r < m ; r+=4 ){
583
A[ jA ].real += (*alpha).real * B[ jB ].real - (*alpha).imag * B[ jB ].imag;
584
A[ jA ].imag += (*alpha).imag * B[ jB ].real + (*alpha).real * B[ jB ].imag;
585
A[ jA+1 ].real += (*alpha).real * B[ jB+1 ].real - (*alpha).imag * B[ jB+1 ].imag;
586
A[ jA+1 ].imag += (*alpha).imag * B[ jB+1 ].real + (*alpha).real * B[ jB+1 ].imag;
587
A[ jA+2 ].real += (*alpha).real * B[ jB+2 ].real - (*alpha).imag * B[ jB+2 ].imag;
588
A[ jA+2 ].imag += (*alpha).imag * B[ jB+2 ].real + (*alpha).real * B[ jB+2 ].imag;
589
A[ jA+3 ].real += (*alpha).real * B[ jB+3 ].real - (*alpha).imag * B[ jB+3 ].imag;
590
A[ jA+3 ].imag += (*alpha).imag * B[ jB+3 ].real + (*alpha).real * B[ jB+3 ].imag;
592
for ( r = m ; r < (*rows) ; r++ ){
593
A[ c * (*ald) + r ].real += (*alpha).real * B[ c * (*bld) + r ].real - (*alpha).imag * B[ c * (*bld) + r ].imag;
594
A[ c * (*ald) + r ].imag += (*alpha).imag * B[ c * (*bld) + r ].real + (*alpha).real * B[ c * (*bld) + r ].imag;
601
subroutine i_accumulate_2d_u(alpha, rows, cols, A, ald, B, bld)
603
integer c, r, ald, bld
604
integer A(ald,*), B(bld,*), alpha
606
integer r1, j2, j3, j4, j5
608
r1 = iand(max0(rows,0),3)
610
a(r,c) = a(r,c) + alpha*b(r,c)
612
do r = r1 + 1, rows, 4
613
j2 = a(r,c) + alpha*b(r,c)
614
j3 = a(r+1,c) + alpha*b(r+1,c)
615
j4 = a(r+2,c) + alpha*b(r+2,c)
616
j5 = a(r+3,c) + alpha*b(r+3,c)
626
/* Integer 2-D accumulate, unrolled by 4 with remainder loop. */
void c_i_accumulate_2d_u_(const int* const alpha,
                          const int* const rows,
                          const int* const cols,
                          int* restrict A,
                          const int* const ald,
                          const int* const B,
                          const int* const bld)
{
    int r, c;
    int m = (*rows) - ((*rows)%4);
    for ( c = 0 ; c < (*cols) ; c++ ){
        for ( r = 0 ; r < m ; r+=4 ){
            A[ c * (*ald) + r   ] += (*alpha) * B[ c * (*bld) + r   ];
            A[ c * (*ald) + r+1 ] += (*alpha) * B[ c * (*bld) + r+1 ];
            A[ c * (*ald) + r+2 ] += (*alpha) * B[ c * (*bld) + r+2 ];
            A[ c * (*ald) + r+3 ] += (*alpha) * B[ c * (*bld) + r+3 ];
        }
        for ( r = m ; r < (*rows) ; r++ ){
            A[ c * (*ald) + r ] += (*alpha) * B[ c * (*bld) + r ];
        }
    }
    return;
}
650
/* Long integer 2-D accumulate, unrolled by 4 with remainder loop. */
void c_l_accumulate_2d_u_(const long* const alpha,
                          const int* const rows,
                          const int* const cols,
                          long* restrict A,
                          const int* const ald,
                          const long* const B,
                          const int* const bld)
{
    int r, c;
    int m = (*rows) - ((*rows)%4);
    for ( c = 0 ; c < (*cols) ; c++ ){
        for ( r = 0 ; r < m ; r+=4 ){
            A[ c * (*ald) + r   ] += (*alpha) * B[ c * (*bld) + r   ];
            A[ c * (*ald) + r+1 ] += (*alpha) * B[ c * (*bld) + r+1 ];
            A[ c * (*ald) + r+2 ] += (*alpha) * B[ c * (*bld) + r+2 ];
            A[ c * (*ald) + r+3 ] += (*alpha) * B[ c * (*bld) + r+3 ];
        }
        for ( r = m ; r < (*rows) ; r++ ){
            A[ c * (*ald) + r ] += (*alpha) * B[ c * (*bld) + r ];
        }
    }
    return;
}
674
/* Long long integer 2-D accumulate, unrolled by 4 with remainder loop. */
void c_ll_accumulate_2d_u_(const long long* const alpha,
                           const int* const rows,
                           const int* const cols,
                           long long* restrict A,
                           const int* const ald,
                           const long long* const B,
                           const int* const bld)
{
    int r, c;
    int m = (*rows) - ((*rows)%4);
    for ( c = 0 ; c < (*cols) ; c++ ){
        for ( r = 0 ; r < m ; r+=4 ){
            A[ c * (*ald) + r   ] += (*alpha) * B[ c * (*bld) + r   ];
            A[ c * (*ald) + r+1 ] += (*alpha) * B[ c * (*bld) + r+1 ];
            A[ c * (*ald) + r+2 ] += (*alpha) * B[ c * (*bld) + r+2 ];
            A[ c * (*ald) + r+3 ] += (*alpha) * B[ c * (*bld) + r+3 ];
        }
        for ( r = m ; r < (*rows) ; r++ ){
            A[ c * (*ald) + r ] += (*alpha) * B[ c * (*bld) + r ];
        }
    }
    return;
}
699
c---------- operations used in armci gops --------------
701
subroutine fort_dadd(n, x, work)
703
double precision x(n), work(n)
705
x(i) = x(i) + work(i)
710
/* Elementwise add used in armci gops: x[i] += work[i] for i in [0,*n). */
void c_dadd_(const int* const restrict n,
             double* const restrict x,
             const double* const restrict work)
{
    int i;
    for ( i = 0 ; i < (*n) ; i++ ){
        x[i] += work[i];
    }
    return;
}
722
subroutine fort_dadd2(n, x, work, work2)
724
double precision x(n), work(n), work2(n)
726
x(i) = work(i) + work2(i)
731
/* Elementwise add of two work arrays: x[i] = work[i] + work2[i]. */
void c_dadd2_(const int* const restrict n,
              double* const restrict x,
              const double* const restrict work,
              const double* const restrict work2)
{
    int i;
    for ( i = 0 ; i < (*n) ; i++ ){
        x[i] = work[i] + work2[i];
    }
    return;
}
744
subroutine fort_dmult(n, x, work)
746
double precision x(n), work(n)
748
x(i) = x(i) * work(i)
753
/* Elementwise multiply used in armci gops: x[i] *= work[i] for i in [0,*n). */
void c_dmult_(const int* const restrict n,
              double* const restrict x,
              const double* const restrict work)
{
    int i;
    for ( i = 0 ; i < (*n) ; i++ ){
        x[i] *= work[i];
    }
    return;
}
765
subroutine fort_dmult2(n, x, work,work2)
767
double precision x(n), work(n)
769
x(i) = work(i)*work2(i)
774
/* Elementwise multiply of two work arrays: x[i] = work[i] * work2[i]. */
void c_dmult2_(const int* const restrict n,
               double* const restrict x,
               const double* const restrict work,
               const double* const restrict work2)
{
    int i;
    for ( i = 0 ; i < (*n) ; i++ ){
        x[i] = work[i] * work2[i];
    }
    return;
}