/* ---------------------------------------------------------------------
 * -- Automatically Tuned Linear Algebra Software (ATLAS)
 *    (C) Copyright 2000 All Rights Reserved
 * -- ATLAS routine -- Version 3.2 -- December 25, 2000
 *
 * Author         : Antoine P. Petitet
 * Contributor(s) : R. Clint Whaley
 * Originally developed at the University of Tennessee,
 * Innovative Computing Laboratory, Knoxville TN, 37996-1301, USA.
 * ---------------------------------------------------------------------
 * -- Copyright notice and Licensing terms:
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions, and the following disclaimer in
 *    the documentation and/or other materials provided with the distri-
 *    bution.
 * 3. The name of the University, the ATLAS group, or the names of its
 *    contributors may not be used to endorse or promote products deri-
 *    ved from this software without specific written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
 * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE-
 * CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
 * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEO-
 * RY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (IN-
 * CLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 * ---------------------------------------------------------------------
 */
/*
 * Automatically Tuned Linear Algebra Software v3.10.1
 * Copyright (C) 2010, 2012 R. Clint Whaley
 *
 * Code contributers : R. Clint Whaley, Antoine P. Petitet
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions, and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *   3. The name of the ATLAS group or the names of its contributers may
 *      not be used to endorse or promote products derived from this
 *      software without specific written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
50
32
#include "atlas_misc.h"
51
33
#include "atlas_level1.h"
52
34
#include "atlas_kernel2.h"
53
35
#include "atlas_lvl2.h"
57
37
#include "atlas_reflvl2.h" /* temporary for building purposes */
58
38
#include "atlas_reflevel2.h" /* used for gbmv, gpmv and gpr. */
60
/* NOTE(review): CORRUPTED SPAN. This file is an interleaved merge of two
 * versions of ATLAS's ATL_hemv.c (v3.2 and v3.10.x); every code line is
 * followed by a stray bare integer (the donor file's line number), and
 * structural lines are missing. Below: a truncated hemv() signature,
 * version-specific includes, and a MY_CE / USE_GEMV_BASED preprocessor
 * cascade whose #elif/#else/#endif lines were lost. Do not edit in place;
 * restore from a pristine ATLAS 3.10.x ATL_hemv.c. */
void Mjoin( PATL, hemv )
62
const enum ATLAS_UPLO UPLO,
39
#include Mstr(Mjoin(Mjoin(atlas_,PRE),sysinfo.h))
40
#include "atlas_cacheedge.h"
43
/* MY_CE: working cache size used to block the GEMV sweep; falls back to
 * 4x the L1 capacity when CacheEdge is unset/zero (conditional structure
 * incomplete here — TODO confirm against upstream). */
#if !defined(CacheEdge) || CacheEdge == 0
44
#define MY_CE (4*ATL_MulBySize(ATL_L1elts))
46
#define MY_CE CacheEdge
49
#define MY_CE (4*ATL_MulBySize(ATL_L1elts))
53
* Guess as to whether GEMV-based SYRK will beat ref. Will on any arch
54
* where the compiler is not too whoopy (ARM) or where vectorization provides
55
* faster memory load as well as computation (x86 with vectorization).
56
* Single precision complex has enough flops/load to be almost always
57
* compute-bound enough that the auto-GEMV-tuning outweighs the double load.
59
#if defined(ATL_GAS_ARM)
60
#define USE_GEMV_BASED 1
63
#define USE_GEMV_BASED 1
67
#define USE_GEMV_BASED 1
70
#define USE_GEMV_BASED 1
73
#define USE_GEMV_BASED 1
78
/* ATL_symvK_t: pointer to a SYMV-style kernel
 * (Uplo, N, alpha, A, lda, X, incX, beta, Y, incY). */
typedef void (*ATL_symvK_t)
79
(const enum ATLAS_UPLO, const int, const SCALAR, const TYPE*, const int,
80
const TYPE*, const int, const SCALAR, TYPE*, const int);
95
/* NOTE(review): CORRUPTED SPAN. The lines below are the headerless body of
 * the Lower-triangular blocked driver (presumably ATL_symvL — its function
 * header, braces, and the if/else keywords around the gemv selection were
 * lost in the merge; stray integer lines are extraction artifacts).
 * Visible logic: pick gemvT/gemvN kernel variants by operand size relative
 * to L1 capacity, choose MB so the A-panel is retained in L2 for the second
 * GEMV, then for each NB-wide diagonal block call symvK on the diagonal and
 * sweep the below-diagonal panel with gemvT/gemvN in MB-tall strips. */
const TYPE one[2] = {ATL_rone, ATL_rzero};
97
ATL_INT Mmb, mr, MB, j;
98
const size_t incA = (NB SHIFT)*lda;
99
const size_t opsize = ((size_t)(N+8)*(N+4))*(sizeof(TYPE)>>1)SHIFT;
100
void (*gemvT)(ATL_CINT, ATL_CINT, const SCALAR, const TYPE*, ATL_CINT,
101
const TYPE*, ATL_CINT, const SCALAR, TYPE*, ATL_CINT);
102
void (*gemvN)(ATL_CINT, ATL_CINT, const SCALAR, const TYPE*, ATL_CINT,
103
const TYPE*, ATL_CINT, const SCALAR, TYPE*, ATL_CINT);
107
gemvT = Mjoin(PATL,gemvT);
108
gemvN = Mjoin(PATL,gemvN_L2);
110
else if (opsize <= ATL_MulBySize(ATL_L1elts))
112
gemvT = Mjoin(PATL,gemvT_L1);
113
gemvN = Mjoin(PATL,gemvN_L1);
117
gemvT = Mjoin(PATL,gemvT_L2);
118
gemvN = Mjoin(PATL,gemvN_L2);
121
* Choose MB such that A is retained in L2 cache for second GEMV call
122
* If partial block is tiny, absorbe it into last block since cache is not
125
MB = ATL_DivBySize(MY_CE) / NB;
126
MB = (MB > N || MB < 240) ? N : MB;
128
for (j=0; j < N; j += NB, A += incA)
130
const register size_t j2 = j+j;
131
register int i, nb=N-j;
132
nb = (nb >= NB) ? NB : nb;
133
symvK(AtlasLower, nb, one, A+j2, lda, x+j2, 1, one, y+j2, 1);
134
for (i=j+nb; i < N; i += MB)
136
const register size_t i2 = i+i;
137
register int mb = N-i;
138
mb = (mb >= MB) ? MB : mb;
139
gemvT(mb, nb, one, A+i2, lda, xt+i2, 1, one, yt+j2, 1);
140
gemvN(mb, nb, one, A+i2, lda, x+j2, 1, one, y+i2, 1);
144
/* NOTE(review): CORRUPTED SPAN. ATL_symvU's header keyword survives but its
 * parameter list, braces, and surrounding if/else keywords were lost in the
 * merge; stray integer lines are extraction artifacts. Mirror of the Lower
 * driver above: for each NB-wide diagonal block, first sweep the
 * above-diagonal panel (i < j) with gemvT/gemvN in MB strips, then call
 * symvK(AtlasUpper, ...) on the diagonal block. Restore from a pristine
 * ATLAS 3.10.x ATL_hemv.c rather than editing in place. */
static void ATL_symvU
157
const TYPE one[2] = {ATL_rone, ATL_rzero};
159
ATL_INT Mmb, mr, MB, j;
160
const size_t incA = (NB SHIFT)*lda;
161
const size_t opsize = ((size_t)(N+8)*(N+4))*(sizeof(TYPE)>>1)SHIFT;
162
void (*gemvT)(ATL_CINT, ATL_CINT, const SCALAR, const TYPE*, ATL_CINT,
163
const TYPE*, ATL_CINT, const SCALAR, TYPE*, ATL_CINT);
164
void (*gemvN)(ATL_CINT, ATL_CINT, const SCALAR, const TYPE*, ATL_CINT,
165
const TYPE*, ATL_CINT, const SCALAR, TYPE*, ATL_CINT);
169
gemvT = Mjoin(PATL,gemvT);
170
gemvN = Mjoin(PATL,gemvN_L2);
172
else if (opsize <= ATL_MulBySize(ATL_L1elts))
174
gemvT = Mjoin(PATL,gemvT_L1);
175
gemvN = Mjoin(PATL,gemvN_L1);
179
gemvT = Mjoin(PATL,gemvT_L2);
180
gemvN = Mjoin(PATL,gemvN_L2);
183
* Choose MB such that A is retained in L2 cache for second GEMV call
184
* If partial block is tiny, absorbe it into last block since cache is not
187
MB = ATL_DivBySize(MY_CE) / NB;
188
MB = (MB > N || MB < 240) ? N : MB;
190
for (j=0; j < N; j += NB, A += incA)
192
const register size_t j2 = j+j;
193
register int i, nb=N-j;
194
nb = (nb >= NB) ? NB : nb;
195
for (i=0; i < j; i += MB)
197
const register size_t i2 = i+i;
198
register int mb = j-i;
199
mb = (mb >= MB) ? MB : mb;
200
gemvT(mb, nb, one, A+i2, lda, xt+i2, 1, one, yt+j2, 1);
201
gemvN(mb, nb, one, A+i2, lda, x+j2, 1, one, y+i2, 1);
203
symvK(AtlasUpper, nb, one, A+j2, lda, x+j2, 1, one, y+j2, 1);
208
/* NOTE(review): CORRUPTED SPAN. This is the v3.10.x Mjoin(PATL,hemv) entry
 * point, but its parameter list is truncated and declarations from the OLD
 * v3.2 implementation are spliced into it (duplicate vx/vy and A0/A1/x0/...
 * declarations; a "#define zero ATL_rzero" colliding with the zero[2] array
 * two lines later). Visible v3.10.x flow: classify alpha/beta (BETA0/BETA1/
 * ALPHA1/ALPHA0), quick-return/scale for degenerate cases, then under
 * USE_GEMV_BASED allocate aligned work vectors, build x (alpha-scaled) and
 * xh (conjugated copy), zero/scale the y workspaces, run ATL_symvL or
 * ATL_symvU with the reference hemv as diagonal kernel (NB=120), and combine
 * results back into Y via axpbyConj/axpby/axpyConj. On malloc failure it
 * falls back to Mjoin(PATL,refhemv). Braces/else lines are missing
 * throughout; restore from upstream. */
void Mjoin(PATL,hemv)
210
const enum ATLAS_UPLO Uplo,
89
236
* ---------------------------------------------------------------------
92
* .. Local Variables ..
94
void (*gemv0)( const int, const int, const SCALAR,
95
const TYPE *, const int, const TYPE *, const int,
96
const SCALAR, TYPE *, const int );
97
void (*gemv1)( const int, const int, const SCALAR,
98
const TYPE *, const int, const TYPE *, const int,
99
const SCALAR, TYPE *, const int );
100
void (*gemvN)( const int, const int, const SCALAR,
101
const TYPE *, const int, const TYPE *, const int,
102
const SCALAR, TYPE *, const int );
107
#define zero ATL_rzero
108
void * vx = NULL, * vy = NULL;
110
TYPE * A0, * A1, * x0, * x1, * y00, * y0, * y1;
112
const int lda2 = ( LDA SHIFT );
113
const TYPE * alphaY, * beta0;
114
const TYPE one [2] = { ATL_rone, ATL_rzero },
115
zero[2] = { ATL_rzero, ATL_rzero };
116
void * vx = NULL, * vy = NULL;
118
TYPE * A0, * A1, * x0, * x1, * y00, * y0, * y1;
239
const int BETA0 = (*beta == ATL_rzero && beta[1] == ATL_rzero);
240
const int BETA1 = (*beta == ATL_rone && beta[1] == ATL_rzero);
241
const int ALPHA1 = (*alpha == ATL_rone && alpha[1] == ATL_rzero);
242
const int ALPHA0 = (*alpha == ATL_rzero && alpha[1] == ATL_rzero);
243
if (N <= 0 || (ALPHA0 && BETA1))
248
Mjoin(PATL,zero)(N, Y, incY);
250
Mjoin(PATL,scal)(N, beta, Y, incY);
253
#ifdef USE_GEMV_BASED
257
TYPE *x=(TYPE*)X, *y=Y, *xh, *yh;
258
const size_t tX = (size_t)X, tY = (size_t)Y, N2 = N+N;
259
const int COPYY = !(incY == 1 &&
260
(ATL_MulByCachelen(ATL_DivByCachelen(tY)) == tY));
261
const int COPYX = !(incX == 1 && (COPYY || ALPHA1) &&
262
(ATL_MulByCachelen(ATL_DivByCachelen(tX)) == tX));
263
const TYPE one[2] = {ATL_rone, ATL_rzero};
264
const TYPE *calp=one, *cbet=one;
266
tp = vp = malloc((COPYX+COPYY+2)*(ATL_Cachelen+ATL_MulBySize(N)));
269
Mjoin(PATL,refhemv)(Uplo, N, alpha, A, lda, X, incX, beta, Y, incY);
272
yh = ATL_AlignPtr(tp);
273
Mjoin(PATL,zero)(N, yh, 1);
275
xh = ATL_AlignPtr(tp);
279
x = ATL_AlignPtr(tp);
283
register const size_t incX2 = incX+incX;
285
for (i=0; i < N2; i += 2, xx += incX2)
288
xh[i+1] = -(x[i+1] = xx[1]);
291
else if (alpha[1] == ATL_rzero)
294
register const size_t incX2 = incX+incX;
295
register const TYPE ra=(*alpha), ia=alpha[1];
297
for (i=0; i < N2; i += 2, xx += incX2)
299
register TYPE rx = *xx, ix = xx[1];
309
register const size_t incX2 = incX+incX;
310
register const TYPE ra=(*alpha), ia=alpha[1];
312
for (i=0; i < N2; i += 2, xx += incX2)
314
register TYPE rx = *xx, ix = xx[1];
315
x[i] = rx*ra - ix*ia;
316
x[i+1] = rx*ia + ix*ra;
324
Mjoin(PATL,copyConj)(N, X, incX, xh, 1);
329
y = ATL_AlignPtr(tp);
330
Mjoin(PATL,zero)(N, y, 1);
333
Mjoin(PATL,zero)(N, y, 1);
335
Mjoin(PATL,scal)(N, beta, y, 1);
336
if (Uplo == AtlasLower)
337
ATL_symvL(Mjoin(PATL,refhemv), 120, N, A, lda, x, y, xh, yh);
339
ATL_symvU(Mjoin(PATL,refhemv), 120, N, A, lda, x, y, xh, yh);
342
Mjoin(PATL,axpbyConj)(N, alpha, yh, 1, calp, y, 1);
343
Mjoin(PATL,axpby)(N, one, y, 1, cbet, Y, incY);
346
Mjoin(PATL,axpyConj)(N, alpha, yh, 1, Y, incY);
120
/* NOTE(review): CORRUPTED SPAN. Headerless body of the OLD v3.2 hemv
 * implementation (uppercase UPLO/ALPHA/BETA/INCX/INCY argument style),
 * spliced after the v3.10.x code above; braces, else keywords, and the
 * final epilogue lines were lost, and stray integer lines are extraction
 * artifacts. Visible flow: quick-return scaling when ALPHA==0; copy/scale
 * X into an aligned buffer when needed (cpsc) and likewise stage Y; select
 * conjugated gemvC_* kernels by beta class; partition via ATL_GetPartSYMV;
 * then a blocked Upper or Lower sweep pairing hemvU/hemvL diagonal kernels
 * with gemv0/gemvN panel updates, finishing with axpby back into Y and
 * freeing the work buffers. The trailing refhemv call (last line) belongs
 * to the v3.10.x malloc-failure path, further evidence of the bad merge. */
int incA, incA1, incXY, incXY1, j, jb, mb, mb1, n, nb;
122
* .. Executable Statements ..
127
if( SCALAR_IS_ZERO( ALPHA ) )
129
if( !( SCALAR_IS_ONE( BETA ) ) ) Mjoin( PATL, scal )( N, BETA, Y, INCY );
133
if( ( INCX != 1 ) || ( ( INCY == 1 ) && !( SCALAR_IS_ONE( ALPHA ) ) ) )
135
vx = (void *)malloc( ATL_Cachelen + ATL_MulBySize( N ) );
136
ATL_assert( vx ); x = ATL_AlignPtr( vx );
137
Mjoin( PATL, cpsc )( N, ALPHA, X, INCX, x, 1 );
140
else { x = (TYPE *)(X); alphaY = ALPHA; }
142
if( ( INCY != 1 ) || !( SCALAR_IS_ONE( alphaY ) ) )
144
vy = malloc( ATL_Cachelen + ATL_MulBySize( N ) );
145
ATL_assert( vy ); y00 = y = ATL_AlignPtr( vy );
148
else { y00 = y = (TYPE *)(Y); beta0 = BETA; }
150
if( SCALAR_IS_ZERO( beta0 ) ) gemv0 = Mjoin( PATL, gemvC_a1_x1_b0_y1 );
151
else if( SCALAR_IS_ONE ( beta0 ) ) gemv0 = Mjoin( PATL, gemvC_a1_x1_b1_y1 );
152
else gemv0 = Mjoin( PATL, gemvC_a1_x1_bX_y1 );
153
gemv1 = Mjoin( PATL, gemvC_a1_x1_b1_y1 );
154
gemvN = Mjoin( PATL, gemvS_a1_x1_b1_y1 );
156
ATL_GetPartSYMV( A, LDA, &mb, &nb );
158
mb1 = N - ( ( N - 1 ) / mb ) * mb;
159
incA1 = nb * lda2; incXY1 = (nb SHIFT);
161
if( UPLO == AtlasUpper )
163
incA = ( incXY = (mb SHIFT) ) + mb * lda2;
164
A0 = (TYPE *)(A) + mb * lda2; x0 = x + incXY; y0 = y + incXY;
166
for( n = N - mb; n > 0; n -= mb, A0 += incA, A += incA, x0 += incXY,
167
x += incXY, y0 += incXY, y += incXY )
169
Mjoin( PATL, hemvU )( mb, A, LDA, x, beta0, y );
171
for( j = 0, A1 = A0, x1 = x0, y1 = y0; j < n;
172
j += nb, A1 += incA1, x1 += incXY1, y1 += incXY1 )
174
jb = n - j; jb = Mmin( jb, nb );
175
gemv0( jb, mb, one, A1, LDA, x, 1, beta0, y1, 1 );
176
gemvN( mb, jb, one, A1, LDA, x1, 1, one, y, 1 );
178
beta0 = one; gemv0 = gemv1;
180
Mjoin( PATL, hemvU )( mb1, A, LDA, x, beta0, y );
184
incA = incXY = (mb SHIFT);
185
A0 = (TYPE *)(A); x0 = x; y0 = y;
187
for( n = N - mb, A += ((N-mb) SHIFT), x += ((N-mb) SHIFT),
188
y += ((N-mb) SHIFT); n > 0; n -= mb, A -= incA, x -= incXY,
191
Mjoin( PATL, hemvL )( mb, A+n*lda2, LDA, x, beta0, y );
193
for( j = 0, A1 = (TYPE *)(A), x1 = x0, y1 = y0; j < n;
194
j += nb, A1 += incA1, x1 += incXY1, y1 += incXY1 )
196
jb = n - j; jb = Mmin( jb, nb );
197
gemv0( jb, mb, one, A1, LDA, x, 1, beta0, y1, 1 );
198
gemvN( mb, jb, one, A1, LDA, x1, 1, one, y, 1 );
200
beta0 = one; gemv0 = gemv1;
202
Mjoin( PATL, hemvL )( mb1, A0, LDA, x0, beta0, y0 );
207
{ Mjoin( PATL, axpby )( N, alphaY, y00, 1, BETA, Y, INCY ); free( vy ); }
209
* End of Mjoin( PATL, hemv )
351
Mjoin(PATL,refhemv)(Uplo, N, alpha, A, lda, X, incX, beta, Y, incY);