1
/* ---------------------------------------------------------------------
3
* -- Automatically Tuned Linear Algebra Software (ATLAS)
4
* (C) Copyright 2000 All Rights Reserved
6
* -- ATLAS routine -- Version 3.2 -- December 25, 2000
8
* Author : Antoine P. Petitet
9
* Contributor(s) : R. Clint Whaley
10
* Originally developed at the University of Tennessee,
11
* Innovative Computing Laboratory, Knoxville TN, 37996-1301, USA.
13
* ---------------------------------------------------------------------
15
* -- Copyright notice and Licensing terms:
17
* Redistribution and use in source and binary forms, with or without
18
* modification, are permitted provided that the following conditions
21
* 1. Redistributions of source code must retain the above copyright
22
* notice, this list of conditions and the following disclaimer.
23
* 2. Redistributions in binary form must reproduce the above copyright
24
* notice, this list of conditions, and the following disclaimer in
25
* the documentation and/or other materials provided with the distri-
27
* 3. The name of the University, the ATLAS group, or the names of its
28
* contributors may not be used to endorse or promote products deri-
29
* ved from this software without specific written permission.
33
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
34
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
35
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
36
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
37
* OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE-
38
* CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
39
* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
40
* OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEO-
41
* RY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (IN-
42
* CLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
43
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
45
* ---------------------------------------------------------------------
50
1
#include "atlas_misc.h"
51
2
#include "atlas_level1.h"
52
#include "atlas_kernel2.h"
53
3
#include "atlas_lvl2.h"
57
#include "atlas_reflvl2.h" /* temporary for building purposes */
58
#include "atlas_reflevel2.h" /* used for gbmv, gpmv and gpr. */
60
void Mjoin( PATL, syr2 )
62
const enum ATLAS_UPLO UPLO,
77
* Mjoin( PATL, syr2 ) performs the symmetric rank 2 operation
79
* A := alpha * x * y' + alpha * y * x' + A,
81
* where alpha is a scalar, x and y are n-element vectors and A is an n
82
* by n symmetric matrix.
84
* This is a blocked version of the algorithm. For a more detailed des-
85
* cription of the arguments of this function, see the reference imple-
86
* mentation in the ATLAS/src/blas/reference directory.
88
* ---------------------------------------------------------------------
91
* .. Local Variables ..
97
const TYPE one[2] = { ATL_rone, ATL_rzero };
98
const int lda2 = ( LDA SHIFT );
100
TYPE * A0, * A1, * x, * x0, * x1, * y, * y0, * y1;
101
void * vx = NULL, * vy = NULL;
102
int incA, incA0, incA1, incXY, incXY1, j, jb,
105
#define ger1 Mjoin( PATL, ger1_a1_x1_yX )
107
#define ger1 Mjoin( PATL, ger1u_a1_x1_yX )
110
* .. Executable Statements ..
113
if( ( N == 0 ) || ( SCALAR_IS_ZERO( ALPHA ) ) ) return;
117
vx = (TYPE *)malloc( ATL_Cachelen + ATL_MulBySize( N ) );
118
ATL_assert( vx ); x = ATL_AlignPtr( vx );
119
Mjoin( PATL, cpsc )( N, ALPHA, X, INCX, x, 1 );
121
else { x = (TYPE *)(X); }
123
if( ( ( vx != NULL ) && ( INCY != 1 ) ) ||
124
( ( vx == NULL ) && ( !( SCALAR_IS_ONE( ALPHA ) ) || ( INCY != 1 ) ) ) )
126
vy = (TYPE *)malloc( ATL_Cachelen + ATL_MulBySize( N ) );
127
ATL_assert( vy ); y = ATL_AlignPtr( vy );
128
if( ( vx != NULL ) && ( INCY != 1 ) )
130
Mjoin( PATL, copy )( N, Y, INCY, y, 1 );
4
#include "atlas_reflvl2.h"
5
#include "atlas_reflevel2.h"
7
#include Mstr(Mjoin(Mjoin(atlas_,PRE),syr2_L1.h))
8
#define ATL_syr2 Mjoin(PATL,syr2_L1)
9
#elif defined(ATL_INL2)
10
#include Mstr(Mjoin(Mjoin(atlas_,PRE),syr2_L2.h))
11
#define ATL_syr2 Mjoin(PATL,syr2_L2)
13
#include Mstr(Mjoin(Mjoin(atlas_,PRE),syr2.h))
14
#define ATL_syr2 Mjoin(PATL,syr2)
18
extern int ATL_KERN_NX;
19
#define ATL_S2NX ATL_KERN_NX
21
#include Mstr(Mjoin(Mjoin(atlas_,PRE),syr2NX.h))
28
void Mjoin(PATL,syr2_kU)
30
ATL_r2kern_t gerk0, /* func ptr to selected GER kernel */
31
ATL_CINT N, /* size of prob to solve */
32
const TYPE *x, /* vector x -- alpha applied to x or y */
33
const TYPE *y, /* vector y -- alpha applied to x or y */
34
TYPE *A, /* symmetric matrix, A = A + x*y^T + y*x^T */
35
ATL_CINT lda /* row stride of A */
38
ATL_r2kern_t gerk=gerk0;
39
ATL_INT nx=(ATL_S2NX/ATL_s2U_NU)*ATL_s2U_NU, j;
40
ATL_CINT NN = (N/ATL_s2U_NU)*ATL_s2U_NU;
42
nx = (ATL_S2NX >= ATL_s2U_NU) ? (ATL_S2NX/ATL_s2U_NU)*ATL_s2U_NU
45
Mjoin(PATL,refsyr2U)(nx, ATL_rone, x, 1, y, 1, A, lda);
48
for (j=nx; j < NN; j += ATL_s2U_NU)
50
#if ATL_MIN_RESTRICTED_M > 0
51
gerk = (j >= ATL_MIN_RESTRICTED_M) ? gerk0 : ATL_GENGERK;
53
gerk(j, ATL_s2U_NU, x, y+j, y, x+j, A+j*lda, lda);
54
ATL_SYR2U_nu(A+j*(lda+1), lda, x+j, y+j);
59
ATL_GENGERK(NN, nx, x, y+NN, y, x+NN, A+NN*lda, lda);
60
Mjoin(PATL,refsyr2U)(nx, ATL_rone, x+NN, 1, y+NN, 1, A+NN*(lda+1), lda);
64
void Mjoin(PATL,syr2_kL)
66
ATL_r2kern_t gerk0, /* func ptr to selected GER kernel */
67
ATL_CINT N, /* size of prob to solve */
68
const TYPE *x, /* vector x -- alpha applied to x or y */
69
const TYPE *y, /* vector y -- alpha applied to x or y */
70
TYPE *A, /* symmetric matrix, A = A + x*y^T + y*x^T */
71
ATL_CINT lda /* row stride of A */
74
ATL_r2kern_t gerk=gerk0;
75
ATL_INT nx=Mmin(ATL_S2NX,N), i, NN, n;
76
ATL_CINT incA = ATL_s2L_NU*(lda+1);
79
i = (i/ATL_s2L_NU)*ATL_s2L_NU;
83
for (i=0; i < NN; i += ATL_s2L_NU)
85
ATL_SYR2L_nu(A, lda, x, y);
87
#if ATL_MIN_RESTRICTED_M > 0
88
gerk = (n >= ATL_MIN_RESTRICTED_M) ? gerk0 : ATL_GENGERK;
90
gerk(n, ATL_s2L_NU, x+ATL_s2L_NU, y, y+ATL_s2L_NU, x, A+ATL_s2L_NU, lda);
95
Mjoin(PATL,refsyr2L)(nx, ATL_rone, x, 1, y, 1, A, lda);
98
void Mjoin(PATL,syr2)(const enum ATLAS_UPLO Uplo, ATL_CINT N,
99
const SCALAR alpha, const TYPE *X, ATL_CINT incX,
100
const TYPE *Y, ATL_CINT incY, TYPE *A, ATL_CINT lda)
103
TYPE *x, *xt, *y, *yt;
106
const int ALP1 = (alpha == ATL_rone);
107
int nu = (Uplo == AtlasUpper) ? ATL_s2U_NU : ATL_s2L_NU;
108
int mu, minM, minN, alignX, alignXt, FNU, ALIGNX2A, COPYX, COPYY;
109
int ApplyAlphaToXt=0, ApplyAlphaToYt=0, YisYt, XisXt, COPYXt, COPYYt;
112
if (N < 1 || SCALAR_IS_ZERO(alpha))
115
* For small problems, avoid overhead of func calls & data copy
119
Mjoin(PATL,refsyr2)(Uplo, N, alpha, X, incX, Y, incY, A, lda);
123
* Determine the GER kernel to use, and its parameters
125
gerk = ATL_GetR2Kern(N-nu, nu, A, lda, &mu, &nu, &minM, &minN, &alignX,
126
&ALIGNX2A, &alignXt, &FNU, &CacheElts);
128
* See if it is OK to have transpose vectors same as no-transpose
130
YisYt = XisXt = ALP1;
131
if (!YisYt && alignXt > sizeof(TYPE)) /* align rest may prevent */
135
const size_t t1 = (size_t) A;
136
if ((t1/alignXt)*alignXt != t1)
139
else if (alignXt > alignX)
141
if ((alignXt/alignX)*alignX != alignXt)
146
else if ((alignX/alignXt)*alignXt != alignX)
150
* See if we have to copy the no-transpose vectors
153
if (!COPYY) /* may still need to copy due to alignment issues */
156
* ATL_Cachelen is the highest alignment that can be requested, so
157
* make Y's modulo with Cachelen match that of A if you want A & Y
158
* to have the same alignment
162
const size_t t1 = (size_t) A, t2 = (size_t) Y;
163
COPYY = (t1 - ATL_MulByCachelen(ATL_DivByCachelen(t1))) !=
164
(t2 - ATL_MulByCachelen(ATL_DivByCachelen(t2)));
168
size_t t1 = (size_t) Y;
169
COPYY = ((t1/alignX)*alignX != t1);
173
if (!COPYX) /* may still need to copy due to alignment issues */
176
* ATL_Cachelen is the highest alignment that can be requested, so
177
* make X's modulo with Cachelen match that of A if you want A & X
178
* to have the same alignment
182
size_t t1 = (size_t) A, t2 = (size_t) X;
183
COPYX = (t1 - ATL_MulByCachelen(ATL_DivByCachelen(t1))) !=
184
(t2 - ATL_MulByCachelen(ATL_DivByCachelen(t2)));
188
size_t t1 = (size_t) X;
189
COPYX = ((t1/alignX)*alignX != t1);
193
* See if we have to copy the transpose vectors
195
COPYYt = (incY != 1);
196
if (!COPYYt && alignXt > sizeof(TYPE))
197
{ /* may still need copy due to alignment issues */
198
size_t t1 = (size_t) Y;
199
COPYYt = ((t1/alignXt)*alignXt != t1);
201
COPYXt = (incX != 1);
202
if (!COPYXt && alignXt > sizeof(TYPE))
203
{ /* may still need copy due to alignment issues */
204
size_t t1 = (size_t) X;
205
COPYXt = ((t1/alignXt)*alignXt != t1);
208
* See if applying alpha will force a copy; must apply alpha to either
209
* no-transpose or transpose vectors, not mixture
213
if (!COPYX && !COPYXt)
134
Mjoin( PATL, cpsc )( N, ALPHA, Y, INCY, y, 1 );
137
else { y = (TYPE *)(Y); }
139
ATL_GetPartR1( A, LDA, mb, nb );
141
mb1 = N - ( ( N - 1 ) / mb ) * mb;
142
incA = ( incXY = (mb SHIFT) ) + mb * lda2;
143
incA1 = nb * lda2; incXY1 = (nb SHIFT);
145
if( UPLO == AtlasLower )
149
Mjoin( PATL, syr2L )( mb1, x, y, A, LDA );
151
A0 = (TYPE *)( A += (mb1 SHIFT) ); A += mb1 * lda2;
152
x0 = x; x += (mb1 SHIFT);
153
y0 = y; y += (mb1 SHIFT);
156
n += mb, A0 += incA0, A += incA, x += incXY, y += incXY )
158
for( j = 0, A1 = A0, x1 = x0, y1 = y0; j < n;
159
j += nb, A1 += incA1, x1 += incXY1, y1 += incXY1 )
161
jb = n - j; jb = Mmin( jb, nb );
162
ger1( mb, jb, one, x, 1, y1, 1, A1, LDA );
163
ger1( mb, jb, one, y, 1, x1, 1, A1, LDA );
165
Mjoin( PATL, syr2L )( mb, x, y, A, LDA );
216
ApplyAlphaToXt = !COPYX;
218
COPYYt = ApplyAlphaToYt = 1;
219
else /* must apply alpha to Y */
223
* Compute amount of space necessary to allocate any needed vectors
225
len = (!YisYt) ? (COPYY + COPYYt) : (COPYY || COPYYt);
226
len += (!XisXt) ? (COPYX + COPYXt) : (COPYX || COPYXt);
227
len *= ATL_MulBySize(N) + ATL_Cachelen;
233
tp = vp = malloc(len);
236
Mjoin(PATL,refsyr2)(Uplo, N, alpha, X, incX, Y, incY, A, lda);
243
tp = y = yt = (ALIGNX2A)?ATL_Align2Ptr(tp, A):ATL_AlignPtr(tp);
247
tp = yt = ATL_AlignPtr(tp);
252
tp = y = ALIGNX2A ? ATL_Align2Ptr(tp, A) : ATL_AlignPtr(tp);
259
tp = x = xt = (ALIGNX2A)?ATL_Align2Ptr(tp, A):ATL_AlignPtr(tp);
263
tp = xt = ATL_AlignPtr(tp);
267
x = ALIGNX2A ? ATL_Align2Ptr(tp, A) : ATL_AlignPtr(tp);
270
* Copy vector(s) to workspace with one pass through the input vectors
276
if (ALP1) /* no scaling */
279
for (i=0; i < N; i++, X += incX)
282
else /* when both vecs copied, apply alpha to one of them */
285
const register TYPE ra=alpha;
297
for (i=0; i < N; i++, X += incX)
299
const register TYPE rx=(*X);
308
Mjoin(PATL,cpsc)(N, alpha, X, incX, xt, 1);
310
Mjoin(PATL,copy)(N, X, incX, xt, 1);
315
Mjoin(PATL,cpsc)(N, alpha, X, incX, x, 1);
317
Mjoin(PATL,copy)(N, X, incX, x, 1);
324
if (ALP1) /* no scaling */
327
for (i=0; i < N; i++, Y += incY)
330
else /* when both vecs copied, apply alpha to transposed vec */
333
const register TYPE ra=alpha;
345
for (i=0; i < N; i++, Y += incY)
347
const register TYPE ry=(*Y);
356
Mjoin(PATL,cpsc)(N, alpha, Y, incY, yt, 1);
358
Mjoin(PATL,copy)(N, Y, incY, yt, 1);
363
Mjoin(PATL,cpsc)(N, alpha, Y, incY, y, 1);
365
Mjoin(PATL,copy)(N, Y, incY, y, 1);
368
if (Uplo == AtlasUpper)
369
Mjoin(PATL,syr2_kU)(gerk, N, x, yt, A, lda);
171
A0 = (TYPE *)(A) + mb * lda2, x0 = x + incXY, y0 = y + incXY;
174
A += incA, A0 += incA,
175
x += incXY, x0 += incXY, y += incXY, y0 += incXY )
177
Mjoin( PATL, syr2U )( mb, x, y, A, LDA );
179
for( j = 0, A1 = A0, x1 = x0, y1 = y0; j < n;
180
j += nb, A1 += incA1, x1 += incXY1, y1 += incXY1 )
182
jb = n - j; jb = Mmin( jb, nb );
183
ger1( mb, jb, one, x, 1, y1, 1, A1, LDA );
184
ger1( mb, jb, one, y, 1, x1, 1, A1, LDA );
187
Mjoin( PATL, syr2U )( mb1, x, y, A, LDA );
193
* End of Mjoin( PATL, syr2 )
371
Mjoin(PATL,syr2_kL)(gerk, N, x, yt, A, lda);