2
* Automatically Tuned Linear Algebra Software v3.8.4
3
* (C) Copyright 1999 R. Clint Whaley
5
* Redistribution and use in source and binary forms, with or without
6
* modification, are permitted provided that the following conditions
8
* 1. Redistributions of source code must retain the above copyright
9
* notice, this list of conditions and the following disclaimer.
10
* 2. Redistributions in binary form must reproduce the above copyright
11
* notice, this list of conditions, and the following disclaimer in the
12
* documentation and/or other materials provided with the distribution.
13
* 3. The name of the ATLAS group or the names of its contributers may
14
* not be used to endorse or promote products derived from this
15
* software without specific written permission.
17
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
19
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
20
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
21
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27
* POSSIBILITY OF SUCH DAMAGE.
31
#include "atlas_misc.h"
32
#include "atlas_level1.h"
33
#include "atlas_level2.h"
35
static void gemvT_Nsmall(const int M, const int N, const TYPE *A, const int lda,
36
const TYPE *X, const SCALAR beta, TYPE *Y)
39
register TYPE x0, x1, x2, x3, x4, x5, x6, x7, y0;
46
Mjoin(PATL,cpsc)(M, *X, A, lda, Y, 1);
48
Mjoin(PATL,axpby)(M, *X, A, lda, beta, Y, 1);
50
Mjoin(PATL,axpy)(M, *X, A, lda, Y, 1);
59
*Y = x0 * *A + x1 * A[1];
62
*Y = y0 * beta + x0 * *A + x1 * A[1];
64
*Y += x0 * *A + x1 * A[1];
77
*Y = x0 * *A + x1 * A[1] + x2 * A[2];
80
*Y = y0 * beta + x0 * *A + x1 * A[1] + x2 * A[2];
82
*Y += x0 * *A + x1 * A[1] + x2 * A[2];
96
*Y = x0 * *A + x1 * A[1] + x2 * A[2] + x3 * A[3];
99
*Y = y0 * beta + x0 * *A + x1 * A[1] + x2 * A[2] + x3 * A[3];
101
*Y += x0 * *A + x1 * A[1] + x2 * A[2] + x3 * A[3];
116
*Y = x0 * *A + x1 * A[1] + x2 * A[2] + x3 * A[3]
120
*Y = y0 * beta + x0 * *A + x1 * A[1] + x2 * A[2] + x3 * A[3]
123
*Y += x0 * *A + x1 * A[1] + x2 * A[2] + x3 * A[3] + x4 * A[4];
139
*Y = x0 * *A + x1 * A[1] + x2 * A[2] + x3 * A[3]
140
+ x4 * A[4] + x5 * A[5];
143
*Y = y0 * beta + x0 * *A + x1 * A[1] + x2 * A[2] + x3 * A[3]
144
+ x4 * A[4] + x5 * A[5];
146
*Y += x0 * *A + x1 * A[1] + x2 * A[2] + x3 * A[3]
147
+ x4 * A[4] + x5 * A[5];
164
*Y = x0 * *A + x1 * A[1] + x2 * A[2] + x3 * A[3]
165
+ x4 * A[4] + x5 * A[5] + x6 * A[6];
168
*Y = y0 * beta + x0 * *A + x1 * A[1] + x2 * A[2] + x3 * A[3]
169
+ x4 * A[4] + x5 * A[5] + x6 * A[6];
171
*Y += x0 * *A + x1 * A[1] + x2 * A[2] + x3 * A[3]
172
+ x4 * A[4] + x5 * A[5] + x6 * A[6];
190
*Y = x0 * *A + x1 * A[1] + x2 * A[2] + x3 * A[3]
191
+ x4 * A[4] + x5 * A[5] + x6 * A[6] + x7 * A[7];
194
*Y = y0 * beta + x0 * *A + x1 * A[1] + x2 * A[2] + x3 * A[3]
195
+ x4 * A[4] + x5 * A[5] + x6 * A[6] + x7 * A[7];
197
*Y += x0 * *A + x1 * A[1] + x2 * A[2] + x3 * A[3]
198
+ x4 * A[4] + x5 * A[5] + x6 * A[6] + x7 * A[7];
216
for (i=0; i != N; i++) y0 += A[i] * X[i];
225
static void gemvT_Msmall(const int M, const int N, const TYPE *A, const int lda,
226
const TYPE *X, const SCALAR beta, TYPE *Y)
228
const TYPE *stY = Y + M;
236
*Y = Mjoin(PATL,dot)(N, X, 1, A, 1);
242
y0 += Mjoin(PATL,dot)(N, X, 1, A, 1);
250
#define gemv0 Mjoin(Mjoin(Mjoin(Mjoin(gemvT,NM),_x1),BNM),_y1)
251
void Mjoin(PATL,gemv0)
252
(const int M, const int N, const SCALAR alpha,
253
const TYPE *A, const int lda, const TYPE *X, const int incX,
254
const SCALAR beta, TYPE *Y, const int incY)
257
const int M2 = (M>>1)<<1, n4 = (N>>2)<<2;
258
const int nr = N-n4, incAm = (lda<<1) - n4;
259
const TYPE *stX = X + n4 - 4, *x, *A0 = A, *A1 = A + lda;
261
register TYPE y0, y1, yy0, yy1, x0, x1, x2, x3, m0, m1, m2, m3;
262
register TYPE a00, a10, a20, a30, a01, a11, a21, a31;
272
y0 = y1 = yy0 = yy1 = ATL_rzero;
275
yy0 = *Y * y0; yy1 = Y[1] * y0;
278
yy0 = *Y; yy1 = Y[1];
281
a00 = *A0; a10 = A0[1]; a20 = A0[2]; a30 = A0[3]; A0 += 4;
282
a01 = *A1; a11 = A1[1]; a21 = A1[2]; a31 = A1[3]; A1 += 4;
283
x0 = *X; x1 = X[1]; x2 = X[2]; x3 = X[3];
284
m0 = a00 * x0; a00 = *A0;
285
m1 = a01 * x0; a01 = *A1; x0 = *x;
286
m2 = a10 * x1; a10 = A0[1];
287
m3 = a11 * x1; a11 = A1[1]; x1 = x[1];
293
y0 += m0; m0 = a20 * x2; a20 = A0[2];
294
y1 += m1; m1 = a21 * x2; a21 = A1[2]; x2 = x[2];
295
yy0 += m2; m2 = a30 * x3; a30 = A0[3]; A0 += 4;
296
yy1 += m3; m3 = a31 * x3; a31 = A1[3]; A1 += 4;
298
y0 += m0; m0 = a00 * x0; a00 = *A0;
299
y1 += m1; m1 = a01 * x0; a01 = *A1; x0 = *x;
300
yy0 += m2; m2 = a10 * x1; a10 = A0[1];
301
yy1 += m3; m3 = a11 * x1; a11 = A1[1]; x1 = x[1];
306
* Drain pipes, then do cleanup
308
y0 += m0; m0 = a20 * x2; a20 = A0[2];
309
y1 += m1; m1 = a21 * x2; a21 = A1[2]; x2 = x[2];
310
yy0 += m2; m2 = a30 * x3; a30 = A0[3]; A0 += 4;
311
yy1 += m3; m3 = a31 * x3; a31 = A1[3]; A1 += 4; x3 = x[3]; x += 4;
312
y0 += m0; m0 = a00 * x0;
313
y1 += m1; m1 = a01 * x0;
314
yy0 += m2; m2 = a10 * x1;
315
yy1 += m3; m3 = a11 * x1;
317
y0 += m0; m0 = a20 * x2;
318
y1 += m1; m1 = a21 * x2;
319
yy0 += m2; m2 = a30 * x3;
320
yy1 += m3; m3 = a31 * x3;
367
*Y = Mjoin(PATL,dot)(N, X, 1, A0, 1);
373
y0 += Mjoin(PATL,dot)(N, X, 1, A0, 1);
378
else if (M) gemvT_Nsmall(M, N, A, lda, X, beta, Y);