 * Automatically Tuned Linear Algebra Software v3.8.4
 * (C) Copyright 1999 R. Clint Whaley
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * 1. Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions, and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 * 3. The name of the ATLAS group or the names of its contributers may
 * not be used to endorse or promote products derived from this
 * software without specific written permission.
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
31
#include "atlas_misc.h"
32
#include "atlas_level1.h"
33
#include "atlas_level2.h"
34
#include "atlas_lvl2.h"
37
/*
 * Forward declarations for the two cleanup kernels used below:
 *   gemvMlt8 - handles remaining rows when M (or a leftover M-M16) is small,
 *              via dot products down the rows (see its definition below);
 *   gemvNle4 - handles N <= 4 columns via explicit rank-1..rank-4 updates.
 * TYPE/SCALAR are ATLAS build-time macros (real/complex, single/double).
 */
static void gemvMlt8(const int M, const int N, const TYPE *A, const int lda,
38
const TYPE *X, const SCALAR beta, TYPE *Y);
39
static void gemvNle4(const int M, const int N, const TYPE *A, const int lda,
40
const TYPE *X, const SCALAR beta, TYPE *Y);
42
/*
 * Yget(y_, yp_, bet_): load the (beta-scaled) current Y element into an
 * accumulator register.  ATLAS compiles this file once per beta case and
 * selects the variant with a build-defined macro:
 *   BETA0 : beta == 0 -> the old Y value is ignored entirely
 *   BETAX : general beta -> load and scale by beta
 *   BETA1 (default case): beta == 1 -> plain load, no multiply
 * NOTE(review): the extracted source defined all three variants
 * unconditionally (a macro-redefinition constraint violation); the #if
 * guards below restore the conventional ATLAS BETA0/BETAX/BETA1 selection
 * in the same order the definitions appear -- confirm against the
 * unextracted original file.
 */
#ifdef BETA0
   #define Yget(y_, yp_, bet_) (y_) = ATL_rzero
#elif defined(BETAX)
   #define Yget(y_, yp_, bet_) (y_) = (yp_) * (bet_)
#else
   #define Yget(y_, yp_, bet_) (y_) = (yp_)
#endif
48
/*
 * gemvN32x4: Y += A*x for a 4-column panel, NoTrans, with the M-loop
 * unrolled 32 rows at a time ("rank-4 daxpy based NoTrans gemv" per the
 * surviving comment fragment).  Two register sets (y0..y7 and z0..z7)
 * ping-pong over 8-element strips so loads of the next strip overlap the
 * stores of the previous one; Yget() folds beta into the initial load of
 * each Y strip.  Rows beyond the last multiple of 16 are delegated to
 * gemvMlt8 at the end.
 *
 * NOTE(review): this block is a lossy extraction -- the interleaved bare
 * numbers are original source-line numbers, and the gaps between them
 * show that the function braces, the driving loop over Y strips
 * (presumably up to stY), and several accumulation statements were lost.
 * Tokens below are preserved byte-for-byte; do not treat the visible
 * statement sequence as complete.
 */
static void gemvN32x4(const int M, const int N, const TYPE *A, const int lda,
49
const TYPE *x, const SCALAR beta0, TYPE *y)
51
* rank-4 daxpy based NoTrans gemv
54
const int M16 = (M>>4)<<4;
55
TYPE *stY = y + M16 - 32;
56
const TYPE *A0 = A, *A1 = A+lda, *A2 = A1 + lda, *A3 = A2 + lda;
57
register TYPE z0, z1, z2, z3, z4, z5, z6, z7;
58
register TYPE y0, y1, y2, y3, y4, y5, y6, y7;
59
const register TYPE x0 = *x, x1 = x[1], x2 = x[2], x3 = x[3];
61
const register TYPE beta = beta0;
70
/* beta handling: zero the accumulators (BETA0 case), or load Y and
 * scale by beta -- presumably these alternatives were under #if guards
 * that the extraction dropped; confirm against the original file. */
y0 = y1 = y2 = y3 = y4 = y5 = y6 = y7 = ATL_rzero;
72
y0 = *y; y1 = y[1]; y2 = y[2]; y3 = y[3];
73
y4 = y[4]; y5 = y[5]; y6 = y[6]; y7 = y[7];
75
y0 *= beta; y1 *= beta; y2 *= beta; y3 *= beta;
76
y4 *= beta; y5 *= beta; y6 *= beta; y7 *= beta;
79
/* software-pipelined prologue: accumulate into y0..y7 while pre-loading
 * the next strip's Y values into z0..z7 via Yget */
y0 += x0 * *A0; Yget(z0, y[8], beta);
81
y2 += x2 * A2[2]; Yget(z1, y[9], beta);
83
y4 += x0 * A0[4]; Yget(z2, y[10], beta);
85
y6 += x2 * A2[6]; Yget(z3, y[11], beta);
88
y0 += x1 * *A1; Yget(z4, y[12], beta);
90
y2 += x3 * A3[2]; Yget(z5, y[13], beta);
92
y4 += x1 * A1[4]; Yget(z6, y[14], beta);
94
y6 += x3 * A3[6]; Yget(z7, y[15], beta);
115
/* accumulate into z strip while storing the finished y strip */
z0 += x0 * A0[8]; *y = y0;
117
z2 += x2 * A2[10]; y[1] = y1;
119
z4 += x0 * A0[12]; y[2] = y2;
121
z6 += x2 * A2[14]; y[3] = y3;
124
z0 += x1 * A1[8]; y[4] = y4;
126
z2 += x3 * A3[10]; y[5] = y5;
128
z4 += x1 * A1[12]; y[6] = y6;
130
z6 += x3 * A3[14]; y[7] = y7;
133
z0 += x2 * A2[8]; Yget(y0, y[16], beta);
135
z2 += x0 * A0[10]; Yget(y1, y[17], beta);
137
z4 += x2 * A2[12]; Yget(y2, y[18], beta);
139
z6 += x0 * A0[14]; Yget(y3, y[19], beta);
142
z0 += x3 * A3[8]; Yget(y4, y[20], beta);
144
z2 += x1 * A1[10]; Yget(y5, y[21], beta);
146
z4 += x3 * A3[12]; Yget(y6, y[22], beta); A3 += 16;
147
z5 += x0 * A0[13]; A0 += 16;
148
z6 += x1 * A1[14]; Yget(y7, y[23], beta); A1 += 16;
149
z7 += x2 * A2[15]; A2 += 16;
154
/* steady-state body (the enclosing loop header over stY was lost in
 * extraction): roles of y and z strips alternate each half-iteration */
y0 += x0 * *A0; y[8] = z0;
156
y2 += x2 * A2[2]; y[9] = z1;
158
y4 += x0 * A0[4]; y[10] = z2;
160
y6 += x2 * A2[6]; y[11] = z3;
163
y0 += x1 * *A1; y[12] = z4;
165
y2 += x3 * A3[2]; y[13] = z5;
167
y4 += x1 * A1[4]; y[14] = z6;
169
y6 += x3 * A3[6]; y[15] = z7; y += 16;
172
y0 += x2 * *A2; Yget(z0, y[8], beta);
174
y2 += x0 * A0[2]; Yget(z1, y[9], beta);
176
y4 += x2 * A2[4]; Yget(z2, y[10], beta);
178
y6 += x0 * A0[6]; Yget(z3, y[11], beta);
181
y0 += x3 * *A3; Yget(z4, y[12], beta);
183
y2 += x1 * A1[2]; Yget(z5, y[13], beta);
185
y4 += x3 * A3[4]; Yget(z6, y[14], beta);
187
y6 += x1 * A1[6]; Yget(z7, y[15], beta);
190
z0 += x0 * A0[8]; *y = y0;
192
z2 += x2 * A2[10]; y[1] = y1;
194
z4 += x0 * A0[12]; y[2] = y2;
196
z6 += x2 * A2[14]; y[3] = y3;
199
z0 += x1 * A1[8]; y[4] = y4;
201
z2 += x3 * A3[10]; y[5] = y5;
203
z4 += x1 * A1[12]; y[6] = y6;
205
z6 += x3 * A3[14]; y[7] = y7;
208
z0 += x2 * A2[8]; Yget(y0, y[16], beta);
210
z2 += x0 * A0[10]; Yget(y1, y[17], beta);
212
z4 += x2 * A2[12]; Yget(y2, y[18], beta);
214
z6 += x0 * A0[14]; Yget(y3, y[19], beta);
217
z0 += x3 * A3[8]; Yget(y4, y[20], beta);
219
z2 += x1 * A1[10]; Yget(y5, y[21], beta);
221
z4 += x3 * A3[12]; Yget(y6, y[22], beta); A3 += 16;
222
z5 += x0 * A0[13]; A0 += 16;
223
z6 += x1 * A1[14]; Yget(y7, y[23], beta); A1 += 16;
224
z7 += x2 * A2[15]; A2 += 16;
228
/* drain / epilogue of the pipeline */
y0 += x0 * *A0; y[8] = z0;
230
y2 += x2 * A2[2]; y[9] = z1;
232
y4 += x0 * A0[4]; y[10] = z2;
234
y6 += x2 * A2[6]; y[11] = z3;
237
y0 += x1 * *A1; y[12] = z4;
239
y2 += x3 * A3[2]; y[13] = z5;
241
y4 += x1 * A1[4]; y[14] = z6;
243
y6 += x3 * A3[6]; y[15] = z7; y += 16;
246
y0 += x2 * *A2; Yget(z0, y[8], beta);
248
y2 += x0 * A0[2]; Yget(z1, y[9], beta);
250
y4 += x2 * A2[4]; Yget(z2, y[10], beta);
252
y6 += x0 * A0[6]; Yget(z3, y[11], beta);
255
y0 += x3 * *A3; Yget(z4, y[12], beta);
257
y2 += x1 * A1[2]; Yget(z5, y[13], beta);
259
y4 += x3 * A3[4]; Yget(z6, y[14], beta);
261
y6 += x1 * A1[6]; Yget(z7, y[15], beta);
264
z0 += x0 * A0[8]; *y = y0;
266
z2 += x2 * A2[10]; y[1] = y1;
268
z4 += x0 * A0[12]; y[2] = y2;
270
z6 += x2 * A2[14]; y[3] = y3;
273
z0 += x1 * A1[8]; y[4] = y4;
275
z2 += x3 * A3[10]; y[5] = y5;
277
z4 += x1 * A1[12]; y[6] = y6;
279
z6 += x3 * A3[14]; y[7] = y7;
307
/* cleanup: leftover rows (M not a multiple of 16) go to gemvMlt8 */
if (M-M16) gemvMlt8(M-M16, N, A0+16, lda, x, beta, y+16);
309
else if (N) gemvMlt8(M, N, A, lda, x, beta, y);
312
/*
 * gemv32x4: driver that sweeps N columns in panels of 4 using gemvN32x4.
 * Visible logic: after the first panel applies the caller's beta, the
 * remaining panels accumulate with beta == ATL_rone (Y already holds
 * partial results).  Column remainder (N mod 4) falls to gemvNle4; one
 * surviving branch also forwards a tail to the generic
 * PATL gemvN_a1_x1_b1_y1 kernel, and small cases go to gemvMlt8/gemvNle4.
 * NOTE(review): the extraction dropped the function braces, the if/else
 * structure around these alternatives, and the declaration of j --
 * the branches below are mutually exclusive in the original; confirm
 * against the full file before reasoning about control flow.
 */
static void gemv32x4(const int M, const int N, const TYPE *A, const int lda,
313
const TYPE *X, const SCALAR beta, TYPE *Y)
318
const int incA = lda<<2;   /* 4 columns of stride lda */
325
for (j=(N>>2); j; j--, A += incA, X += 4)
326
gemvN32x4(M, 4, A, lda, X, ATL_rone, Y);
327
if ( (j = N-((N>>2)<<2)) ) gemvNle4(M, j, A, lda, X, ATL_rone, Y);
329
gemvN32x4(M, 4, A, lda, X, beta, Y);
331
Mjoin(PATL,gemvN_a1_x1_b1_y1)
332
(M, N-4, ATL_rone, A+incA, lda, X+4, 1, ATL_rone, Y, 1);
335
else gemvMlt8(M, N, A, lda, X, beta, Y);
337
else if (M) gemvNle4(M, N, A, lda, X, beta, Y);
340
/*
 * gemvMlt8: small-M cleanup -- computes each Y element as a dot product
 * of one row of A (stride lda) with X, via the PATL dot kernel.  The two
 * surviving statements are presumably the beta==0 (overwrite-after-scale)
 * and accumulate variants of the per-row update; the loop over rows, the
 * declaration of y0, and the beta application were lost in extraction --
 * confirm against the original file.
 */
static void gemvMlt8(const int M, const int N, const TYPE *A, const int lda,
341
const TYPE *X, const SCALAR beta, TYPE *Y)
348
y0 = Mjoin(PATL,dot)(N, A, lda, X, 1);
351
y0 += Mjoin(PATL,dot)(N, A, lda, X, 1);
357
/*
 * gemvNle4: NoTrans gemv for N <= 4 columns.  Visible structure suggests
 * a dispatch on N (the switch/case or if chain was lost in extraction):
 *   N==1 -> delegate to PATL cpsc / axpby / axpy (beta==0 / general /
 *           beta==1 variants, presumably selected by the elided guards);
 *   N==2,3,4 -> explicit fused row loops, with three beta variants each
 *           (overwrite, scale-then-add via bet, plain accumulate).
 * The N==4 case first tries gemv32x4 when M >= 32.
 * NOTE(review): per-case braces, the switch header, the declaration of i,
 * and the x0/x1 loads for the N==2 case are missing from this extraction;
 * tokens below are preserved byte-for-byte.
 */
static void gemvNle4(const int M, const int N, const TYPE *A, const int lda,
358
const TYPE *X, const SCALAR beta, TYPE *Y)
361
const TYPE *A0 = A, *A1 = A+lda, *A2 = A1+lda, *A3 = A2+lda;
362
register TYPE x0, x1, x2, x3;
364
const register TYPE bet=beta;
371
/* N == 1: Y = x0*A (beta==0), Y = x0*A + beta*Y, or Y += x0*A */
Mjoin(PATL,cpsc)(M, *X, A, 1, Y, 1);
373
Mjoin(PATL,axpby)(M, *X, A, 1, beta, Y, 1);
375
Mjoin(PATL,axpy)(M, *X, A, 1, Y, 1);
380
/* N == 2 */
for (i=0; i != M; i++)
382
Y[i] = A0[i] * x0 + A1[i] * x1;
384
Y[i] = Y[i]*bet + A0[i] * x0 + A1[i] * x1;
386
Y[i] += A0[i] * x0 + A1[i] * x1;
390
/* N == 3 */
x0 = *X; x1 = X[1]; x2 = X[2];
391
for (i=0; i != M; i++)
393
Y[i] = A0[i] * x0 + A1[i] * x1 + A2[i] * x2;
395
Y[i] = Y[i]*bet + A0[i] * x0 + A1[i] * x1 + A2[i] * x2;
397
Y[i] += A0[i] * x0 + A1[i] * x1 + A2[i] * x2;
401
/* N == 4: prefer the blocked kernel for large M */
if (M >= 32) gemv32x4(M, 4, A, lda, X, beta, Y);
404
x0 = *X; x1 = X[1]; x2 = X[2]; x3 = X[3];
405
for (i=0; i != M; i++)
407
Y[i] = A0[i] * x0 + A1[i] * x1 + A2[i] * x2 + A3[i] * x3;
409
Y[i] = Y[i]*bet + A0[i] * x0 + A1[i] * x1 + A2[i] * x2 + A3[i] * x3;
411
Y[i] += A0[i] * x0 + A1[i] * x1 + A2[i] * x2 + A3[i] * x3;
420
/*
 * gemv16x2: NoTrans gemv processing 16 rows of Y at a time against 2
 * columns of A per inner step ("16x2 with feeble prefetch" per the
 * surviving comment), keeping the 16 partial sums in registers
 * y0..y15 and using p0/p1 as lookahead loads from the next column pair.
 * N2/nr split the columns into pairs plus a 0- or 1-column remainder
 * (the nr branch comments say "2 cols left"/"3 cols left" for the
 * final step).  Row remainder (M mod 16) is cleaned up by gemvMlt8.
 *
 * NOTE(review): lossy extraction -- the braces, the outer loops over
 * rows/columns, most of the unrolled y2..y13 updates, and the line that
 * stages beta before the `y0 *= x0; ...` scaling block were lost (that
 * scaling by x0 presumably uses beta pre-loaded into x0 in an elided
 * line; confirm against the original file).  Tokens are byte-for-byte.
 */
static void gemv16x2(const int M, const int N, const TYPE *A, const int lda,
421
const TYPE *X, const SCALAR beta, TYPE *Y)
423
* 16x2 with feeble prefetch
426
const int M16 = (M>>4)<<4, N2 = (N>>1)<<1, nr = N-N2;
427
const int incA = lda << 1, incAm = 16 - N2*lda;
428
const TYPE *stX = X + N2 - 2, *x;
429
const TYPE *A0 = A, *A1 = A + lda;
431
register TYPE x0, x1;
432
register TYPE y0, y1, y2, y3, y4, y5, y6, y7;
433
register TYPE y8, y9, y10, y11, y12, y13, y14, y15;
434
register TYPE p0, p1;
443
/* beta == 0 variant: start accumulators at zero */
y0 = y1 = y2 = y3 = y4 = y5 = y6 = y7 =
444
y8 = y9 = y10 = y11 = y12 = y13 = y14 = y15 = ATL_rzero;
447
/* general-beta variant: load Y then scale (scale factor staged in x0
 * by an elided line -- see NOTE above) */
y0 = *Y; y1 = Y[1]; y2 = Y[2]; y3 = Y[3];
448
y4 = Y[4]; y5 = Y[5]; y6 = Y[6]; y7 = Y[7];
449
y8 = Y[8]; y9 = Y[9]; y10 = Y[10]; y11 = Y[11];
450
y12 = Y[12]; y13 = Y[13]; y14 = Y[14]; y15 = Y[15];
451
y0 *= x0; y1 *= x0; y2 *= x0; y3 *= x0;
452
y4 *= x0; y5 *= x0; y6 *= x0; y7 *= x0;
453
y8 *= x0; y9 *= x0; y10 *= x0; y11 *= x0;
454
y12 *= x0; y13 *= x0; y14 *= x0; y15 *= x0;
456
/* beta == 1 variant: plain load of the current Y strip */
y0 = *Y; y1 = Y[1]; y2 = Y[2]; y3 = Y[3];
457
y4 = Y[4]; y5 = Y[5]; y6 = Y[6]; y7 = Y[7];
458
y8 = Y[8]; y9 = Y[9]; y10 = Y[10]; y11 = Y[11];
459
y12 = Y[12]; y13 = Y[13]; y14 = Y[14]; y15 = Y[15];
466
/* inner column-pair step: consume x[0],x[1]; p0/p1 prefetch the next
 * column pair's leading elements (most of the 16 updates were elided) */
x0 = *x; x1 = x[1]; x += 2;
467
y0 += x0 * p0; p0 = A0[incA];
468
y1 += x1 * p1; p1 = A1[incA+1];
498
y14 += x1 * A1[14]; A1 += incA;
499
y15 += x0 * A0[15]; A0 += incA;
502
if (!nr) /* 2 cols left */
504
x0 = *x; x1 = x[1]; x += 2;
536
y14 += x1 * A1[14]; A1 += incA;
537
y15 += x0 * A0[15]; A0 += incA;
539
else /* 3 cols left */
541
x0 = *x; x1 = x[1]; x += 2;
542
y0 += x0 * p0; p0 = A0[incA];
573
y14 += x1 * A1[14]; A1 += incA;
574
y15 += x0 * A0[15]; A0 += incA;
616
/* cleanup: rows beyond the last multiple of 16 */
if (M-M16) gemvMlt8(M-M16, N, A0, lda, X, beta, Y);
618
else if (M) gemvNle4(M, N, A, lda, X, beta, Y);
620
/*
 * Public NoTrans GEMV entry point for this compile instance.  The exported
 * name is assembled by Mjoin from PATL (type prefix) plus the NM and BNM
 * suffix macros (alpha/beta case tags such as _a1, _b1) supplied by the
 * ATLAS build system.  Only the delegation to gemv16x2 survives in this
 * extraction; the function braces and any alpha/incX/incY handling were
 * lost (original lines 620-624 are non-contiguous).  NOTE(review): alpha,
 * incX and incY are unused in the visible code -- presumably fixed to
 * 1 by the NM/_x1/_y1 instantiation; confirm against the full file.
 */
void Mjoin(Mjoin(Mjoin(Mjoin(Mjoin(PATL,gemvN),NM),_x1),BNM),_y1)
621
(const int M, const int N, const SCALAR alpha, const TYPE *A, const int lda,
622
const TYPE *X, const int incX, const SCALAR beta, TYPE *Y, const int incY)
624
gemv16x2(M, N, A, lda, X, beta, Y);