/*
 * Automatically Tuned Linear Algebra Software v3.8.4
 * (C) Copyright 1999 R. Clint Whaley
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions, and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *   3. The name of the ATLAS group or the names of its contributers may
 *      not be used to endorse or promote products derived from this
 *      software without specific written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
#include "atlas_misc.h"
#include "atlas_lvl2.h"
/*
 * ger_Mle8: rank-1 update A += x * y^T specialized for short columns
 * (M <= 8, judging by the name and the A[0]..A[7] updates below).
 * Registers x0..x7 presumably cache X[0..7] -- the loads are not visible
 * in this extract.  Each column update scales the cached x values by one
 * y element (stride incY) and then steps A forward by lda.
 *
 * NOTE(review): this extract is incomplete -- the embedded original line
 * numbers jump (38 -> 58, 58 -> 72, ...), so the braces, the dispatch on M,
 * the column loops bounded by stY, and the x/y register loads are missing
 * from this view.  All code lines below are preserved byte-identical,
 * including the stray line-number tokens left by the extraction.
 */
static void ger_Mle8(const int M, const int N, const TYPE *X,
35
const TYPE *Y, const int incY, TYPE *A, const int lda)
37
/* one past the last of the N y-elements; presumably the column-loop bound */
const TYPE *stY = Y + incY*N;
38
register TYPE y0, x0, x1, x2, x3, x4, x5, x6, x7;
58
/* Fragments of the per-M unrolled column updates follow (apparently one
   unrolled body per value of M from 1 to 8). */
*A += y0 * x0; Y += incY;
72
A[1] += y0 * x1; Y += incY;
73
A[2] += y0 * x2; A += lda;
86
A[1] += y0 * x1; Y += incY;
88
A[3] += y0 * x3; A += lda;
102
A[1] += y0 * x1; Y += incY;
105
A[4] += y0 * x4; A += lda;
120
A[1] += y0 * x1; Y += incY;
124
A[5] += y0 * x5; A += lda;
141
A[2] += y0 * x2; Y += incY;
145
A[6] += y0 * x6; A += lda;
163
A[2] += y0 * x2; Y += incY;
168
A[7] += y0 * x7; A += lda;
176
/*
 * ger_Nle4: rank-1 update A += x * y^T specialized for narrow panels
 * (N <= 4 columns, judging by the name and the y0..y3/A0..A3 registers).
 * X is indexed with unit stride; the up-to-four y values are gathered into
 * registers and the M rows are streamed once.
 *
 * NOTE(review): this extract is incomplete -- the embedded original line
 * numbers jump (180 -> 187, 191 -> 199, ...), so the branch selecting on N,
 * the fused row-loop bodies, and the N == 1 y0 load are missing from this
 * view.  All code lines below are preserved byte-identical, including the
 * stray line-number tokens left by the extraction.
 */
static void ger_Nle4(const int M, const int N, const TYPE *X,
177
const TYPE *Y, const int incY, TYPE *A, const int lda)
179
register TYPE y0, y1, y2, y3, x0;
180
/* base pointers for the (up to) four columns of the panel */
TYPE *A0 = A, *A1 = A+lda, *A2 = A1+lda, *A3 = A2+lda;
187
/* apparently the N == 1 branch: single-column axpy-style update */
for (i=0; i != M; i++) A0[i] += y0 * X[i];
190
/* apparently the N == 2 branch: gather y0,y1; loop body not visible */
y0 = *Y; y1 = Y[incY];
191
for (i=0; i != M; i++)
199
/* apparently the N == 3 branch */
y0 = *Y; y1 = Y[incY]; y2 = Y[incY<<1];
200
/* apparently the N == 4 branch: incY+incY and (incY<<1)+incY are
   2*incY and 3*incY respectively */
for (i=0; i != M; i++)
209
y0 = *Y; y1 = Y[incY]; y2 = Y[incY+incY]; y3 = Y[(incY<<1)+incY];
210
for (i=0; i != M; i++)
222
/*
 * Mjoin(PATL,ger1_a1_x1_yX): GER kernel  A += alpha * x * y^T  specialized
 * for alpha == 1 and unit-stride X (incX == 1, per the _a1_x1_yX suffix);
 * Y may have arbitrary stride incY.  Columns are processed four at a time
 * (N4 = N rounded down to a multiple of 4), the row loop is unrolled by 8
 * (M8 = (M-1) rounded down to a multiple of 8), and the multiplies are
 * software-pipelined one element ahead through m0..m3.
 *
 * NOTE(review): this extract is incomplete -- the embedded original line
 * numbers skip (e.g. 233 -> 237, 245 -> 248, 304 -> 306), so braces, the
 * x0 = X[...] loads feeding the pipeline, the mr-remainder drain, and the
 * condition (original line 305) guarding the final if/else are missing from
 * this view.  All code lines below are preserved byte-identical, including
 * the stray line-number tokens left by the extraction.
 */
void Mjoin(PATL,ger1_a1_x1_yX)
223
(const int M, const int N, const SCALAR alpha, const TYPE *X, const int incX,
224
const TYPE *Y, const int incY, TYPE *A, const int lda)
227
/* Y-pointer step for one 4-column panel: 4*incY */
const int incy = incY<<2;
228
/* Y1..Y3 walk columns j+1..j+3 in parallel with Y on column j */
const TYPE *Y1= Y + incY, *Y2 = Y1 + incY, *Y3 = Y2 + incY;
230
TYPE *A0 = A, *A1 = A + lda, *A2 = A1 + lda, *A3 = A2 + lda;
231
/* M8: rows handled by the 8-way unrolled loop; mr: leftover rows minus the
   one element kept in flight by the pipeline */
const int M8 = ((M-1)>>3)<<3, mr = M-M8-1;
232
/* incAn: presumably steps A0..A3 from the last element written in this
   panel to the top of the next 4-column panel -- TODO confirm the +1
   accounts for the final non-incrementing *A0 += m0 store below */
const int N4 = (N>>2)<<2, incAn = (lda<<2) - M + 1;
233
register TYPE m0, m1, m2, m3, x0, y0, y1, y2, y3;
237
/* panel loop: 4 columns per iteration */
for (j=N4; j; j -= 4)
239
y0 = *Y; y1 = *Y1; y2 = *Y2; y3 = *Y3;
241
/* prime the pipeline: first products in flight, Y pointers advanced to
   the next panel (the x0 load is not visible in this extract) */
m0 = y0 * x0; Y += incy;
242
m1 = y1 * x0; Y1 += incy;
243
m2 = y2 * x0; Y2 += incy;
244
m3 = y3 * x0; Y3 += incy;
245
/* 8-way unrolled row loop: each store retires the in-flight product and
   immediately issues the product for the next row */
for (i=M8; i; i -= 8)
248
*A0 += m0; m0 = y0 * x0;
249
*A1 += m1; m1 = y1 * x0;
250
*A2 += m2; m2 = y2 * x0;
251
*A3 += m3; m3 = y3 * x0;
253
A0[1] += m0; m0 = y0 * x0;
254
A1[1] += m1; m1 = y1 * x0;
255
A2[1] += m2; m2 = y2 * x0;
256
A3[1] += m3; m3 = y3 * x0;
258
A0[2] += m0; m0 = y0 * x0;
259
A1[2] += m1; m1 = y1 * x0;
260
A2[2] += m2; m2 = y2 * x0;
261
A3[2] += m3; m3 = y3 * x0;
263
A0[3] += m0; m0 = y0 * x0;
264
A1[3] += m1; m1 = y1 * x0;
265
A2[3] += m2; m2 = y2 * x0;
266
A3[3] += m3; m3 = y3 * x0;
268
A0[4] += m0; m0 = y0 * x0;
269
A1[4] += m1; m1 = y1 * x0;
270
A2[4] += m2; m2 = y2 * x0;
271
A3[4] += m3; m3 = y3 * x0;
273
A0[5] += m0; m0 = y0 * x0;
274
A1[5] += m1; m1 = y1 * x0;
275
A2[5] += m2; m2 = y2 * x0;
276
A3[5] += m3; m3 = y3 * x0;
278
A0[6] += m0; m0 = y0 * x0;
279
A1[6] += m1; m1 = y1 * x0;
280
A2[6] += m2; m2 = y2 * x0;
281
A3[6] += m3; m3 = y3 * x0;
283
/* eighth row of the unroll: also bump the column pointers */
A0[7] += m0; m0 = y0 * x0; A0 += 8;
284
A1[7] += m1; m1 = y1 * x0; A1 += 8;
285
A2[7] += m2; m2 = y2 * x0; A2 += 8;
286
A3[7] += m3; m3 = y3 * x0; A3 += 8;
293
/* remainder rows (presumably an mr-bounded loop, header not visible) */
*A0++ += m0; m0 = y0 * x0;
294
*A1++ += m1; m1 = y1 * x0;
295
*A2++ += m2; m2 = y2 * x0;
296
*A3++ += m3; m3 = y3 * x0;
299
/* drain the pipeline: retire the last in-flight product of each column
   and jump to the next 4-column panel */
*A0 += m0; A0 += incAn;
300
*A1 += m1; A1 += incAn;
301
*A2 += m2; A2 += incAn;
302
*A3 += m3; A3 += incAn;
304
/* cleanup: leftover N-N4 (< 4) columns via ger_Nle4.  NOTE(review): the
   condition this `else` pairs with (original line 305) is not visible;
   presumably an enclosing `if (M > 8)` selects ger_Mle8 for short M --
   TODO confirm against the full source */
if (N-N4) ger_Nle4(M, N-N4, X, Y, incY, A0, lda);
306
else ger_Mle8(M, N, X, Y, incY, A, lda);