2
* Automatically Tuned Linear Algebra Software v3.8.4
3
* (C) Copyright 1999 R. Clint Whaley
5
* Redistribution and use in source and binary forms, with or without
6
* modification, are permitted provided that the following conditions
8
* 1. Redistributions of source code must retain the above copyright
9
* notice, this list of conditions and the following disclaimer.
10
* 2. Redistributions in binary form must reproduce the above copyright
11
* notice, this list of conditions, and the following disclaimer in the
12
* documentation and/or other materials provided with the distribution.
13
* 3. The name of the ATLAS group or the names of its contributers may
14
* not be used to endorse or promote products derived from this
15
* software without specific written permission.
17
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
19
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
20
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
21
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27
* POSSIBILITY OF SUCH DAMAGE.
31
#include "atlas_misc.h"
32
#include "atlas_lvl2.h"
33
#include "atlas_prefetch.h"
35
/*
 * ger_Mle8: file-local helper that accumulates products of a Y element (y0)
 * and X elements (x0..x7) into A.
 *
 * X is taken without a stride argument, Y is walked with stride incY, and A
 * is advanced by lda between updates.
 *
 * NOTE(review): only scattered statements of this routine are visible here
 * (the braces, the control flow selecting each case, and the loads of y0 and
 * x0..x7 are not shown), so the comments below describe the visible
 * statements only.  The name suggests the routine serves M <= 8 -- confirm
 * against the complete source.
 */
static void ger_Mle8(const int M, const int N, const TYPE *X,
36
const TYPE *Y, const int incY, TYPE *A, const int lda)
38
/* Address N strides of incY past the incoming Y; presumably the loop sentinel. */
const TYPE *stY = Y + incY*N;
39
/* Register temporaries: one Y value and up to eight X values. */
register TYPE y0, x0, x1, x2, x3, x4, x5, x6, x7;
59
/* Add y0*x0 to the element A points at, then step Y by one stride. */
*A += y0 * x0; Y += incY;
73
/* Fragment whose last visible update is A[2] (likely the M == 3 case -- confirm). */
A[1] += y0 * x1; Y += incY;
74
/* After the final update of this group, move A forward by lda elements. */
A[2] += y0 * x2; A += lda;
87
/* Fragment whose last visible update is A[3] (likely the M == 4 case -- confirm). */
A[1] += y0 * x1; Y += incY;
89
A[3] += y0 * x3; A += lda;
103
/* Fragment whose last visible update is A[4] (likely the M == 5 case -- confirm). */
A[1] += y0 * x1; Y += incY;
106
A[4] += y0 * x4; A += lda;
121
/* Fragment whose last visible update is A[5] (likely the M == 6 case -- confirm). */
A[1] += y0 * x1; Y += incY;
125
A[5] += y0 * x5; A += lda;
142
/* Fragment whose last visible update is A[6] (likely the M == 7 case -- confirm). */
A[2] += y0 * x2; Y += incY;
146
A[6] += y0 * x6; A += lda;
164
/* Fragment whose last visible update is A[7] (likely the M == 8 case -- confirm). */
A[2] += y0 * x2; Y += incY;
169
A[7] += y0 * x7; A += lda;
177
/*
 * ger_Nle4: file-local helper that updates up to four lda-separated slices
 * of A (A0..A3) using up to four Y values (y0..y3) and the M elements of X.
 *
 * X is indexed with unit stride (X[i]); Y is indexed at multiples of incY.
 *
 * NOTE(review): only scattered statements are visible here (the braces, the
 * control flow selecting each case, and most loop bodies are not shown).
 * The name suggests the routine serves N <= 4 -- confirm against the
 * complete source.
 */
static void ger_Nle4(const int M, const int N, const TYPE *X,
178
const TYPE *Y, const int incY, TYPE *A, const int lda)
180
register TYPE y0, y1, y2, y3, x0;
181
/* A0..A3 point at A, A+lda, A+2*lda and A+3*lda respectively. */
TYPE *A0 = A, *A1 = A+lda, *A2 = A1+lda, *A3 = A2+lda;
188
/* Single-Y case: A0[i] += y0 * X[i] for i in [0, M). */
for (i=0; i != M; i++) A0[i] += y0 * X[i];
191
/* Two-Y case: load Y at offsets 0 and incY. */
y0 = *Y; y1 = Y[incY];
192
for (i=0; i != M; i++)
200
/*
 * Three-Y case: load Y at offsets 0, incY and 2*incY.
 * NOTE(review): incY<<1 is undefined behavior in C when incY is negative;
 * the four-Y case below spells the same offset as incY+incY, which is
 * well defined.  Consider unifying on the additive form.
 */
y0 = *Y; y1 = Y[incY]; y2 = Y[incY<<1];
201
for (i=0; i != M; i++)
210
/*
 * Four-Y case: load Y at offsets 0, incY, 2*incY and 3*incY.
 * NOTE(review): (incY<<1)+incY has the same negative-shift concern as above.
 */
y0 = *Y; y1 = Y[incY]; y2 = Y[incY+incY]; y3 = Y[(incY<<1)+incY];
211
for (i=0; i != M; i++)
223
/*
 * ger1_a1_x1_yX (name built with Mjoin(PATL, ...)): entry point that
 * dispatches the update of A either to the axpy routine or to the
 * file-local ger_Mle8 helper.
 *
 * NOTE(review): the body is only partially visible here; the conditions
 * guarding each branch and any loop around the axpy call are not shown.
 * The name suggests alpha == 1, unit-stride X and arbitrary incY, and the
 * visible statements never read alpha or incX -- confirm against the
 * complete source.
 */
void Mjoin(PATL,ger1_a1_x1_yX)
224
(const int M, const int N, const SCALAR alpha, const TYPE *X, const int incX,
225
const TYPE *Y, const int incY, TYPE *A, const int lda)
227
/* Address N strides of incY past the incoming Y; presumably the loop sentinel. */
const TYPE *stY = Y + N*incY;
232
/* A[0..M) += (*Y) * X[0..M): axpy with the current Y element as the scalar, unit strides. */
Mjoin(PATL,axpy)(M, *Y, X, 1, A, 1);
238
/* Alternative branch: hand the whole update to ger_Mle8. */
else ger_Mle8(M, N, X, Y, incY, A, lda);