1
#include "atlas_misc.h"
2
#include "atlas_lvl2.h"
4
#include "atlas_reflevel2.h"
8
(const int M, const int N, const SCALAR alpha, const TYPE *X, const int incX,
9
const TYPE *Y, const int incY, TYPE *A, const int lda);
13
static void ATL_gerk_Meq1
14
(const int M, const int N, const SCALAR alpha, const TYPE *X, const int incX,
15
const TYPE *Y, const int incY, TYPE *A, const int lda)
20
if (alpha == ATL_rone) goto ALPHA1;
21
else if (alpha != ATL_rnone) goto ALPHAX;
25
for (j=0; j < N; j++, A += lda)
40
static void ATL_gerk_Meq2
41
(const int M, const int N, const SCALAR alpha, const TYPE *X, const int incX,
42
const TYPE *Y, const int incY, TYPE *A, const int lda)
45
register TYPE y0, x0, x1;
47
if (alpha == ATL_rone) goto ALPHA1;
48
else if (alpha != ATL_rnone) goto ALPHAX;
53
for (j=0; j < N; j++, A += lda)
72
static void ATL_gerk_Meq3
73
(const int M, const int N, const SCALAR alpha, const TYPE *X, const int incX,
74
const TYPE *Y, const int incY, TYPE *A, const int lda)
77
register TYPE y0, x0, x1, x2;
79
if (alpha == ATL_rone) goto ALPHA1;
80
else if (alpha != ATL_rnone) goto ALPHAX;
86
for (j=0; j < N; j++, A += lda)
109
static void ATL_gerk_Meq4
110
(const int M, const int N, const SCALAR alpha, const TYPE *X, const int incX,
111
const TYPE *Y, const int incY, TYPE *A, const int lda)
114
register TYPE y0, x0, x1, x2, x3;
116
if (alpha == ATL_rone) goto ALPHA1;
117
else if (alpha != ATL_rnone) goto ALPHAX;
124
for (j=0; j < N; j++, A += lda)
151
static void ATL_gerk_Meq5
152
(const int M, const int N, const SCALAR alpha, const TYPE *X, const int incX,
153
const TYPE *Y, const int incY, TYPE *A, const int lda)
156
register TYPE y0, x0, x1, x2, x3, x4;
158
if (alpha == ATL_rone) goto ALPHA1;
159
else if (alpha != ATL_rnone) goto ALPHAX;
167
for (j=0; j < N; j++, A += lda)
198
static void ATL_gerk_Meq6
199
(const int M, const int N, const SCALAR alpha, const TYPE *X, const int incX,
200
const TYPE *Y, const int incY, TYPE *A, const int lda)
203
register TYPE y0, x0, x1, x2, x3, x4, x5;
205
if (alpha == ATL_rone) goto ALPHA1;
206
else if (alpha != ATL_rnone) goto ALPHAX;
215
for (j=0; j < N; j++, A += lda)
250
static void ATL_gerk_Meq7
251
(const int M, const int N, const SCALAR alpha, const TYPE *X, const int incX,
252
const TYPE *Y, const int incY, TYPE *A, const int lda)
255
register TYPE y0, x0, x1, x2, x3, x4, x5, x6;
257
if (alpha == ATL_rone) goto ALPHA1;
258
else if (alpha != ATL_rnone) goto ALPHAX;
268
for (j=0; j < N; j++, A += lda)
307
static void ATL_gerk_Meq8
308
(const int M, const int N, const SCALAR alpha, const TYPE *X, const int incX,
309
const TYPE *Y, const int incY, TYPE *A, const int lda)
312
register TYPE y0, x0, x1, x2, x3, x4, x5, x6, x7;
314
if (alpha == ATL_rone) goto ALPHA1;
315
else if (alpha != ATL_rnone) goto ALPHAX;
326
for (j=0; j < N; j++, A += lda)
369
static void ATL_gerk_Meq9
370
(const int M, const int N, const SCALAR alpha, const TYPE *X, const int incX,
371
const TYPE *Y, const int incY, TYPE *A, const int lda)
374
register TYPE y0, x0, x1, x2, x3, x4, x5, x6, x7, x8;
376
if (alpha == ATL_rone) goto ALPHA1;
377
else if (alpha != ATL_rnone) goto ALPHAX;
389
for (j=0; j < N; j++, A += lda)
436
static void ATL_gerk_Meq10
437
(const int M, const int N, const SCALAR alpha, const TYPE *X, const int incX,
438
const TYPE *Y, const int incY, TYPE *A, const int lda)
441
register TYPE y0, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9;
443
if (alpha == ATL_rone) goto ALPHA1;
444
else if (alpha != ATL_rnone) goto ALPHAX;
457
for (j=0; j < N; j++, A += lda)
508
static void ATL_gerk_Meq11
509
(const int M, const int N, const SCALAR alpha, const TYPE *X, const int incX,
510
const TYPE *Y, const int incY, TYPE *A, const int lda)
513
register TYPE y0, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10;
515
if (alpha == ATL_rone) goto ALPHA1;
516
else if (alpha != ATL_rnone) goto ALPHAX;
530
for (j=0; j < N; j++, A += lda)
569
x10 = X[10*incX] * y0;
585
static void ATL_gerk_Meq12
586
(const int M, const int N, const SCALAR alpha, const TYPE *X, const int incX,
587
const TYPE *Y, const int incY, TYPE *A, const int lda)
590
register TYPE y0, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11;
592
if (alpha == ATL_rone) goto ALPHA1;
593
else if (alpha != ATL_rnone) goto ALPHAX;
608
for (j=0; j < N; j++, A += lda)
648
x10 = X[10*incX] * y0;
650
x11 = X[11*incX] * y0;
667
static void ATL_gerk_Meq13
668
(const int M, const int N, const SCALAR alpha, const TYPE *X, const int incX,
669
const TYPE *Y, const int incY, TYPE *A, const int lda)
672
register TYPE y0, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12;
674
if (alpha == ATL_rone) goto ALPHA1;
675
else if (alpha != ATL_rnone) goto ALPHAX;
691
for (j=0; j < N; j++, A += lda)
732
x10 = X[10*incX] * y0;
734
x11 = X[11*incX] * y0;
736
x12 = X[12*incX] * y0;
754
static void ATL_gerk_Meq14
755
(const int M, const int N, const SCALAR alpha, const TYPE *X, const int incX,
756
const TYPE *Y, const int incY, TYPE *A, const int lda)
759
register TYPE y0, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12;
762
if (alpha == ATL_rone) goto ALPHA1;
763
else if (alpha != ATL_rnone) goto ALPHAX;
780
for (j=0; j < N; j++, A += lda)
822
x10 = X[10*incX] * y0;
824
x11 = X[11*incX] * y0;
826
x12 = X[12*incX] * y0;
828
x13 = X[13*incX] * y0;
847
static void ATL_gerk_Meq15
848
(const int M, const int N, const SCALAR alpha, const TYPE *X, const int incX,
849
const TYPE *Y, const int incY, TYPE *A, const int lda)
852
register TYPE y0, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12;
853
register TYPE x13, x14;
855
if (alpha == ATL_rone) goto ALPHA1;
856
else if (alpha != ATL_rnone) goto ALPHAX;
874
for (j=0; j < N; j++, A += lda)
917
x10 = X[10*incX] * y0;
919
x11 = X[11*incX] * y0;
921
x12 = X[12*incX] * y0;
923
x13 = X[13*incX] * y0;
925
x14 = X[14*incX] * y0;
946
void Mjoin(PATL,gerk_Mlt16)
947
(const int M, const int N, const SCALAR alpha, const TYPE *X, const int incX,
948
const TYPE *Y, const int incY, TYPE *A, const int lda)
950
* ATLAS's normal GER kernels are optimized for long-M, and loop over rows in
951
* the inner loop. To avoid this killing us on short, wide matrices, have
952
* special case code for M < 16. This also allows kernels to assume M >= 16.
955
static gerk_t gerks[15]={ATL_gerk_Meq1, ATL_gerk_Meq2, ATL_gerk_Meq3,
956
ATL_gerk_Meq4, ATL_gerk_Meq5, ATL_gerk_Meq6,
957
ATL_gerk_Meq7, ATL_gerk_Meq8, ATL_gerk_Meq9,
958
ATL_gerk_Meq10, ATL_gerk_Meq11, ATL_gerk_Meq12,
959
ATL_gerk_Meq13, ATL_gerk_Meq14, ATL_gerk_Meq15};
960
if (M < 1 || N < 1 || SCALAR_IS_ZERO(alpha))
964
#elif defined(ATL_GAS_x8632)
970
Mjoin(PATL,gerk_axpy)(M, N, alpha, X, incX, Y, incY, A, lda);
973
gerks[M-1](M, N, alpha, X, incX, Y, incY, A, lda);
976
#else /* complex type */
978
void Mjoin(PATL,gerck_Mlt16)
980
void Mjoin(PATL,gerk_Mlt16)
982
(const int M, const int N, const SCALAR alpha, const TYPE *X, const int incX,
983
const TYPE *Y, const int incY, TYPE *A, const int lda)
986
* For now, complex simply calls refblas for short M, axpy-based for large.
987
* Probably not worth additional instruction load for complex to unroll.
992
Mjoin(PATL,refgerc)(M, N, alpha, X, incX, Y, incY, A, lda);
994
Mjoin(PATL,refgeru)(M, N, alpha, X, incX, Y, incY, A, lda);
999
Mjoin(PATL,gerck_axpy)(M, N, alpha, X, incX, Y, incY, A, lda);
1001
Mjoin(PATL,gerk_axpy)(M, N, alpha, X, incX, Y, incY, A, lda);