32
32
#include "atlas_level3.h"
33
33
#include "atlas_level1.h"
34
34
#include "atlas_lapack.h"
35
#include "atlas_lamch.h"
36
#include Mstr(Mjoin(Mjoin(atlas_,PRE),sysinfo.h))
37
#ifdef ATL_USEPTHREADS
38
#include "atlas_pthreads.h"
39
#include "atlas_tcacheedge.h"
41
#include "atlas_cacheedge.h"
37
46
#define ATL_luMmin 2
47
#define ATL_PCAMin 1500
39
49
#define ATL_luMmin 1
51
#define ATL_PCAMin 256
53
#define ATL_PCAMin 512
49
static int LU2(const int M, TYPE *A, const int lda, int *ipiv)
53
register TYPE t0, t1, scal0, scal1, amax=ATL_rzero;
56
*ipiv = ip = cblas_iamax(M, A, 1);
61
A[ip] = *A; A1[ip] = *A1;
63
scal0 = ATL_rone / t0; scal1 = -t1;
64
for (i=1; i != M; i++)
66
t0 = A[i]; t1 = A1[i];
69
A[i] = t0; A1[i] = t1;
70
if (t1 < ATL_rzero) t1 = -t1;
71
if (t1 > amax) { amax = t1; imax = i; }
73
if (amax != ATL_rzero)
76
t0 = A[imax]; t1 = A1[imax];
77
A[imax] = A[1]; A1[imax] = A1[1];
78
A[1] = t0; A1[1] = t1;
79
cblas_scal(M-2, ATL_rone/t1, A1+2, 1);
83
if (imax != -1) ipiv[1] = imax;
92
static int LU2(const int M, TYPE *A, const int lda, int *ipiv)
96
register TYPE t0, t1, scal0, scal1, amax=ATL_rzero;
99
*ipiv = ip = cblas_iamax(M, A, 1);
104
A[ip] = *A; A1[ip] = *A1;
106
scal0 = ATL_rone / t0; scal1 = -t1;
107
for (i=1; i != M; i++)
109
t0 = A[i]; t1 = A1[i];
112
A[i] = t0; A1[i] = t1;
113
if (t1 < ATL_rzero) t1 = -t1;
114
if (t1 > amax) { amax = t1; imax = i; }
116
if (amax != ATL_rzero)
119
t0 = A[imax]; t1 = A1[imax];
120
A[imax] = A[1]; A1[imax] = A1[1];
121
A[1] = t0; A1[1] = t1;
122
cblas_scal(M-2, ATL_rone/t1, A1+2, 1);
63
static int LU2(const int M, TYPE *A, const int lda, int *ipiv)
65
* factors 2 cols of LU, applies all updated to 2nd call during ininitial
71
register TYPE t0, t1, scal0, scal1, amax=ATL_rzero;
74
*ipiv = ip = cblas_iamax(M, A, 1);
78
if (Mabs(t0) >= ATL_laSAFMIN)
81
A[ip] = *A; A1[ip] = *A1;
83
scal0 = ATL_rone / t0; scal1 = -t1;
84
for (i=1; i != M; i++)
86
t0 = A[i]; t1 = A1[i];
89
A[i] = t0; A1[i] = t1;
90
if (t1 < ATL_rzero) t1 = -t1;
91
if (t1 > amax) { amax = t1; imax = i; }
94
else /* can't safely invert pivot, so must actually divide */
97
A[ip] = *A; A1[ip] = *A1;
99
scal0 = t0; scal1 = -t1;
100
for (i=1; i != M; i++)
102
t0 = A[i]; t1 = A1[i];
105
A[i] = t0; A1[i] = t1;
106
if (t1 < ATL_rzero) t1 = -t1;
107
if (t1 > amax) { amax = t1; imax = i; }
110
if (amax != ATL_rzero)
113
t0 = A[imax]; t1 = A1[imax];
114
A[imax] = A[1]; A1[imax] = A1[1];
115
A[1] = t0; A1[1] = t1;
116
if (Mabs(t1) >= ATL_laSAFMIN)
117
cblas_scal(M-2, ATL_rone/t1, A1+2, 1);
119
for (i=2; i < M; i++)
153
static int LU2(const int M, TYPE *A, const int lda, int *ipiv)
157
register TYPE t0, t1, scal0, scal1, amax=ATL_rzero;
160
*ipiv = ip = cblas_iamax(M, A, 1);
165
A[ip] = *A; A1[ip] = *A1;
167
scal0 = ATL_rone / t0; scal1 = -t1;
172
scal0 = ATL_rone; scal1 = -(*A1);
174
for (i=1; i != M; i++)
176
t0 = A[i]; t1 = A1[i];
179
A[i] = t0; A1[i] = t1;
180
if (t1 < ATL_rzero) t1 = -t1;
181
if (t1 > amax) { amax = t1; imax = i; }
183
if (amax != ATL_rzero)
186
t0 = A[imax]; t1 = A1[imax];
187
A[imax] = A[1]; A1[imax] = A1[1];
188
A[1] = t0; A1[1] = t1;
189
cblas_scal(M-2, ATL_rone/t1, A1+2, 1);
193
if (imax != -1) ipiv[1] = imax;
200
154
#define MySwap(N_, A_, lda_, ip_) \
202
156
TYPE t0_, t1_, t2_, t3_; \
314
268
const int MN = Mmin(M, N);
315
int Nleft, Nright, i, ierr=0;
269
int Nleft, Nright, k, i, ierr=0;
317
271
const TYPE one[2] = {ATL_rone, ATL_rzero};
318
272
const TYPE none[2] = {ATL_rnone, ATL_rzero};
281
if (((size_t)M)*N <= ATL_L1elts)
282
return(Mjoin(PATL,getf2)(M, N, A, lda, ipiv));
283
#if defined(ATL_USEPTHREADS) && defined(ATL_USEPCA)
284
if (N <= (NB<<2) && N >= 16 && M-N >= ATL_PCAMin &&
285
((size_t)ATL_MulBySize(M)*N) <= CacheEdge*ATL_NTHREADS)
288
ierr = Mjoin(PATL,tgetf2)(M, N, A, lda, ipiv);
290
ierr = Mjoin(PATL,tgetf2_nocp)(M, N, A, lda, ipiv);
327
294
if (MN > ATL_luMmin)
383
354
if (tmp[0] != ATL_rzero || tmp[1] != ATL_rzero)
385
ATL_CplxInv(tmp, inv);
386
cblas_scal(M, inv, A, 1);
356
if (ATL_lapy2(tmp[0], tmp[1]) >= ATL_laSAFMIN)
358
ATL_CplxInv(tmp, inv);
359
cblas_scal(M, inv, A, 1);
362
Mjoin(PATL,cplxdivide)(M, tmp, A, 1, A, 1);