3
* -- SuperLU routine (version 3.0) --
4
* Univ. of California Berkeley, Xerox Palo Alto Research Center,
5
* and Lawrence Berkeley National Lab.
10
* File name: ssp_blas2.c
11
* Purpose: Sparse BLAS 2, using some dense BLAS 2 operations.
14
#include "slu_sdefs.h"
19
void susolve(int, int, float*, float*);
20
void slsolve(int, int, float*, float*);
21
void smatvec(int, int, int, float*, float*, float*);
25
sp_strsv(char *uplo, char *trans, char *diag, SuperMatrix *L,
26
SuperMatrix *U, float *x, SuperLUStat_t *stat, int *info)
32
* sp_strsv() solves one of the systems of equations
33
* A*x = b, or A'*x = b,
34
* where b and x are n element vectors and A is a sparse unit , or
35
* non-unit, upper or lower triangular matrix.
36
* No test for singularity or near-singularity is included in this
37
* routine. Such tests must be performed before calling this routine.
42
* uplo - (input) char*
43
* On entry, uplo specifies whether the matrix is an upper or
44
* lower triangular matrix as follows:
45
* uplo = 'U' or 'u' A is an upper triangular matrix.
46
* uplo = 'L' or 'l' A is a lower triangular matrix.
48
* trans - (input) char*
49
* On entry, trans specifies the equations to be solved as
51
* trans = 'N' or 'n' A*x = b.
52
* trans = 'T' or 't' A'*x = b.
53
* trans = 'C' or 'c' A'*x = b.
55
* diag - (input) char*
56
* On entry, diag specifies whether or not A is unit
57
* triangular as follows:
58
* diag = 'U' or 'u' A is assumed to be unit triangular.
59
* diag = 'N' or 'n' A is not assumed to be unit
62
* L - (input) SuperMatrix*
63
* The factor L from the factorization Pr*A*Pc=L*U. Use
64
* compressed row subscripts storage for supernodes,
65
* i.e., L has types: Stype = SC, Dtype = SLU_S, Mtype = TRLU.
67
* U - (input) SuperMatrix*
68
* The factor U from the factorization Pr*A*Pc=L*U.
69
* U has types: Stype = NC, Dtype = SLU_S, Mtype = TRU.
71
* x - (input/output) float*
72
* Before entry, the incremented array X must contain the n
73
* element right-hand side vector b. On exit, X is overwritten
74
* with the solution vector x.
76
* info - (output) int*
77
* If *info = -i, the i-th argument had an illegal value.
81
_fcd ftcs1 = _cptofcd("L", strlen("L")),
82
ftcs2 = _cptofcd("N", strlen("N")),
83
ftcs3 = _cptofcd("U", strlen("U"));
88
int incx = 1, incy = 1;
89
float alpha = 1.0, beta = 1.0;
91
int fsupc, nsupr, nsupc, luptr, istart, irow;
96
/* Test the input parameters */
98
if ( !lsame_(uplo,"L") && !lsame_(uplo, "U") ) *info = -1;
99
else if ( !lsame_(trans, "N") && !lsame_(trans, "T") &&
100
!lsame_(trans, "C")) *info = -2;
101
else if ( !lsame_(diag, "U") && !lsame_(diag, "N") ) *info = -3;
102
else if ( L->nrow != L->ncol || L->nrow < 0 ) *info = -4;
103
else if ( U->nrow != U->ncol || U->nrow < 0 ) *info = -5;
106
xerbla_("sp_strsv", &i);
111
Lval = Lstore->nzval;
113
Uval = Ustore->nzval;
116
if ( !(work = floatCalloc(L->nrow)) )
117
ABORT("Malloc fails for work in sp_strsv().");
119
if ( lsame_(trans, "N") ) { /* Form x := inv(A)*x. */
121
if ( lsame_(uplo, "L") ) {
122
/* Form x := inv(L)*x */
123
if ( L->nrow == 0 ) return 0; /* Quick return */
125
for (k = 0; k <= Lstore->nsuper; k++) {
126
fsupc = L_FST_SUPC(k);
127
istart = L_SUB_START(fsupc);
128
nsupr = L_SUB_START(fsupc+1) - istart;
129
nsupc = L_FST_SUPC(k+1) - fsupc;
130
luptr = L_NZ_START(fsupc);
131
nrow = nsupr - nsupc;
133
solve_ops += nsupc * (nsupc - 1);
134
solve_ops += 2 * nrow * nsupc;
137
for (iptr=istart+1; iptr < L_SUB_START(fsupc+1); ++iptr) {
140
x[irow] -= x[fsupc] * Lval[luptr];
143
#ifdef USE_VENDOR_BLAS
145
STRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
148
SGEMV(ftcs2, &nrow, &nsupc, &alpha, &Lval[luptr+nsupc],
149
&nsupr, &x[fsupc], &incx, &beta, &work[0], &incy);
151
strsv_("L", "N", "U", &nsupc, &Lval[luptr], &nsupr,
154
sgemv_("N", &nrow, &nsupc, &alpha, &Lval[luptr+nsupc],
155
&nsupr, &x[fsupc], &incx, &beta, &work[0], &incy);
158
slsolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc]);
160
smatvec ( nsupr, nsupr-nsupc, nsupc, &Lval[luptr+nsupc],
161
&x[fsupc], &work[0] );
164
iptr = istart + nsupc;
165
for (i = 0; i < nrow; ++i, ++iptr) {
167
x[irow] -= work[i]; /* Scatter */
175
/* Form x := inv(U)*x */
177
if ( U->nrow == 0 ) return 0; /* Quick return */
179
for (k = Lstore->nsuper; k >= 0; k--) {
180
fsupc = L_FST_SUPC(k);
181
nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc);
182
nsupc = L_FST_SUPC(k+1) - fsupc;
183
luptr = L_NZ_START(fsupc);
185
solve_ops += nsupc * (nsupc + 1);
188
x[fsupc] /= Lval[luptr];
189
for (i = U_NZ_START(fsupc); i < U_NZ_START(fsupc+1); ++i) {
191
x[irow] -= x[fsupc] * Uval[i];
194
#ifdef USE_VENDOR_BLAS
196
STRSV(ftcs3, ftcs2, ftcs2, &nsupc, &Lval[luptr], &nsupr,
199
strsv_("U", "N", "N", &nsupc, &Lval[luptr], &nsupr,
203
susolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc] );
206
for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
207
solve_ops += 2*(U_NZ_START(jcol+1) - U_NZ_START(jcol));
208
for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1);
211
x[irow] -= x[jcol] * Uval[i];
218
} else { /* Form x := inv(A')*x */
220
if ( lsame_(uplo, "L") ) {
221
/* Form x := inv(L')*x */
222
if ( L->nrow == 0 ) return 0; /* Quick return */
224
for (k = Lstore->nsuper; k >= 0; --k) {
225
fsupc = L_FST_SUPC(k);
226
istart = L_SUB_START(fsupc);
227
nsupr = L_SUB_START(fsupc+1) - istart;
228
nsupc = L_FST_SUPC(k+1) - fsupc;
229
luptr = L_NZ_START(fsupc);
231
solve_ops += 2 * (nsupr - nsupc) * nsupc;
233
for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
234
iptr = istart + nsupc;
235
for (i = L_NZ_START(jcol) + nsupc;
236
i < L_NZ_START(jcol+1); i++) {
238
x[jcol] -= x[irow] * Lval[i];
244
solve_ops += nsupc * (nsupc - 1);
246
ftcs1 = _cptofcd("L", strlen("L"));
247
ftcs2 = _cptofcd("T", strlen("T"));
248
ftcs3 = _cptofcd("U", strlen("U"));
249
STRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
252
strsv_("L", "T", "U", &nsupc, &Lval[luptr], &nsupr,
258
/* Form x := inv(U')*x */
259
if ( U->nrow == 0 ) return 0; /* Quick return */
261
for (k = 0; k <= Lstore->nsuper; k++) {
262
fsupc = L_FST_SUPC(k);
263
nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc);
264
nsupc = L_FST_SUPC(k+1) - fsupc;
265
luptr = L_NZ_START(fsupc);
267
for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
268
solve_ops += 2*(U_NZ_START(jcol+1) - U_NZ_START(jcol));
269
for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++) {
271
x[jcol] -= x[irow] * Uval[i];
275
solve_ops += nsupc * (nsupc + 1);
278
x[fsupc] /= Lval[luptr];
281
ftcs1 = _cptofcd("U", strlen("U"));
282
ftcs2 = _cptofcd("T", strlen("T"));
283
ftcs3 = _cptofcd("N", strlen("N"));
284
STRSV( ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
287
strsv_("U", "T", "N", &nsupc, &Lval[luptr], &nsupr,
295
stat->ops[SOLVE] += solve_ops;
304
sp_sgemv(char *trans, float alpha, SuperMatrix *A, float *x,
305
int incx, float beta, float *y, int incy)
310
sp_sgemv() performs one of the matrix-vector operations
311
y := alpha*A*x + beta*y, or y := alpha*A'*x + beta*y,
312
where alpha and beta are scalars, x and y are vectors and A is a
313
sparse A->nrow by A->ncol matrix.
318
TRANS - (input) char*
319
On entry, TRANS specifies the operation to be performed as
321
TRANS = 'N' or 'n' y := alpha*A*x + beta*y.
322
TRANS = 'T' or 't' y := alpha*A'*x + beta*y.
323
TRANS = 'C' or 'c' y := alpha*A'*x + beta*y.
325
ALPHA - (input) float
326
On entry, ALPHA specifies the scalar alpha.
328
A - (input) SuperMatrix*
329
Matrix A with a sparse format, of dimension (A->nrow, A->ncol).
330
Currently, the type of A can be:
331
Stype = NC or NCP; Dtype = SLU_S; Mtype = GE.
332
In the future, more general A can be handled.
334
X - (input) float*, array of DIMENSION at least
335
( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n'
337
( 1 + ( m - 1 )*abs( INCX ) ) otherwise.
338
Before entry, the incremented array X must contain the
342
On entry, INCX specifies the increment for the elements of
343
X. INCX must not be zero.
346
On entry, BETA specifies the scalar beta. When BETA is
347
supplied as zero then Y need not be set on input.
349
Y - (output) float*, array of DIMENSION at least
350
( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n'
352
( 1 + ( n - 1 )*abs( INCY ) ) otherwise.
353
Before entry with BETA non-zero, the incremented array Y
354
must contain the vector y. On exit, Y is overwritten by the
358
On entry, INCY specifies the increment for the elements of
359
Y. INCY must not be zero.
361
==== Sparse Level 2 Blas routine.
364
/* Local variables */
369
int lenx, leny, i, j, irow;
370
int iy, jx, jy, kx, ky;
373
notran = lsame_(trans, "N");
375
Aval = Astore->nzval;
377
/* Test the input parameters */
379
if ( !notran && !lsame_(trans, "T") && !lsame_(trans, "C")) info = 1;
380
else if ( A->nrow < 0 || A->ncol < 0 ) info = 3;
381
else if (incx == 0) info = 5;
382
else if (incy == 0) info = 8;
384
xerbla_("sp_sgemv ", &info);
388
/* Quick return if possible. */
389
if (A->nrow == 0 || A->ncol == 0 || (alpha == 0. && beta == 1.))
392
/* Set LENX and LENY, the lengths of the vectors x and y, and set
393
up the start points in X and Y. */
394
if (lsame_(trans, "N")) {
401
if (incx > 0) kx = 0;
402
else kx = - (lenx - 1) * incx;
403
if (incy > 0) ky = 0;
404
else ky = - (leny - 1) * incy;
406
/* Start the operations. In this version the elements of A are
407
accessed sequentially with one pass through A. */
408
/* First form y := beta*y. */
412
for (i = 0; i < leny; ++i) y[i] = 0.;
414
for (i = 0; i < leny; ++i) y[i] = beta * y[i];
418
for (i = 0; i < leny; ++i) {
423
for (i = 0; i < leny; ++i) {
424
y[iy] = beta * y[iy];
430
if (alpha == 0.) return 0;
433
/* Form y := alpha*A*x + y. */
436
for (j = 0; j < A->ncol; ++j) {
438
temp = alpha * x[jx];
439
for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
440
irow = Astore->rowind[i];
441
y[irow] += temp * Aval[i];
447
ABORT("Not implemented.");
450
/* Form y := alpha*A'*x + y. */
453
for (j = 0; j < A->ncol; ++j) {
455
for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
456
irow = Astore->rowind[i];
457
temp += Aval[i] * x[irow];
459
y[jy] += alpha * temp;
463
ABORT("Not implemented.");