3
* -- SuperLU routine (version 3.0) --
4
* Univ. of California Berkeley, Xerox Palo Alto Research Center,
5
* and Lawrence Berkeley National Lab.
10
* File name: zsp_blas2.c
11
* Purpose: Sparse BLAS 2, using some dense BLAS 2 operations.
14
#include "slu_zdefs.h"
19
void zusolve(int, int, doublecomplex*, doublecomplex*);
20
void zlsolve(int, int, doublecomplex*, doublecomplex*);
21
void zmatvec(int, int, int, doublecomplex*, doublecomplex*, doublecomplex*);
25
sp_ztrsv(char *uplo, char *trans, char *diag, SuperMatrix *L,
26
SuperMatrix *U, doublecomplex *x, SuperLUStat_t *stat, int *info)
32
* sp_ztrsv() solves one of the systems of equations
33
* A*x = b, or A'*x = b,
34
* where b and x are n element vectors and A is a sparse unit , or
35
* non-unit, upper or lower triangular matrix.
36
* No test for singularity or near-singularity is included in this
37
* routine. Such tests must be performed before calling this routine.
42
* uplo - (input) char*
43
* On entry, uplo specifies whether the matrix is an upper or
44
* lower triangular matrix as follows:
45
* uplo = 'U' or 'u' A is an upper triangular matrix.
46
* uplo = 'L' or 'l' A is a lower triangular matrix.
48
* trans - (input) char*
49
* On entry, trans specifies the equations to be solved as
51
* trans = 'N' or 'n' A*x = b.
52
* trans = 'T' or 't' A'*x = b.
53
* trans = 'C' or 'c' A^H*x = b.
55
* diag - (input) char*
56
* On entry, diag specifies whether or not A is unit
57
* triangular as follows:
58
* diag = 'U' or 'u' A is assumed to be unit triangular.
59
* diag = 'N' or 'n' A is not assumed to be unit
62
* L - (input) SuperMatrix*
63
* The factor L from the factorization Pr*A*Pc=L*U. Use
64
* compressed row subscripts storage for supernodes,
65
* i.e., L has types: Stype = SC, Dtype = SLU_Z, Mtype = TRLU.
67
* U - (input) SuperMatrix*
68
* The factor U from the factorization Pr*A*Pc=L*U.
69
* U has types: Stype = NC, Dtype = SLU_Z, Mtype = TRU.
71
* x - (input/output) doublecomplex*
72
* Before entry, the incremented array X must contain the n
73
* element right-hand side vector b. On exit, X is overwritten
74
* with the solution vector x.
76
* info - (output) int*
77
* If *info = -i, the i-th argument had an illegal value.
81
_fcd ftcs1 = _cptofcd("L", strlen("L")),
82
ftcs2 = _cptofcd("N", strlen("N")),
83
ftcs3 = _cptofcd("U", strlen("U"));
87
doublecomplex *Lval, *Uval;
88
int incx = 1, incy = 1;
90
doublecomplex alpha = {1.0, 0.0}, beta = {1.0, 0.0};
91
doublecomplex comp_zero = {0.0, 0.0};
93
int fsupc, nsupr, nsupc, luptr, istart, irow;
98
/* Test the input parameters */
100
if ( !lsame_(uplo,"L") && !lsame_(uplo, "U") ) *info = -1;
101
else if ( !lsame_(trans, "N") && !lsame_(trans, "T") &&
102
!lsame_(trans, "C")) *info = -2;
103
else if ( !lsame_(diag, "U") && !lsame_(diag, "N") ) *info = -3;
104
else if ( L->nrow != L->ncol || L->nrow < 0 ) *info = -4;
105
else if ( U->nrow != U->ncol || U->nrow < 0 ) *info = -5;
108
xerbla_("sp_ztrsv", &i);
113
Lval = Lstore->nzval;
115
Uval = Ustore->nzval;
118
if ( !(work = doublecomplexCalloc(L->nrow)) )
119
ABORT("Malloc fails for work in sp_ztrsv().");
121
if ( lsame_(trans, "N") ) { /* Form x := inv(A)*x. */
123
if ( lsame_(uplo, "L") ) {
124
/* Form x := inv(L)*x */
125
if ( L->nrow == 0 ) return 0; /* Quick return */
127
for (k = 0; k <= Lstore->nsuper; k++) {
128
fsupc = L_FST_SUPC(k);
129
istart = L_SUB_START(fsupc);
130
nsupr = L_SUB_START(fsupc+1) - istart;
131
nsupc = L_FST_SUPC(k+1) - fsupc;
132
luptr = L_NZ_START(fsupc);
133
nrow = nsupr - nsupc;
135
/* 1 z_div costs 10 flops */
136
solve_ops += 4 * nsupc * (nsupc - 1) + 10 * nsupc;
137
solve_ops += 8 * nrow * nsupc;
140
for (iptr=istart+1; iptr < L_SUB_START(fsupc+1); ++iptr) {
143
zz_mult(&comp_zero, &x[fsupc], &Lval[luptr]);
144
z_sub(&x[irow], &x[irow], &comp_zero);
147
#ifdef USE_VENDOR_BLAS
149
CTRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
152
CGEMV(ftcs2, &nrow, &nsupc, &alpha, &Lval[luptr+nsupc],
153
&nsupr, &x[fsupc], &incx, &beta, &work[0], &incy);
155
ztrsv_("L", "N", "U", &nsupc, &Lval[luptr], &nsupr,
158
zgemv_("N", &nrow, &nsupc, &alpha, &Lval[luptr+nsupc],
159
&nsupr, &x[fsupc], &incx, &beta, &work[0], &incy);
162
zlsolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc]);
164
zmatvec ( nsupr, nsupr-nsupc, nsupc, &Lval[luptr+nsupc],
165
&x[fsupc], &work[0] );
168
iptr = istart + nsupc;
169
for (i = 0; i < nrow; ++i, ++iptr) {
171
z_sub(&x[irow], &x[irow], &work[i]); /* Scatter */
179
/* Form x := inv(U)*x */
181
if ( U->nrow == 0 ) return 0; /* Quick return */
183
for (k = Lstore->nsuper; k >= 0; k--) {
184
fsupc = L_FST_SUPC(k);
185
nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc);
186
nsupc = L_FST_SUPC(k+1) - fsupc;
187
luptr = L_NZ_START(fsupc);
189
/* 1 z_div costs 10 flops */
190
solve_ops += 4 * nsupc * (nsupc + 1) + 10 * nsupc;
193
z_div(&x[fsupc], &x[fsupc], &Lval[luptr]);
194
for (i = U_NZ_START(fsupc); i < U_NZ_START(fsupc+1); ++i) {
196
zz_mult(&comp_zero, &x[fsupc], &Uval[i]);
197
z_sub(&x[irow], &x[irow], &comp_zero);
200
#ifdef USE_VENDOR_BLAS
202
CTRSV(ftcs3, ftcs2, ftcs2, &nsupc, &Lval[luptr], &nsupr,
205
ztrsv_("U", "N", "N", &nsupc, &Lval[luptr], &nsupr,
209
zusolve ( nsupr, nsupc, &Lval[luptr], &x[fsupc] );
212
for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
213
solve_ops += 8*(U_NZ_START(jcol+1) - U_NZ_START(jcol));
214
for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1);
217
zz_mult(&comp_zero, &x[jcol], &Uval[i]);
218
z_sub(&x[irow], &x[irow], &comp_zero);
225
} else if ( lsame_(trans, "T") ) { /* Form x := inv(A')*x */
227
if ( lsame_(uplo, "L") ) {
228
/* Form x := inv(L')*x */
229
if ( L->nrow == 0 ) return 0; /* Quick return */
231
for (k = Lstore->nsuper; k >= 0; --k) {
232
fsupc = L_FST_SUPC(k);
233
istart = L_SUB_START(fsupc);
234
nsupr = L_SUB_START(fsupc+1) - istart;
235
nsupc = L_FST_SUPC(k+1) - fsupc;
236
luptr = L_NZ_START(fsupc);
238
solve_ops += 8 * (nsupr - nsupc) * nsupc;
240
for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
241
iptr = istart + nsupc;
242
for (i = L_NZ_START(jcol) + nsupc;
243
i < L_NZ_START(jcol+1); i++) {
245
zz_mult(&comp_zero, &x[irow], &Lval[i]);
246
z_sub(&x[jcol], &x[jcol], &comp_zero);
252
solve_ops += 4 * nsupc * (nsupc - 1);
254
ftcs1 = _cptofcd("L", strlen("L"));
255
ftcs2 = _cptofcd("T", strlen("T"));
256
ftcs3 = _cptofcd("U", strlen("U"));
257
CTRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
260
ztrsv_("L", "T", "U", &nsupc, &Lval[luptr], &nsupr,
266
/* Form x := inv(U')*x */
267
if ( U->nrow == 0 ) return 0; /* Quick return */
269
for (k = 0; k <= Lstore->nsuper; k++) {
270
fsupc = L_FST_SUPC(k);
271
nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc);
272
nsupc = L_FST_SUPC(k+1) - fsupc;
273
luptr = L_NZ_START(fsupc);
275
for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
276
solve_ops += 8*(U_NZ_START(jcol+1) - U_NZ_START(jcol));
277
for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++) {
279
zz_mult(&comp_zero, &x[irow], &Uval[i]);
280
z_sub(&x[jcol], &x[jcol], &comp_zero);
284
/* 1 z_div costs 10 flops */
285
solve_ops += 4 * nsupc * (nsupc + 1) + 10 * nsupc;
288
z_div(&x[fsupc], &x[fsupc], &Lval[luptr]);
291
ftcs1 = _cptofcd("U", strlen("U"));
292
ftcs2 = _cptofcd("T", strlen("T"));
293
ftcs3 = _cptofcd("N", strlen("N"));
294
CTRSV( ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
297
ztrsv_("U", "T", "N", &nsupc, &Lval[luptr], &nsupr,
303
} else { /* Form x := conj(inv(A'))*x */
305
if ( lsame_(uplo, "L") ) {
306
/* Form x := conj(inv(L'))*x */
307
if ( L->nrow == 0 ) return 0; /* Quick return */
309
for (k = Lstore->nsuper; k >= 0; --k) {
310
fsupc = L_FST_SUPC(k);
311
istart = L_SUB_START(fsupc);
312
nsupr = L_SUB_START(fsupc+1) - istart;
313
nsupc = L_FST_SUPC(k+1) - fsupc;
314
luptr = L_NZ_START(fsupc);
316
solve_ops += 8 * (nsupr - nsupc) * nsupc;
318
for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
319
iptr = istart + nsupc;
320
for (i = L_NZ_START(jcol) + nsupc;
321
i < L_NZ_START(jcol+1); i++) {
323
zz_conj(&temp, &Lval[i]);
324
zz_mult(&comp_zero, &x[irow], &temp);
325
z_sub(&x[jcol], &x[jcol], &comp_zero);
331
solve_ops += 4 * nsupc * (nsupc - 1);
333
ftcs1 = _cptofcd("L", strlen("L"));
334
ftcs2 = _cptofcd(trans, strlen("T"));
335
ftcs3 = _cptofcd("U", strlen("U"));
336
ZTRSV(ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
339
ztrsv_("L", trans, "U", &nsupc, &Lval[luptr], &nsupr,
345
/* Form x := conj(inv(U'))*x */
346
if ( U->nrow == 0 ) return 0; /* Quick return */
348
for (k = 0; k <= Lstore->nsuper; k++) {
349
fsupc = L_FST_SUPC(k);
350
nsupr = L_SUB_START(fsupc+1) - L_SUB_START(fsupc);
351
nsupc = L_FST_SUPC(k+1) - fsupc;
352
luptr = L_NZ_START(fsupc);
354
for (jcol = fsupc; jcol < L_FST_SUPC(k+1); jcol++) {
355
solve_ops += 8*(U_NZ_START(jcol+1) - U_NZ_START(jcol));
356
for (i = U_NZ_START(jcol); i < U_NZ_START(jcol+1); i++) {
358
zz_conj(&temp, &Uval[i]);
359
zz_mult(&comp_zero, &x[irow], &temp);
360
z_sub(&x[jcol], &x[jcol], &comp_zero);
364
/* 1 z_div costs 10 flops */
365
solve_ops += 4 * nsupc * (nsupc + 1) + 10 * nsupc;
368
zz_conj(&temp, &Lval[luptr]);
369
z_div(&x[fsupc], &x[fsupc], &temp);
372
ftcs1 = _cptofcd("U", strlen("U"));
373
ftcs2 = _cptofcd(trans, strlen("T"));
374
ftcs3 = _cptofcd("N", strlen("N"));
375
ZTRSV( ftcs1, ftcs2, ftcs3, &nsupc, &Lval[luptr], &nsupr,
378
ztrsv_("U", trans, "N", &nsupc, &Lval[luptr], &nsupr,
386
stat->ops[SOLVE] += solve_ops;
394
sp_zgemv(char *trans, doublecomplex alpha, SuperMatrix *A, doublecomplex *x,
395
int incx, doublecomplex beta, doublecomplex *y, int incy)
400
sp_zgemv() performs one of the matrix-vector operations
401
y := alpha*A*x + beta*y, or y := alpha*A'*x + beta*y,
402
where alpha and beta are scalars, x and y are vectors and A is a
403
sparse A->nrow by A->ncol matrix.
408
TRANS - (input) char*
409
On entry, TRANS specifies the operation to be performed as
411
TRANS = 'N' or 'n' y := alpha*A*x + beta*y.
412
TRANS = 'T' or 't' y := alpha*A'*x + beta*y.
413
TRANS = 'C' or 'c' y := alpha*A'*x + beta*y.
415
ALPHA - (input) doublecomplex
416
On entry, ALPHA specifies the scalar alpha.
418
A - (input) SuperMatrix*
419
Before entry, the leading m by n part of the array A must
420
contain the matrix of coefficients.
422
X - (input) doublecomplex*, array of DIMENSION at least
423
( 1 + ( n - 1 )*abs( INCX ) ) when TRANS = 'N' or 'n'
425
( 1 + ( m - 1 )*abs( INCX ) ) otherwise.
426
Before entry, the incremented array X must contain the
430
On entry, INCX specifies the increment for the elements of
431
X. INCX must not be zero.
433
BETA - (input) doublecomplex
434
On entry, BETA specifies the scalar beta. When BETA is
435
supplied as zero then Y need not be set on input.
437
Y - (output) doublecomplex*, array of DIMENSION at least
438
( 1 + ( m - 1 )*abs( INCY ) ) when TRANS = 'N' or 'n'
440
( 1 + ( n - 1 )*abs( INCY ) ) otherwise.
441
Before entry with BETA non-zero, the incremented array Y
442
must contain the vector y. On exit, Y is overwritten by the
446
On entry, INCY specifies the increment for the elements of
447
Y. INCY must not be zero.
449
==== Sparse Level 2 Blas routine.
452
/* Local variables */
456
doublecomplex temp, temp1;
457
int lenx, leny, i, j, irow;
458
int iy, jx, jy, kx, ky;
460
doublecomplex comp_zero = {0.0, 0.0};
461
doublecomplex comp_one = {1.0, 0.0};
463
notran = lsame_(trans, "N");
465
Aval = Astore->nzval;
467
/* Test the input parameters */
469
if ( !notran && !lsame_(trans, "T") && !lsame_(trans, "C")) info = 1;
470
else if ( A->nrow < 0 || A->ncol < 0 ) info = 3;
471
else if (incx == 0) info = 5;
472
else if (incy == 0) info = 8;
474
xerbla_("sp_zgemv ", &info);
478
/* Quick return if possible. */
479
if (A->nrow == 0 || A->ncol == 0 ||
480
z_eq(&alpha, &comp_zero) &&
481
z_eq(&beta, &comp_one))
485
/* Set LENX and LENY, the lengths of the vectors x and y, and set
486
up the start points in X and Y. */
487
if (lsame_(trans, "N")) {
494
if (incx > 0) kx = 0;
495
else kx = - (lenx - 1) * incx;
496
if (incy > 0) ky = 0;
497
else ky = - (leny - 1) * incy;
499
/* Start the operations. In this version the elements of A are
500
accessed sequentially with one pass through A. */
501
/* First form y := beta*y. */
502
if ( !z_eq(&beta, &comp_one) ) {
504
if ( z_eq(&beta, &comp_zero) )
505
for (i = 0; i < leny; ++i) y[i] = comp_zero;
507
for (i = 0; i < leny; ++i)
508
zz_mult(&y[i], &beta, &y[i]);
511
if ( z_eq(&beta, &comp_zero) )
512
for (i = 0; i < leny; ++i) {
517
for (i = 0; i < leny; ++i) {
518
zz_mult(&y[iy], &beta, &y[iy]);
524
if ( z_eq(&alpha, &comp_zero) ) return 0;
527
/* Form y := alpha*A*x + y. */
530
for (j = 0; j < A->ncol; ++j) {
531
if ( !z_eq(&x[jx], &comp_zero) ) {
532
zz_mult(&temp, &alpha, &x[jx]);
533
for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
534
irow = Astore->rowind[i];
535
zz_mult(&temp1, &temp, &Aval[i]);
536
z_add(&y[irow], &y[irow], &temp1);
542
ABORT("Not implemented.");
545
/* Form y := alpha*A'*x + y. */
548
for (j = 0; j < A->ncol; ++j) {
550
for (i = Astore->colptr[j]; i < Astore->colptr[j+1]; ++i) {
551
irow = Astore->rowind[i];
552
zz_mult(&temp1, &Aval[i], &x[irow]);
553
z_add(&temp, &temp, &temp1);
555
zz_mult(&temp1, &alpha, &temp);
556
z_add(&y[jy], &y[jy], &temp1);
560
ABORT("Not implemented.");