3
* -- SuperLU routine (version 3.0) --
4
* Univ. of California Berkeley, Xerox Palo Alto Research Center,
5
* and Lawrence Berkeley National Lab.
10
Copyright (c) 1994 by Xerox Corporation. All rights reserved.
12
THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
13
EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK.
15
Permission is hereby granted to use or copy this program for any
16
purpose, provided the above notices are retained on all copies.
17
Permission to modify the code and to distribute modified code is
18
granted, provided the above notices are retained, and a notice that
19
the code was modified is included with the above copyright notice.
29
void cusolve(int, int, complex*, complex*);
30
void clsolve(int, int, complex*, complex*);
31
void cmatvec(int, int, int, complex*, complex*, complex*);
35
/* Return value: 0 - successful return
36
* > 0 - number of bytes allocated when run out of space
40
const int jcol, /* in */
41
const int nseg, /* in */
42
complex *dense, /* in */
43
complex *tempv, /* working array */
46
int fpanelc, /* in -- first column in the current panel */
47
GlobalLU_t *Glu, /* modified */
48
SuperLUStat_t *stat /* output */
54
* Performs numeric block updates (sup-col) in topological order.
55
* It features: col-col, 2cols-col, 3cols-col, and sup-col updates.
56
* Special processing on the supernodal portion of L\U[*,j]
60
_fcd ftcs1 = _cptofcd("L", strlen("L")),
61
ftcs2 = _cptofcd("N", strlen("N")),
62
ftcs3 = _cptofcd("U", strlen("U"));
64
int incx = 1, incy = 1;
67
/* krep = representative of current k-th supernode
68
* fsupc = first supernodal column
69
* nsupc = no of columns in supernode
70
* nsupr = no of rows in supernode (used as leading dimension)
71
* luptr = location of supernodal LU-block in storage
72
* kfnz = first nonz in the k-th supernodal segment
73
* no_zeros = no of leading zeros in a supernodal U-segment
75
complex ukj, ukj1, ukj2;
76
int luptr, luptr1, luptr2;
77
int fsupc, nsupc, nsupr, segsze;
78
int nrow; /* No of rows in the matrix of matrix-vector */
79
int jcolp1, jsupno, k, ksub, krep, krep_ind, ksupno;
80
register int lptr, kfnz, isub, irow, i;
81
register int no_zeros, new_next;
83
int fst_col; /* First column within small LU update */
84
int d_fsupc; /* Distance between the first column of the current
85
panel and the first column of the current snode. */
92
complex zero = {0.0, 0.0};
93
complex one = {1.0, 0.0};
94
complex none = {-1.0, 0.0};
95
complex comp_temp, comp_temp1;
97
flops_t *ops = stat->ops;
104
xlusup = Glu->xlusup;
105
nzlumax = Glu->nzlumax;
107
jsupno = supno[jcol];
110
* For each nonz supernode segment of U[*,j] in topological order
113
for (ksub = 0; ksub < nseg; ksub++) {
117
ksupno = supno[krep];
118
if ( jsupno != ksupno ) { /* Outside the rectangular supernode */
120
fsupc = xsup[ksupno];
121
fst_col = SUPERLU_MAX ( fsupc, fpanelc );
123
/* Distance from the current supernode to the current panel;
124
d_fsupc=0 if fsupc > fpanelc. */
125
d_fsupc = fst_col - fsupc;
127
luptr = xlusup[fst_col] + d_fsupc;
128
lptr = xlsub[fsupc] + d_fsupc;
131
kfnz = SUPERLU_MAX ( kfnz, fpanelc );
133
segsze = krep - kfnz + 1;
134
nsupc = krep - fst_col + 1;
135
nsupr = xlsub[fsupc+1] - xlsub[fsupc]; /* Leading dimension */
136
nrow = nsupr - d_fsupc - nsupc;
137
krep_ind = lptr + nsupc - 1;
143
* Case 1: Update U-segment of size 1 -- col-col update
146
ukj = dense[lsub[krep_ind]];
147
luptr += nsupr*(nsupc-1) + nsupc;
149
for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
151
cc_mult(&comp_temp, &ukj, &lusup[luptr]);
152
c_sub(&dense[irow], &dense[irow], &comp_temp);
156
} else if ( segsze <= 3 ) {
157
ukj = dense[lsub[krep_ind]];
158
luptr += nsupr*(nsupc-1) + nsupc-1;
159
ukj1 = dense[lsub[krep_ind - 1]];
160
luptr1 = luptr - nsupr;
162
if ( segsze == 2 ) { /* Case 2: 2cols-col update */
163
cc_mult(&comp_temp, &ukj1, &lusup[luptr1]);
164
c_sub(&ukj, &ukj, &comp_temp);
165
dense[lsub[krep_ind]] = ukj;
166
for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
170
cc_mult(&comp_temp, &ukj, &lusup[luptr]);
171
cc_mult(&comp_temp1, &ukj1, &lusup[luptr1]);
172
c_add(&comp_temp, &comp_temp, &comp_temp1);
173
c_sub(&dense[irow], &dense[irow], &comp_temp);
175
} else { /* Case 3: 3cols-col update */
176
ukj2 = dense[lsub[krep_ind - 2]];
177
luptr2 = luptr1 - nsupr;
178
cc_mult(&comp_temp, &ukj2, &lusup[luptr2-1]);
179
c_sub(&ukj1, &ukj1, &comp_temp);
181
cc_mult(&comp_temp, &ukj1, &lusup[luptr1]);
182
cc_mult(&comp_temp1, &ukj2, &lusup[luptr2]);
183
c_add(&comp_temp, &comp_temp, &comp_temp1);
184
c_sub(&ukj, &ukj, &comp_temp);
186
dense[lsub[krep_ind]] = ukj;
187
dense[lsub[krep_ind-1]] = ukj1;
188
for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
193
cc_mult(&comp_temp, &ukj, &lusup[luptr]);
194
cc_mult(&comp_temp1, &ukj1, &lusup[luptr1]);
195
c_add(&comp_temp, &comp_temp, &comp_temp1);
196
cc_mult(&comp_temp1, &ukj2, &lusup[luptr2]);
197
c_add(&comp_temp, &comp_temp, &comp_temp1);
198
c_sub(&dense[irow], &dense[irow], &comp_temp);
205
* Case: sup-col update
206
* Perform a triangular solve and block update,
207
* then scatter the result of sup-col update to dense
210
no_zeros = kfnz - fst_col;
212
/* Copy U[*,j] segment from dense[*] to tempv[*] */
213
isub = lptr + no_zeros;
214
for (i = 0; i < segsze; i++) {
216
tempv[i] = dense[irow];
220
/* Dense triangular solve -- start effective triangle */
221
luptr += nsupr * no_zeros + no_zeros;
223
#ifdef USE_VENDOR_BLAS
225
CTRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr],
226
&nsupr, tempv, &incx );
228
ctrsv_( "L", "N", "U", &segsze, &lusup[luptr],
229
&nsupr, tempv, &incx );
231
luptr += segsze; /* Dense matrix-vector */
232
tempv1 = &tempv[segsze];
236
CGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr],
237
&nsupr, tempv, &incx, &beta, tempv1, &incy );
239
cgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr],
240
&nsupr, tempv, &incx, &beta, tempv1, &incy );
243
clsolve ( nsupr, segsze, &lusup[luptr], tempv );
245
luptr += segsze; /* Dense matrix-vector */
246
tempv1 = &tempv[segsze];
247
cmatvec (nsupr, nrow , segsze, &lusup[luptr], tempv, tempv1);
251
/* Scatter tempv[] into SPA dense[] as a temporary storage */
252
isub = lptr + no_zeros;
253
for (i = 0; i < segsze; i++) {
255
dense[irow] = tempv[i];
260
/* Scatter tempv1[] into SPA dense[] */
261
for (i = 0; i < nrow; i++) {
263
c_sub(&dense[irow], &dense[irow], &tempv1[i]);
269
} /* if jsupno ... */
271
} /* for each segment... */
274
* Process the supernodal portion of L\U[*,j]
276
nextlu = xlusup[jcol];
277
fsupc = xsup[jsupno];
279
/* Copy the SPA dense into L\U[*,j] */
280
new_next = nextlu + xlsub[fsupc+1] - xlsub[fsupc];
281
while ( new_next > nzlumax ) {
282
if (mem_error = cLUMemXpand(jcol, nextlu, LUSUP, &nzlumax, Glu))
288
for (isub = xlsub[fsupc]; isub < xlsub[fsupc+1]; isub++) {
290
lusup[nextlu] = dense[irow];
295
xlusup[jcolp1] = nextlu; /* Close L\U[*,jcol] */
297
/* For more updates within the panel (also within the current supernode),
298
* should start from the first column of the panel, or the first column
299
* of the supernode, whichever is bigger. There are 2 cases:
300
* 1) fsupc < fpanelc, then fst_col := fpanelc
301
* 2) fsupc >= fpanelc, then fst_col := fsupc
303
fst_col = SUPERLU_MAX ( fsupc, fpanelc );
305
if ( fst_col < jcol ) {
307
/* Distance between the current supernode and the current panel.
308
d_fsupc=0 if fsupc >= fpanelc. */
309
d_fsupc = fst_col - fsupc;
311
lptr = xlsub[fsupc] + d_fsupc;
312
luptr = xlusup[fst_col] + d_fsupc;
313
nsupr = xlsub[fsupc+1] - xlsub[fsupc]; /* Leading dimension */
314
nsupc = jcol - fst_col; /* Excluding jcol */
315
nrow = nsupr - d_fsupc - nsupc;
317
/* Points to the beginning of jcol in snode L\U(jsupno) */
318
ufirst = xlusup[jcol] + d_fsupc;
320
ops[TRSV] += 4 * nsupc * (nsupc - 1);
321
ops[GEMV] += 8 * nrow * nsupc;
323
#ifdef USE_VENDOR_BLAS
325
CTRSV( ftcs1, ftcs2, ftcs3, &nsupc, &lusup[luptr],
326
&nsupr, &lusup[ufirst], &incx );
328
ctrsv_( "L", "N", "U", &nsupc, &lusup[luptr],
329
&nsupr, &lusup[ufirst], &incx );
332
alpha = none; beta = one; /* y := beta*y + alpha*A*x */
335
CGEMV( ftcs2, &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr,
336
&lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy );
338
cgemv_( "N", &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr,
339
&lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy );
342
clsolve ( nsupr, nsupc, &lusup[luptr], &lusup[ufirst] );
344
cmatvec ( nsupr, nrow, nsupc, &lusup[luptr+nsupc],
345
&lusup[ufirst], tempv );
347
/* Copy updates from tempv[*] into lusup[*] */
348
isub = ufirst + nsupc;
349
for (i = 0; i < nrow; i++) {
350
c_sub(&lusup[isub], &lusup[isub], &tempv[i]);
358
} /* if fst_col < jcol ... */