3
* -- SuperLU routine (version 3.0) --
4
* Univ. of California Berkeley, Xerox Palo Alto Research Center,
5
* and Lawrence Berkeley National Lab.
10
Copyright (c) 1994 by Xerox Corporation. All rights reserved.
12
THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
13
EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK.
15
Permission is hereby granted to use or copy this program for any
16
purpose, provided the above notices are retained on all copies.
17
Permission to modify the code and to distribute modified code is
18
granted, provided the above notices are retained, and a notice that
19
the code was modified is included with the above copyright notice.
29
void susolve(int, int, float*, float*);
30
void slsolve(int, int, float*, float*);
31
void smatvec(int, int, int, float*, float*, float*);
35
/* Return value: 0 - successful return
36
* > 0 - number of bytes allocated when run out of space
40
const int jcol, /* in */
41
const int nseg, /* in */
42
float *dense, /* in */
43
float *tempv, /* working array */
46
int fpanelc, /* in -- first column in the current panel */
47
GlobalLU_t *Glu, /* modified */
48
SuperLUStat_t *stat /* output */
54
* Performs numeric block updates (sup-col) in topological order.
55
* It features: col-col, 2cols-col, 3cols-col, and sup-col updates.
56
* Special processing on the supernodal portion of L\U[*,j]
60
_fcd ftcs1 = _cptofcd("L", strlen("L")),
61
ftcs2 = _cptofcd("N", strlen("N")),
62
ftcs3 = _cptofcd("U", strlen("U"));
65
#ifdef USE_VENDOR_BLAS
66
int incx = 1, incy = 1;
70
/* krep = representative of current k-th supernode
71
* fsupc = first supernodal column
72
* nsupc = no of columns in supernode
73
* nsupr = no of rows in supernode (used as leading dimension)
74
* luptr = location of supernodal LU-block in storage
75
* kfnz = first nonz in the k-th supernodal segment
76
* no_zeros = no of leading zeros in a supernodal U-segment
78
float ukj, ukj1, ukj2;
79
int luptr, luptr1, luptr2;
80
int fsupc, nsupc, nsupr, segsze;
81
int nrow; /* No of rows in the matrix of matrix-vector */
82
int jcolp1, jsupno, k, ksub, krep, krep_ind, ksupno;
83
register int lptr, kfnz, isub, irow, i;
84
register int no_zeros, new_next;
86
int fst_col; /* First column within small LU update */
87
int d_fsupc; /* Distance between the first column of the current
88
panel and the first column of the current snode. */
96
#ifdef USE_VENDOR_BLAS
101
flops_t *ops = stat->ops;
108
xlusup = Glu->xlusup;
109
nzlumax = Glu->nzlumax;
111
jsupno = supno[jcol];
114
* For each nonz supernode segment of U[*,j] in topological order
117
for (ksub = 0; ksub < nseg; ksub++) {
121
ksupno = supno[krep];
122
if ( jsupno != ksupno ) { /* Outside the rectangular supernode */
124
fsupc = xsup[ksupno];
125
fst_col = SUPERLU_MAX ( fsupc, fpanelc );
127
/* Distance from the current supernode to the current panel;
128
d_fsupc=0 if fsupc > fpanelc. */
129
d_fsupc = fst_col - fsupc;
131
luptr = xlusup[fst_col] + d_fsupc;
132
lptr = xlsub[fsupc] + d_fsupc;
135
kfnz = SUPERLU_MAX ( kfnz, fpanelc );
137
segsze = krep - kfnz + 1;
138
nsupc = krep - fst_col + 1;
139
nsupr = xlsub[fsupc+1] - xlsub[fsupc]; /* Leading dimension */
140
nrow = nsupr - d_fsupc - nsupc;
141
krep_ind = lptr + nsupc - 1;
143
ops[TRSV] += segsze * (segsze - 1);
144
ops[GEMV] += 2 * nrow * segsze;
148
* Case 1: Update U-segment of size 1 -- col-col update
151
ukj = dense[lsub[krep_ind]];
152
luptr += nsupr*(nsupc-1) + nsupc;
154
for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
156
dense[irow] -= ukj*lusup[luptr];
160
} else if ( segsze <= 3 ) {
161
ukj = dense[lsub[krep_ind]];
162
luptr += nsupr*(nsupc-1) + nsupc-1;
163
ukj1 = dense[lsub[krep_ind - 1]];
164
luptr1 = luptr - nsupr;
166
if ( segsze == 2 ) { /* Case 2: 2cols-col update */
167
ukj -= ukj1 * lusup[luptr1];
168
dense[lsub[krep_ind]] = ukj;
169
for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
173
dense[irow] -= ( ukj*lusup[luptr]
174
+ ukj1*lusup[luptr1] );
176
} else { /* Case 3: 3cols-col update */
177
ukj2 = dense[lsub[krep_ind - 2]];
178
luptr2 = luptr1 - nsupr;
179
ukj1 -= ukj2 * lusup[luptr2-1];
180
ukj = ukj - ukj1*lusup[luptr1] - ukj2*lusup[luptr2];
181
dense[lsub[krep_ind]] = ukj;
182
dense[lsub[krep_ind-1]] = ukj1;
183
for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
188
dense[irow] -= ( ukj*lusup[luptr]
189
+ ukj1*lusup[luptr1] + ukj2*lusup[luptr2] );
197
* Case: sup-col update
198
* Perform a triangular solve and block update,
199
* then scatter the result of sup-col update to dense
202
no_zeros = kfnz - fst_col;
204
/* Copy U[*,j] segment from dense[*] to tempv[*] */
205
isub = lptr + no_zeros;
206
for (i = 0; i < segsze; i++) {
208
tempv[i] = dense[irow];
212
/* Dense triangular solve -- start effective triangle */
213
luptr += nsupr * no_zeros + no_zeros;
215
#ifdef USE_VENDOR_BLAS
217
STRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr],
218
&nsupr, tempv, &incx );
220
strsv_( "L", "N", "U", &segsze, &lusup[luptr],
221
&nsupr, tempv, &incx );
223
luptr += segsze; /* Dense matrix-vector */
224
tempv1 = &tempv[segsze];
228
SGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr],
229
&nsupr, tempv, &incx, &beta, tempv1, &incy );
231
sgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr],
232
&nsupr, tempv, &incx, &beta, tempv1, &incy );
235
slsolve ( nsupr, segsze, &lusup[luptr], tempv );
237
luptr += segsze; /* Dense matrix-vector */
238
tempv1 = &tempv[segsze];
239
smatvec (nsupr, nrow , segsze, &lusup[luptr], tempv, tempv1);
243
/* Scatter tempv[] into SPA dense[] as a temporary storage */
244
isub = lptr + no_zeros;
245
for (i = 0; i < segsze; i++) {
247
dense[irow] = tempv[i];
252
/* Scatter tempv1[] into SPA dense[] */
253
for (i = 0; i < nrow; i++) {
255
dense[irow] -= tempv1[i];
261
} /* if jsupno ... */
263
} /* for each segment... */
266
* Process the supernodal portion of L\U[*,j]
268
nextlu = xlusup[jcol];
269
fsupc = xsup[jsupno];
271
/* Copy the SPA dense into L\U[*,j] */
272
new_next = nextlu + xlsub[fsupc+1] - xlsub[fsupc];
273
while ( new_next > nzlumax ) {
274
if ((mem_error = sLUMemXpand(jcol, nextlu, LUSUP, &nzlumax, Glu)))
280
for (isub = xlsub[fsupc]; isub < xlsub[fsupc+1]; isub++) {
282
lusup[nextlu] = dense[irow];
287
xlusup[jcolp1] = nextlu; /* Close L\U[*,jcol] */
289
/* For more updates within the panel (also within the current supernode),
290
* should start from the first column of the panel, or the first column
291
* of the supernode, whichever is bigger. There are 2 cases:
292
* 1) fsupc < fpanelc, then fst_col := fpanelc
293
* 2) fsupc >= fpanelc, then fst_col := fsupc
295
fst_col = SUPERLU_MAX ( fsupc, fpanelc );
297
if ( fst_col < jcol ) {
299
/* Distance between the current supernode and the current panel.
300
d_fsupc=0 if fsupc >= fpanelc. */
301
d_fsupc = fst_col - fsupc;
303
lptr = xlsub[fsupc] + d_fsupc;
304
luptr = xlusup[fst_col] + d_fsupc;
305
nsupr = xlsub[fsupc+1] - xlsub[fsupc]; /* Leading dimension */
306
nsupc = jcol - fst_col; /* Excluding jcol */
307
nrow = nsupr - d_fsupc - nsupc;
309
/* Points to the beginning of jcol in snode L\U(jsupno) */
310
ufirst = xlusup[jcol] + d_fsupc;
312
ops[TRSV] += nsupc * (nsupc - 1);
313
ops[GEMV] += 2 * nrow * nsupc;
315
#ifdef USE_VENDOR_BLAS
317
STRSV( ftcs1, ftcs2, ftcs3, &nsupc, &lusup[luptr],
318
&nsupr, &lusup[ufirst], &incx );
320
strsv_( "L", "N", "U", &nsupc, &lusup[luptr],
321
&nsupr, &lusup[ufirst], &incx );
324
alpha = none; beta = one; /* y := beta*y + alpha*A*x */
327
SGEMV( ftcs2, &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr,
328
&lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy );
330
sgemv_( "N", &nrow, &nsupc, &alpha, &lusup[luptr+nsupc], &nsupr,
331
&lusup[ufirst], &incx, &beta, &lusup[ufirst+nsupc], &incy );
334
slsolve ( nsupr, nsupc, &lusup[luptr], &lusup[ufirst] );
336
smatvec ( nsupr, nrow, nsupc, &lusup[luptr+nsupc],
337
&lusup[ufirst], tempv );
339
/* Copy updates from tempv[*] into lusup[*] */
340
isub = ufirst + nsupc;
341
for (i = 0; i < nrow; i++) {
342
lusup[isub] -= tempv[i];
350
} /* if fst_col < jcol ... */