3
* -- SuperLU routine (version 3.0) --
4
* Univ. of California Berkeley, Xerox Palo Alto Research Center,
5
* and Lawrence Berkeley National Lab.
10
Copyright (c) 1994 by Xerox Corporation. All rights reserved.
12
THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
13
EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK.
15
Permission is hereby granted to use or copy this program for any
16
purpose, provided the above notices are retained on all copies.
17
Permission to modify the code and to distribute modified code is
18
granted, provided the above notices are retained, and a notice that
19
the code was modified is included with the above copyright notice.
29
void slsolve(int, int, float *, float *);
30
void smatvec(int, int, int, float *, float *, float *);
31
extern void scheck_tempv();
35
const int m, /* in - number of rows in the matrix */
37
const int jcol, /* in */
38
const int nseg, /* in */
39
float *dense, /* out, of size n by w */
40
float *tempv, /* working array */
42
int *repfnz, /* in, of size n by w */
43
GlobalLU_t *Glu, /* modified */
44
SuperLUStat_t *stat /* output */
51
* Performs numeric block updates (sup-panel) in topological order.
52
* It features: col-col, 2cols-col, 3cols-col, and sup-col updates.
53
* Special processing on the supernodal portion of L\U[*,j]
55
* Before entering this routine, the original nonzeros in the panel
56
* were already copied into the spa[m,w].
58
* Updated/Output parameters-
59
* dense[0:m-1,w]: L[*,j:j+w-1] and U[*,j:j+w-1] are returned
60
* collectively in the m-by-w vector dense[*].
64
#ifdef USE_VENDOR_BLAS
66
_fcd ftcs1 = _cptofcd("L", strlen("L")),
67
ftcs2 = _cptofcd("N", strlen("N")),
68
ftcs3 = _cptofcd("U", strlen("U"));
70
int incx = 1, incy = 1;
75
int fsupc, nsupc, nsupr, nrow;
77
float ukj, ukj1, ukj2;
78
int luptr, luptr1, luptr2;
80
int block_nrow; /* no of rows in a block row */
81
register int lptr; /* Points to the row subscripts of a supernode */
82
int kfnz, irow, no_zeros;
83
register int isub, isub1, i;
84
register int jj; /* Index through each column in the panel */
89
int *repfnz_col; /* repfnz[] for a column in the panel */
90
float *dense_col; /* dense[] for a column in the panel */
91
float *tempv1; /* Used in 1-D update */
92
float *TriTmp, *MatvecTmp; /* used in 2-D update */
95
register int r_ind, r_hi;
96
static int first = 1, maxsuper, rowblk, colblk;
97
flops_t *ops = stat->ops;
104
xlusup = Glu->xlusup;
107
maxsuper = sp_ienv(3);
112
ldaTmp = maxsuper + rowblk;
115
* For each nonz supernode segment of U[*,j] in topological order
118
for (ksub = 0; ksub < nseg; ksub++) { /* for each updating supernode */
120
/* krep = representative of current k-th supernode
121
* fsupc = first supernodal column
122
* nsupc = no of columns in a supernode
123
* nsupr = no of rows in a supernode
126
fsupc = xsup[supno[krep]];
127
nsupc = krep - fsupc + 1;
128
nsupr = xlsub[fsupc+1] - xlsub[fsupc];
129
nrow = nsupr - nsupc;
131
krep_ind = lptr + nsupc - 1;
136
if ( nsupc >= colblk && nrow > rowblk ) { /* 2-D block update */
140
/* Sequence through each column in panel -- triangular solves */
141
for (jj = jcol; jj < jcol + w; jj++,
142
repfnz_col += m, dense_col += m, TriTmp += ldaTmp ) {
144
kfnz = repfnz_col[krep];
145
if ( kfnz == EMPTY ) continue; /* Skip any zero segment */
147
segsze = krep - kfnz + 1;
148
luptr = xlusup[fsupc];
150
ops[TRSV] += segsze * (segsze - 1);
151
ops[GEMV] += 2 * nrow * segsze;
153
/* Case 1: Update U-segment of size 1 -- col-col update */
155
ukj = dense_col[lsub[krep_ind]];
156
luptr += nsupr*(nsupc-1) + nsupc;
158
for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) {
160
dense_col[irow] -= ukj * lusup[luptr];
164
} else if ( segsze <= 3 ) {
165
ukj = dense_col[lsub[krep_ind]];
166
ukj1 = dense_col[lsub[krep_ind - 1]];
167
luptr += nsupr*(nsupc-1) + nsupc-1;
168
luptr1 = luptr - nsupr;
171
ukj -= ukj1 * lusup[luptr1];
172
dense_col[lsub[krep_ind]] = ukj;
173
for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
176
dense_col[irow] -= (ukj*lusup[luptr]
177
+ ukj1*lusup[luptr1]);
180
ukj2 = dense_col[lsub[krep_ind - 2]];
181
luptr2 = luptr1 - nsupr;
182
ukj1 -= ukj2 * lusup[luptr2-1];
183
ukj = ukj - ukj1*lusup[luptr1] - ukj2*lusup[luptr2];
184
dense_col[lsub[krep_ind]] = ukj;
185
dense_col[lsub[krep_ind-1]] = ukj1;
186
for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
188
luptr++; luptr1++; luptr2++;
189
dense_col[irow] -= ( ukj*lusup[luptr]
190
+ ukj1*lusup[luptr1] + ukj2*lusup[luptr2] );
194
} else { /* segsze >= 4 */
196
/* Copy U[*,j] segment from dense[*] to TriTmp[*], which
197
holds the result of triangular solves. */
198
no_zeros = kfnz - fsupc;
199
isub = lptr + no_zeros;
200
for (i = 0; i < segsze; ++i) {
202
TriTmp[i] = dense_col[irow]; /* Gather */
206
/* start effective triangle */
207
luptr += nsupr * no_zeros + no_zeros;
209
#ifdef USE_VENDOR_BLAS
211
STRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr],
212
&nsupr, TriTmp, &incx );
214
strsv_( "L", "N", "U", &segsze, &lusup[luptr],
215
&nsupr, TriTmp, &incx );
218
slsolve ( nsupr, segsze, &lusup[luptr], TriTmp );
224
} /* for jj ... end tri-solves */
226
/* Block row updates; push all the way into dense[*] block */
227
for ( r_ind = 0; r_ind < nrow; r_ind += rowblk ) {
229
r_hi = SUPERLU_MIN(nrow, r_ind + rowblk);
230
block_nrow = SUPERLU_MIN(rowblk, r_hi - r_ind);
231
luptr = xlusup[fsupc] + nsupc + r_ind;
232
isub1 = lptr + nsupc + r_ind;
238
/* Sequence through each column in panel -- matrix-vector */
239
for (jj = jcol; jj < jcol + w; jj++,
240
repfnz_col += m, dense_col += m, TriTmp += ldaTmp) {
242
kfnz = repfnz_col[krep];
243
if ( kfnz == EMPTY ) continue; /* Skip any zero segment */
245
segsze = krep - kfnz + 1;
246
if ( segsze <= 3 ) continue; /* skip unrolled cases */
248
/* Perform a block update, and scatter the result of
249
matrix-vector to dense[]. */
250
no_zeros = kfnz - fsupc;
251
luptr1 = luptr + nsupr * no_zeros;
252
MatvecTmp = &TriTmp[maxsuper];
254
#ifdef USE_VENDOR_BLAS
258
SGEMV(ftcs2, &block_nrow, &segsze, &alpha, &lusup[luptr1],
259
&nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy);
261
sgemv_("N", &block_nrow, &segsze, &alpha, &lusup[luptr1],
262
&nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy);
265
smatvec(nsupr, block_nrow, segsze, &lusup[luptr1],
269
/* Scatter MatvecTmp[*] into SPA dense[*] temporarily
270
* such that MatvecTmp[*] can be re-used for the
271
* the next blok row update. dense[] will be copied into
272
* global store after the whole panel has been finished.
275
for (i = 0; i < block_nrow; i++) {
277
dense_col[irow] -= MatvecTmp[i];
284
} /* for each block row ... */
286
/* Scatter the triangular solves into SPA dense[*] */
291
for (jj = jcol; jj < jcol + w; jj++,
292
repfnz_col += m, dense_col += m, TriTmp += ldaTmp) {
293
kfnz = repfnz_col[krep];
294
if ( kfnz == EMPTY ) continue; /* Skip any zero segment */
296
segsze = krep - kfnz + 1;
297
if ( segsze <= 3 ) continue; /* skip unrolled cases */
299
no_zeros = kfnz - fsupc;
300
isub = lptr + no_zeros;
301
for (i = 0; i < segsze; i++) {
303
dense_col[irow] = TriTmp[i];
310
} else { /* 1-D block modification */
313
/* Sequence through each column in the panel */
314
for (jj = jcol; jj < jcol + w; jj++,
315
repfnz_col += m, dense_col += m) {
317
kfnz = repfnz_col[krep];
318
if ( kfnz == EMPTY ) continue; /* Skip any zero segment */
320
segsze = krep - kfnz + 1;
321
luptr = xlusup[fsupc];
323
ops[TRSV] += segsze * (segsze - 1);
324
ops[GEMV] += 2 * nrow * segsze;
326
/* Case 1: Update U-segment of size 1 -- col-col update */
328
ukj = dense_col[lsub[krep_ind]];
329
luptr += nsupr*(nsupc-1) + nsupc;
331
for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) {
333
dense_col[irow] -= ukj * lusup[luptr];
337
} else if ( segsze <= 3 ) {
338
ukj = dense_col[lsub[krep_ind]];
339
luptr += nsupr*(nsupc-1) + nsupc-1;
340
ukj1 = dense_col[lsub[krep_ind - 1]];
341
luptr1 = luptr - nsupr;
344
ukj -= ukj1 * lusup[luptr1];
345
dense_col[lsub[krep_ind]] = ukj;
346
for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
349
dense_col[irow] -= (ukj*lusup[luptr]
350
+ ukj1*lusup[luptr1]);
353
ukj2 = dense_col[lsub[krep_ind - 2]];
354
luptr2 = luptr1 - nsupr;
355
ukj1 -= ukj2 * lusup[luptr2-1];
356
ukj = ukj - ukj1*lusup[luptr1] - ukj2*lusup[luptr2];
357
dense_col[lsub[krep_ind]] = ukj;
358
dense_col[lsub[krep_ind-1]] = ukj1;
359
for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
361
++luptr; ++luptr1; ++luptr2;
362
dense_col[irow] -= ( ukj*lusup[luptr]
363
+ ukj1*lusup[luptr1] + ukj2*lusup[luptr2] );
367
} else { /* segsze >= 4 */
369
* Perform a triangular solve and block update,
370
* then scatter the result of sup-col update to dense[].
372
no_zeros = kfnz - fsupc;
374
/* Copy U[*,j] segment from dense[*] to tempv[*]:
375
* The result of triangular solve is in tempv[*];
376
* The result of matrix vector update is in dense_col[*]
378
isub = lptr + no_zeros;
379
for (i = 0; i < segsze; ++i) {
381
tempv[i] = dense_col[irow]; /* Gather */
385
/* start effective triangle */
386
luptr += nsupr * no_zeros + no_zeros;
388
#ifdef USE_VENDOR_BLAS
390
STRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr],
391
&nsupr, tempv, &incx );
393
strsv_( "L", "N", "U", &segsze, &lusup[luptr],
394
&nsupr, tempv, &incx );
397
luptr += segsze; /* Dense matrix-vector */
398
tempv1 = &tempv[segsze];
402
SGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr],
403
&nsupr, tempv, &incx, &beta, tempv1, &incy );
405
sgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr],
406
&nsupr, tempv, &incx, &beta, tempv1, &incy );
409
slsolve ( nsupr, segsze, &lusup[luptr], tempv );
411
luptr += segsze; /* Dense matrix-vector */
412
tempv1 = &tempv[segsze];
413
smatvec (nsupr, nrow, segsze, &lusup[luptr], tempv, tempv1);
416
/* Scatter tempv[*] into SPA dense[*] temporarily, such
417
* that tempv[*] can be used for the triangular solve of
418
* the next column of the panel. They will be copied into
419
* ucol[*] after the whole panel has been finished.
421
isub = lptr + no_zeros;
422
for (i = 0; i < segsze; i++) {
424
dense_col[irow] = tempv[i];
429
/* Scatter the update from tempv1[*] into SPA dense[*] */
430
/* Start dense rectangular L */
431
for (i = 0; i < nrow; i++) {
433
dense_col[irow] -= tempv1[i];
438
} /* else segsze>=4 ... */
440
} /* for each column in the panel... */
442
} /* else 1-D update ... */
444
} /* for each updating supernode ... */