3
* -- SuperLU routine (version 3.0) --
4
* Univ. of California Berkeley, Xerox Palo Alto Research Center,
5
* and Lawrence Berkeley National Lab.
10
Copyright (c) 1994 by Xerox Corporation. All rights reserved.
12
THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
13
EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK.
15
Permission is hereby granted to use or copy this program for any
16
purpose, provided the above notices are retained on all copies.
17
Permission to modify the code and to distribute modified code is
18
granted, provided the above notices are retained, and a notice that
19
the code was modified is included with the above copyright notice.
29
void slsolve(int, int, float *, float *);
30
void smatvec(int, int, int, float *, float *, float *);
31
extern void scheck_tempv();
35
const int m, /* in - number of rows in the matrix */
37
const int jcol, /* in */
38
const int nseg, /* in */
39
float *dense, /* out, of size n by w */
40
float *tempv, /* working array */
42
int *repfnz, /* in, of size n by w */
43
GlobalLU_t *Glu, /* modified */
44
SuperLUStat_t *stat /* output */
51
* Performs numeric block updates (sup-panel) in topological order.
52
* It features: col-col, 2cols-col, 3cols-col, and sup-col updates.
53
* Special processing on the supernodal portion of L\U[*,j]
55
* Before entering this routine, the original nonzeros in the panel
56
* were already copied into the spa[m,w].
58
* Updated/Output parameters-
59
* dense[0:m-1,w]: L[*,j:j+w-1] and U[*,j:j+w-1] are returned
60
* collectively in the m-by-w vector dense[*].
64
#ifdef USE_VENDOR_BLAS
66
_fcd ftcs1 = _cptofcd("L", strlen("L")),
67
ftcs2 = _cptofcd("N", strlen("N")),
68
ftcs3 = _cptofcd("U", strlen("U"));
70
int incx = 1, incy = 1;
75
int fsupc, nsupc, nsupr, nrow;
77
float ukj, ukj1, ukj2;
78
int luptr, luptr1, luptr2;
80
int block_nrow; /* no of rows in a block row */
81
register int lptr; /* Points to the row subscripts of a supernode */
82
int kfnz, irow, no_zeros;
83
register int isub, isub1, i;
84
register int jj; /* Index through each column in the panel */
89
int *repfnz_col; /* repfnz[] for a column in the panel */
90
float *dense_col; /* dense[] for a column in the panel */
91
float *tempv1; /* Used in 1-D update */
92
float *TriTmp, *MatvecTmp; /* used in 2-D update */
96
register int r_ind, r_hi;
97
static int first = 1, maxsuper, rowblk, colblk;
98
flops_t *ops = stat->ops;
105
xlusup = Glu->xlusup;
108
maxsuper = sp_ienv(3);
113
ldaTmp = maxsuper + rowblk;
116
* For each nonz supernode segment of U[*,j] in topological order
119
for (ksub = 0; ksub < nseg; ksub++) { /* for each updating supernode */
121
/* krep = representative of current k-th supernode
122
* fsupc = first supernodal column
123
* nsupc = no of columns in a supernode
124
* nsupr = no of rows in a supernode
127
fsupc = xsup[supno[krep]];
128
nsupc = krep - fsupc + 1;
129
nsupr = xlsub[fsupc+1] - xlsub[fsupc];
130
nrow = nsupr - nsupc;
132
krep_ind = lptr + nsupc - 1;
137
if ( nsupc >= colblk && nrow > rowblk ) { /* 2-D block update */
141
/* Sequence through each column in panel -- triangular solves */
142
for (jj = jcol; jj < jcol + w; jj++,
143
repfnz_col += m, dense_col += m, TriTmp += ldaTmp ) {
145
kfnz = repfnz_col[krep];
146
if ( kfnz == EMPTY ) continue; /* Skip any zero segment */
148
segsze = krep - kfnz + 1;
149
luptr = xlusup[fsupc];
151
ops[TRSV] += segsze * (segsze - 1);
152
ops[GEMV] += 2 * nrow * segsze;
154
/* Case 1: Update U-segment of size 1 -- col-col update */
156
ukj = dense_col[lsub[krep_ind]];
157
luptr += nsupr*(nsupc-1) + nsupc;
159
for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) {
161
dense_col[irow] -= ukj * lusup[luptr];
165
} else if ( segsze <= 3 ) {
166
ukj = dense_col[lsub[krep_ind]];
167
ukj1 = dense_col[lsub[krep_ind - 1]];
168
luptr += nsupr*(nsupc-1) + nsupc-1;
169
luptr1 = luptr - nsupr;
172
ukj -= ukj1 * lusup[luptr1];
173
dense_col[lsub[krep_ind]] = ukj;
174
for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
177
dense_col[irow] -= (ukj*lusup[luptr]
178
+ ukj1*lusup[luptr1]);
181
ukj2 = dense_col[lsub[krep_ind - 2]];
182
luptr2 = luptr1 - nsupr;
183
ukj1 -= ukj2 * lusup[luptr2-1];
184
ukj = ukj - ukj1*lusup[luptr1] - ukj2*lusup[luptr2];
185
dense_col[lsub[krep_ind]] = ukj;
186
dense_col[lsub[krep_ind-1]] = ukj1;
187
for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
189
luptr++; luptr1++; luptr2++;
190
dense_col[irow] -= ( ukj*lusup[luptr]
191
+ ukj1*lusup[luptr1] + ukj2*lusup[luptr2] );
195
} else { /* segsze >= 4 */
197
/* Copy U[*,j] segment from dense[*] to TriTmp[*], which
198
holds the result of triangular solves. */
199
no_zeros = kfnz - fsupc;
200
isub = lptr + no_zeros;
201
for (i = 0; i < segsze; ++i) {
203
TriTmp[i] = dense_col[irow]; /* Gather */
207
/* start effective triangle */
208
luptr += nsupr * no_zeros + no_zeros;
210
#ifdef USE_VENDOR_BLAS
212
STRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr],
213
&nsupr, TriTmp, &incx );
215
strsv_( "L", "N", "U", &segsze, &lusup[luptr],
216
&nsupr, TriTmp, &incx );
219
slsolve ( nsupr, segsze, &lusup[luptr], TriTmp );
225
} /* for jj ... end tri-solves */
227
/* Block row updates; push all the way into dense[*] block */
228
for ( r_ind = 0; r_ind < nrow; r_ind += rowblk ) {
230
r_hi = SUPERLU_MIN(nrow, r_ind + rowblk);
231
block_nrow = SUPERLU_MIN(rowblk, r_hi - r_ind);
232
luptr = xlusup[fsupc] + nsupc + r_ind;
233
isub1 = lptr + nsupc + r_ind;
239
/* Sequence through each column in panel -- matrix-vector */
240
for (jj = jcol; jj < jcol + w; jj++,
241
repfnz_col += m, dense_col += m, TriTmp += ldaTmp) {
243
kfnz = repfnz_col[krep];
244
if ( kfnz == EMPTY ) continue; /* Skip any zero segment */
246
segsze = krep - kfnz + 1;
247
if ( segsze <= 3 ) continue; /* skip unrolled cases */
249
/* Perform a block update, and scatter the result of
250
matrix-vector to dense[]. */
251
no_zeros = kfnz - fsupc;
252
luptr1 = luptr + nsupr * no_zeros;
253
MatvecTmp = &TriTmp[maxsuper];
255
#ifdef USE_VENDOR_BLAS
259
SGEMV(ftcs2, &block_nrow, &segsze, &alpha, &lusup[luptr1],
260
&nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy);
262
sgemv_("N", &block_nrow, &segsze, &alpha, &lusup[luptr1],
263
&nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy);
266
smatvec(nsupr, block_nrow, segsze, &lusup[luptr1],
270
/* Scatter MatvecTmp[*] into SPA dense[*] temporarily
271
* such that MatvecTmp[*] can be re-used for the
272
* the next blok row update. dense[] will be copied into
273
* global store after the whole panel has been finished.
276
for (i = 0; i < block_nrow; i++) {
278
dense_col[irow] -= MatvecTmp[i];
285
} /* for each block row ... */
287
/* Scatter the triangular solves into SPA dense[*] */
292
for (jj = jcol; jj < jcol + w; jj++,
293
repfnz_col += m, dense_col += m, TriTmp += ldaTmp) {
294
kfnz = repfnz_col[krep];
295
if ( kfnz == EMPTY ) continue; /* Skip any zero segment */
297
segsze = krep - kfnz + 1;
298
if ( segsze <= 3 ) continue; /* skip unrolled cases */
300
no_zeros = kfnz - fsupc;
301
isub = lptr + no_zeros;
302
for (i = 0; i < segsze; i++) {
304
dense_col[irow] = TriTmp[i];
311
} else { /* 1-D block modification */
314
/* Sequence through each column in the panel */
315
for (jj = jcol; jj < jcol + w; jj++,
316
repfnz_col += m, dense_col += m) {
318
kfnz = repfnz_col[krep];
319
if ( kfnz == EMPTY ) continue; /* Skip any zero segment */
321
segsze = krep - kfnz + 1;
322
luptr = xlusup[fsupc];
324
ops[TRSV] += segsze * (segsze - 1);
325
ops[GEMV] += 2 * nrow * segsze;
327
/* Case 1: Update U-segment of size 1 -- col-col update */
329
ukj = dense_col[lsub[krep_ind]];
330
luptr += nsupr*(nsupc-1) + nsupc;
332
for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) {
334
dense_col[irow] -= ukj * lusup[luptr];
338
} else if ( segsze <= 3 ) {
339
ukj = dense_col[lsub[krep_ind]];
340
luptr += nsupr*(nsupc-1) + nsupc-1;
341
ukj1 = dense_col[lsub[krep_ind - 1]];
342
luptr1 = luptr - nsupr;
345
ukj -= ukj1 * lusup[luptr1];
346
dense_col[lsub[krep_ind]] = ukj;
347
for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
350
dense_col[irow] -= (ukj*lusup[luptr]
351
+ ukj1*lusup[luptr1]);
354
ukj2 = dense_col[lsub[krep_ind - 2]];
355
luptr2 = luptr1 - nsupr;
356
ukj1 -= ukj2 * lusup[luptr2-1];
357
ukj = ukj - ukj1*lusup[luptr1] - ukj2*lusup[luptr2];
358
dense_col[lsub[krep_ind]] = ukj;
359
dense_col[lsub[krep_ind-1]] = ukj1;
360
for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
362
++luptr; ++luptr1; ++luptr2;
363
dense_col[irow] -= ( ukj*lusup[luptr]
364
+ ukj1*lusup[luptr1] + ukj2*lusup[luptr2] );
368
} else { /* segsze >= 4 */
370
* Perform a triangular solve and block update,
371
* then scatter the result of sup-col update to dense[].
373
no_zeros = kfnz - fsupc;
375
/* Copy U[*,j] segment from dense[*] to tempv[*]:
376
* The result of triangular solve is in tempv[*];
377
* The result of matrix vector update is in dense_col[*]
379
isub = lptr + no_zeros;
380
for (i = 0; i < segsze; ++i) {
382
tempv[i] = dense_col[irow]; /* Gather */
386
/* start effective triangle */
387
luptr += nsupr * no_zeros + no_zeros;
389
#ifdef USE_VENDOR_BLAS
391
STRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr],
392
&nsupr, tempv, &incx );
394
strsv_( "L", "N", "U", &segsze, &lusup[luptr],
395
&nsupr, tempv, &incx );
398
luptr += segsze; /* Dense matrix-vector */
399
tempv1 = &tempv[segsze];
403
SGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr],
404
&nsupr, tempv, &incx, &beta, tempv1, &incy );
406
sgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr],
407
&nsupr, tempv, &incx, &beta, tempv1, &incy );
410
slsolve ( nsupr, segsze, &lusup[luptr], tempv );
412
luptr += segsze; /* Dense matrix-vector */
413
tempv1 = &tempv[segsze];
414
smatvec (nsupr, nrow, segsze, &lusup[luptr], tempv, tempv1);
417
/* Scatter tempv[*] into SPA dense[*] temporarily, such
418
* that tempv[*] can be used for the triangular solve of
419
* the next column of the panel. They will be copied into
420
* ucol[*] after the whole panel has been finished.
422
isub = lptr + no_zeros;
423
for (i = 0; i < segsze; i++) {
425
dense_col[irow] = tempv[i];
430
/* Scatter the update from tempv1[*] into SPA dense[*] */
431
/* Start dense rectangular L */
432
for (i = 0; i < nrow; i++) {
434
dense_col[irow] -= tempv1[i];
439
} /* else segsze>=4 ... */
441
} /* for each column in the panel... */
443
} /* else 1-D update ... */
445
} /* for each updating supernode ... */