4
* -- SuperLU routine (version 2.0) --
5
* Univ. of California Berkeley, Xerox Palo Alto Research Center,
6
* and Lawrence Berkeley National Lab.
11
Copyright (c) 1994 by Xerox Corporation. All rights reserved.
13
THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
14
EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK.
16
Permission is hereby granted to use or copy this program for any
17
purpose, provided the above notices are retained on all copies.
18
Permission to modify the code and to distribute modified code is
19
granted, provided the above notices are retained, and a notice that
20
the code was modified is included with the above copyright notice.
31
void dlsolve(int, int, double *, double *);
32
void dmatvec(int, int, int, double *, double *, double *);
33
extern void dcheck_tempv();
37
const int m, /* in - number of rows in the matrix */
39
const int jcol, /* in */
40
const int nseg, /* in */
41
double *dense, /* out, of size n by w */
42
double *tempv, /* working array */
44
int *repfnz, /* in, of size n by w */
45
GlobalLU_t *Glu /* modified */
52
* Performs numeric block updates (sup-panel) in topological order.
53
* It features: col-col, 2cols-col, 3cols-col, and sup-col updates.
54
* Special processing on the supernodal portion of L\U[*,j]
56
* Before entering this routine, the original nonzeros in the panel
57
* were already copied into the spa[m,w].
59
* Updated/Output parameters-
60
* dense[0:m-1,w]: L[*,j:j+w-1] and U[*,j:j+w-1] are returned
61
* collectively in the m-by-w vector dense[*].
65
#ifdef USE_VENDOR_BLAS
67
_fcd ftcs1 = _cptofcd("L", strlen("L")),
68
ftcs2 = _cptofcd("N", strlen("N")),
69
ftcs3 = _cptofcd("U", strlen("U"));
71
int incx = 1, incy = 1;
76
int fsupc, nsupc, nsupr, nrow;
78
double ukj, ukj1, ukj2;
79
int luptr, luptr1, luptr2;
81
int block_nrow; /* no of rows in a block row */
82
register int lptr; /* Points to the row subscripts of a supernode */
83
int kfnz, irow, no_zeros;
84
register int isub, isub1, i;
85
register int jj; /* Index through each column in the panel */
90
int *repfnz_col; /* repfnz[] for a column in the panel */
91
double *dense_col; /* dense[] for a column in the panel */
92
double *tempv1; /* Used in 1-D update */
93
double *TriTmp, *MatvecTmp; /* used in 2-D update */
97
register int r_ind, r_hi;
98
static int first = 1, maxsuper, rowblk, colblk;
99
extern SuperLUStat_t SuperLUStat;
100
flops_t *ops = SuperLUStat.ops;
107
xlusup = Glu->xlusup;
110
maxsuper = sp_ienv(3);
115
ldaTmp = maxsuper + rowblk;
118
* For each nonz supernode segment of U[*,j] in topological order
121
for (ksub = 0; ksub < nseg; ksub++) { /* for each updating supernode */
123
/* krep = representative of current k-th supernode
124
* fsupc = first supernodal column
125
* nsupc = no of columns in a supernode
126
* nsupr = no of rows in a supernode
129
fsupc = xsup[supno[krep]];
130
nsupc = krep - fsupc + 1;
131
nsupr = xlsub[fsupc+1] - xlsub[fsupc];
132
nrow = nsupr - nsupc;
134
krep_ind = lptr + nsupc - 1;
139
if ( nsupc >= colblk && nrow > rowblk ) { /* 2-D block update */
143
/* Sequence through each column in panel -- triangular solves */
144
for (jj = jcol; jj < jcol + w; jj++,
145
repfnz_col += m, dense_col += m, TriTmp += ldaTmp ) {
147
kfnz = repfnz_col[krep];
148
if ( kfnz == EMPTY ) continue; /* Skip any zero segment */
150
segsze = krep - kfnz + 1;
151
luptr = xlusup[fsupc];
153
ops[TRSV] += segsze * (segsze - 1);
154
ops[GEMV] += 2 * nrow * segsze;
156
/* Case 1: Update U-segment of size 1 -- col-col update */
158
ukj = dense_col[lsub[krep_ind]];
159
luptr += nsupr*(nsupc-1) + nsupc;
161
for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) {
163
dense_col[irow] -= ukj * lusup[luptr];
167
} else if ( segsze <= 3 ) {
168
ukj = dense_col[lsub[krep_ind]];
169
ukj1 = dense_col[lsub[krep_ind - 1]];
170
luptr += nsupr*(nsupc-1) + nsupc-1;
171
luptr1 = luptr - nsupr;
174
ukj -= ukj1 * lusup[luptr1];
175
dense_col[lsub[krep_ind]] = ukj;
176
for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
179
dense_col[irow] -= (ukj*lusup[luptr]
180
+ ukj1*lusup[luptr1]);
183
ukj2 = dense_col[lsub[krep_ind - 2]];
184
luptr2 = luptr1 - nsupr;
185
ukj1 -= ukj2 * lusup[luptr2-1];
186
ukj = ukj - ukj1*lusup[luptr1] - ukj2*lusup[luptr2];
187
dense_col[lsub[krep_ind]] = ukj;
188
dense_col[lsub[krep_ind-1]] = ukj1;
189
for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
191
luptr++; luptr1++; luptr2++;
192
dense_col[irow] -= ( ukj*lusup[luptr]
193
+ ukj1*lusup[luptr1] + ukj2*lusup[luptr2] );
197
} else { /* segsze >= 4 */
199
/* Copy U[*,j] segment from dense[*] to TriTmp[*], which
200
holds the result of triangular solves. */
201
no_zeros = kfnz - fsupc;
202
isub = lptr + no_zeros;
203
for (i = 0; i < segsze; ++i) {
205
TriTmp[i] = dense_col[irow]; /* Gather */
209
/* start effective triangle */
210
luptr += nsupr * no_zeros + no_zeros;
212
#ifdef USE_VENDOR_BLAS
214
STRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr],
215
&nsupr, TriTmp, &incx );
217
dtrsv_( "L", "N", "U", &segsze, &lusup[luptr],
218
&nsupr, TriTmp, &incx );
221
dlsolve ( nsupr, segsze, &lusup[luptr], TriTmp );
227
} /* for jj ... end tri-solves */
229
/* Block row updates; push all the way into dense[*] block */
230
for ( r_ind = 0; r_ind < nrow; r_ind += rowblk ) {
232
r_hi = SUPERLU_MIN(nrow, r_ind + rowblk);
233
block_nrow = SUPERLU_MIN(rowblk, r_hi - r_ind);
234
luptr = xlusup[fsupc] + nsupc + r_ind;
235
isub1 = lptr + nsupc + r_ind;
241
/* Sequence through each column in panel -- matrix-vector */
242
for (jj = jcol; jj < jcol + w; jj++,
243
repfnz_col += m, dense_col += m, TriTmp += ldaTmp) {
245
kfnz = repfnz_col[krep];
246
if ( kfnz == EMPTY ) continue; /* Skip any zero segment */
248
segsze = krep - kfnz + 1;
249
if ( segsze <= 3 ) continue; /* skip unrolled cases */
251
/* Perform a block update, and scatter the result of
252
matrix-vector to dense[]. */
253
no_zeros = kfnz - fsupc;
254
luptr1 = luptr + nsupr * no_zeros;
255
MatvecTmp = &TriTmp[maxsuper];
257
#ifdef USE_VENDOR_BLAS
261
SGEMV(ftcs2, &block_nrow, &segsze, &alpha, &lusup[luptr1],
262
&nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy);
264
dgemv_("N", &block_nrow, &segsze, &alpha, &lusup[luptr1],
265
&nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy);
268
dmatvec(nsupr, block_nrow, segsze, &lusup[luptr1],
272
/* Scatter MatvecTmp[*] into SPA dense[*] temporarily
273
* such that MatvecTmp[*] can be re-used for the
274
* the next blok row update. dense[] will be copied into
275
* global store after the whole panel has been finished.
278
for (i = 0; i < block_nrow; i++) {
280
dense_col[irow] -= MatvecTmp[i];
287
} /* for each block row ... */
289
/* Scatter the triangular solves into SPA dense[*] */
294
for (jj = jcol; jj < jcol + w; jj++,
295
repfnz_col += m, dense_col += m, TriTmp += ldaTmp) {
296
kfnz = repfnz_col[krep];
297
if ( kfnz == EMPTY ) continue; /* Skip any zero segment */
299
segsze = krep - kfnz + 1;
300
if ( segsze <= 3 ) continue; /* skip unrolled cases */
302
no_zeros = kfnz - fsupc;
303
isub = lptr + no_zeros;
304
for (i = 0; i < segsze; i++) {
306
dense_col[irow] = TriTmp[i];
313
} else { /* 1-D block modification */
316
/* Sequence through each column in the panel */
317
for (jj = jcol; jj < jcol + w; jj++,
318
repfnz_col += m, dense_col += m) {
320
kfnz = repfnz_col[krep];
321
if ( kfnz == EMPTY ) continue; /* Skip any zero segment */
323
segsze = krep - kfnz + 1;
324
luptr = xlusup[fsupc];
326
ops[TRSV] += segsze * (segsze - 1);
327
ops[GEMV] += 2 * nrow * segsze;
329
/* Case 1: Update U-segment of size 1 -- col-col update */
331
ukj = dense_col[lsub[krep_ind]];
332
luptr += nsupr*(nsupc-1) + nsupc;
334
for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) {
336
dense_col[irow] -= ukj * lusup[luptr];
340
} else if ( segsze <= 3 ) {
341
ukj = dense_col[lsub[krep_ind]];
342
luptr += nsupr*(nsupc-1) + nsupc-1;
343
ukj1 = dense_col[lsub[krep_ind - 1]];
344
luptr1 = luptr - nsupr;
347
ukj -= ukj1 * lusup[luptr1];
348
dense_col[lsub[krep_ind]] = ukj;
349
for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
352
dense_col[irow] -= (ukj*lusup[luptr]
353
+ ukj1*lusup[luptr1]);
356
ukj2 = dense_col[lsub[krep_ind - 2]];
357
luptr2 = luptr1 - nsupr;
358
ukj1 -= ukj2 * lusup[luptr2-1];
359
ukj = ukj - ukj1*lusup[luptr1] - ukj2*lusup[luptr2];
360
dense_col[lsub[krep_ind]] = ukj;
361
dense_col[lsub[krep_ind-1]] = ukj1;
362
for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
364
++luptr; ++luptr1; ++luptr2;
365
dense_col[irow] -= ( ukj*lusup[luptr]
366
+ ukj1*lusup[luptr1] + ukj2*lusup[luptr2] );
370
} else { /* segsze >= 4 */
372
* Perform a triangular solve and block update,
373
* then scatter the result of sup-col update to dense[].
375
no_zeros = kfnz - fsupc;
377
/* Copy U[*,j] segment from dense[*] to tempv[*]:
378
* The result of triangular solve is in tempv[*];
379
* The result of matrix vector update is in dense_col[*]
381
isub = lptr + no_zeros;
382
for (i = 0; i < segsze; ++i) {
384
tempv[i] = dense_col[irow]; /* Gather */
388
/* start effective triangle */
389
luptr += nsupr * no_zeros + no_zeros;
391
#ifdef USE_VENDOR_BLAS
393
STRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr],
394
&nsupr, tempv, &incx );
396
dtrsv_( "L", "N", "U", &segsze, &lusup[luptr],
397
&nsupr, tempv, &incx );
400
luptr += segsze; /* Dense matrix-vector */
401
tempv1 = &tempv[segsze];
405
SGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr],
406
&nsupr, tempv, &incx, &beta, tempv1, &incy );
408
dgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr],
409
&nsupr, tempv, &incx, &beta, tempv1, &incy );
412
dlsolve ( nsupr, segsze, &lusup[luptr], tempv );
414
luptr += segsze; /* Dense matrix-vector */
415
tempv1 = &tempv[segsze];
416
dmatvec (nsupr, nrow, segsze, &lusup[luptr], tempv, tempv1);
419
/* Scatter tempv[*] into SPA dense[*] temporarily, such
420
* that tempv[*] can be used for the triangular solve of
421
* the next column of the panel. They will be copied into
422
* ucol[*] after the whole panel has been finished.
424
isub = lptr + no_zeros;
425
for (i = 0; i < segsze; i++) {
427
dense_col[irow] = tempv[i];
432
/* Scatter the update from tempv1[*] into SPA dense[*] */
433
/* Start dense rectangular L */
434
for (i = 0; i < nrow; i++) {
436
dense_col[irow] -= tempv1[i];
441
} /* else segsze>=4 ... */
443
} /* for each column in the panel... */
445
} /* else 1-D update ... */
447
} /* for each updating supernode ... */