3
* -- SuperLU routine (version 3.0) --
4
* Univ. of California Berkeley, Xerox Palo Alto Research Center,
5
* and Lawrence Berkeley National Lab.
10
Copyright (c) 1994 by Xerox Corporation. All rights reserved.
12
THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
13
EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK.
15
Permission is hereby granted to use or copy this program for any
16
purpose, provided the above notices are retained on all copies.
17
Permission to modify the code and to distribute modified code is
18
granted, provided the above notices are retained, and a notice that
19
the code was modified is included with the above copyright notice.
29
void clsolve(int, int, complex *, complex *);
30
void cmatvec(int, int, int, complex *, complex *, complex *);
31
extern void ccheck_tempv();
35
const int m, /* in - number of rows in the matrix */
37
const int jcol, /* in */
38
const int nseg, /* in */
39
complex *dense, /* out, of size n by w */
40
complex *tempv, /* working array */
42
int *repfnz, /* in, of size n by w */
43
GlobalLU_t *Glu, /* modified */
44
SuperLUStat_t *stat /* output */
51
* Performs numeric block updates (sup-panel) in topological order.
52
* It features: col-col, 2cols-col, 3cols-col, and sup-col updates.
53
* Special processing on the supernodal portion of L\U[*,j]
55
* Before entering this routine, the original nonzeros in the panel
56
* were already copied into the spa[m,w].
58
* Updated/Output parameters-
59
* dense[0:m-1,w]: L[*,j:j+w-1] and U[*,j:j+w-1] are returned
60
* collectively in the m-by-w vector dense[*].
64
#ifdef USE_VENDOR_BLAS
66
_fcd ftcs1 = _cptofcd("L", strlen("L")),
67
ftcs2 = _cptofcd("N", strlen("N")),
68
ftcs3 = _cptofcd("U", strlen("U"));
70
int incx = 1, incy = 1;
75
int fsupc, nsupc, nsupr, nrow;
77
complex ukj, ukj1, ukj2;
78
int luptr, luptr1, luptr2;
80
int block_nrow; /* no of rows in a block row */
81
register int lptr; /* Points to the row subscripts of a supernode */
82
int kfnz, irow, no_zeros;
83
register int isub, isub1, i;
84
register int jj; /* Index through each column in the panel */
89
int *repfnz_col; /* repfnz[] for a column in the panel */
90
complex *dense_col; /* dense[] for a column in the panel */
91
complex *tempv1; /* Used in 1-D update */
92
complex *TriTmp, *MatvecTmp; /* used in 2-D update */
93
complex zero = {0.0, 0.0};
94
complex one = {1.0, 0.0};
95
complex comp_temp, comp_temp1;
97
register int r_ind, r_hi;
98
static int first = 1, maxsuper, rowblk, colblk;
99
flops_t *ops = stat->ops;
106
xlusup = Glu->xlusup;
109
maxsuper = sp_ienv(3);
114
ldaTmp = maxsuper + rowblk;
117
* For each nonz supernode segment of U[*,j] in topological order
120
for (ksub = 0; ksub < nseg; ksub++) { /* for each updating supernode */
122
/* krep = representative of current k-th supernode
123
* fsupc = first supernodal column
124
* nsupc = no of columns in a supernode
125
* nsupr = no of rows in a supernode
128
fsupc = xsup[supno[krep]];
129
nsupc = krep - fsupc + 1;
130
nsupr = xlsub[fsupc+1] - xlsub[fsupc];
131
nrow = nsupr - nsupc;
133
krep_ind = lptr + nsupc - 1;
138
if ( nsupc >= colblk && nrow > rowblk ) { /* 2-D block update */
142
/* Sequence through each column in panel -- triangular solves */
143
for (jj = jcol; jj < jcol + w; jj++,
144
repfnz_col += m, dense_col += m, TriTmp += ldaTmp ) {
146
kfnz = repfnz_col[krep];
147
if ( kfnz == EMPTY ) continue; /* Skip any zero segment */
149
segsze = krep - kfnz + 1;
150
luptr = xlusup[fsupc];
152
ops[TRSV] += 4 * segsze * (segsze - 1);
153
ops[GEMV] += 8 * nrow * segsze;
155
/* Case 1: Update U-segment of size 1 -- col-col update */
157
ukj = dense_col[lsub[krep_ind]];
158
luptr += nsupr*(nsupc-1) + nsupc;
160
for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) {
162
cc_mult(&comp_temp, &ukj, &lusup[luptr]);
163
c_sub(&dense_col[irow], &dense_col[irow], &comp_temp);
167
} else if ( segsze <= 3 ) {
168
ukj = dense_col[lsub[krep_ind]];
169
ukj1 = dense_col[lsub[krep_ind - 1]];
170
luptr += nsupr*(nsupc-1) + nsupc-1;
171
luptr1 = luptr - nsupr;
174
cc_mult(&comp_temp, &ukj1, &lusup[luptr1]);
175
c_sub(&ukj, &ukj, &comp_temp);
176
dense_col[lsub[krep_ind]] = ukj;
177
for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
180
cc_mult(&comp_temp, &ukj, &lusup[luptr]);
181
cc_mult(&comp_temp1, &ukj1, &lusup[luptr1]);
182
c_add(&comp_temp, &comp_temp, &comp_temp1);
183
c_sub(&dense_col[irow], &dense_col[irow], &comp_temp);
186
ukj2 = dense_col[lsub[krep_ind - 2]];
187
luptr2 = luptr1 - nsupr;
188
cc_mult(&comp_temp, &ukj2, &lusup[luptr2-1]);
189
c_sub(&ukj1, &ukj1, &comp_temp);
191
cc_mult(&comp_temp, &ukj1, &lusup[luptr1]);
192
cc_mult(&comp_temp1, &ukj2, &lusup[luptr2]);
193
c_add(&comp_temp, &comp_temp, &comp_temp1);
194
c_sub(&ukj, &ukj, &comp_temp);
195
dense_col[lsub[krep_ind]] = ukj;
196
dense_col[lsub[krep_ind-1]] = ukj1;
197
for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
199
luptr++; luptr1++; luptr2++;
200
cc_mult(&comp_temp, &ukj, &lusup[luptr]);
201
cc_mult(&comp_temp1, &ukj1, &lusup[luptr1]);
202
c_add(&comp_temp, &comp_temp, &comp_temp1);
203
cc_mult(&comp_temp1, &ukj2, &lusup[luptr2]);
204
c_add(&comp_temp, &comp_temp, &comp_temp1);
205
c_sub(&dense_col[irow], &dense_col[irow], &comp_temp);
209
} else { /* segsze >= 4 */
211
/* Copy U[*,j] segment from dense[*] to TriTmp[*], which
212
holds the result of triangular solves. */
213
no_zeros = kfnz - fsupc;
214
isub = lptr + no_zeros;
215
for (i = 0; i < segsze; ++i) {
217
TriTmp[i] = dense_col[irow]; /* Gather */
221
/* start effective triangle */
222
luptr += nsupr * no_zeros + no_zeros;
224
#ifdef USE_VENDOR_BLAS
226
CTRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr],
227
&nsupr, TriTmp, &incx );
229
ctrsv_( "L", "N", "U", &segsze, &lusup[luptr],
230
&nsupr, TriTmp, &incx );
233
clsolve ( nsupr, segsze, &lusup[luptr], TriTmp );
239
} /* for jj ... end tri-solves */
241
/* Block row updates; push all the way into dense[*] block */
242
for ( r_ind = 0; r_ind < nrow; r_ind += rowblk ) {
244
r_hi = SUPERLU_MIN(nrow, r_ind + rowblk);
245
block_nrow = SUPERLU_MIN(rowblk, r_hi - r_ind);
246
luptr = xlusup[fsupc] + nsupc + r_ind;
247
isub1 = lptr + nsupc + r_ind;
253
/* Sequence through each column in panel -- matrix-vector */
254
for (jj = jcol; jj < jcol + w; jj++,
255
repfnz_col += m, dense_col += m, TriTmp += ldaTmp) {
257
kfnz = repfnz_col[krep];
258
if ( kfnz == EMPTY ) continue; /* Skip any zero segment */
260
segsze = krep - kfnz + 1;
261
if ( segsze <= 3 ) continue; /* skip unrolled cases */
263
/* Perform a block update, and scatter the result of
264
matrix-vector to dense[]. */
265
no_zeros = kfnz - fsupc;
266
luptr1 = luptr + nsupr * no_zeros;
267
MatvecTmp = &TriTmp[maxsuper];
269
#ifdef USE_VENDOR_BLAS
273
CGEMV(ftcs2, &block_nrow, &segsze, &alpha, &lusup[luptr1],
274
&nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy);
276
cgemv_("N", &block_nrow, &segsze, &alpha, &lusup[luptr1],
277
&nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy);
280
cmatvec(nsupr, block_nrow, segsze, &lusup[luptr1],
284
/* Scatter MatvecTmp[*] into SPA dense[*] temporarily
285
* such that MatvecTmp[*] can be re-used for the
286
* the next blok row update. dense[] will be copied into
287
* global store after the whole panel has been finished.
290
for (i = 0; i < block_nrow; i++) {
292
c_sub(&dense_col[irow], &dense_col[irow],
300
} /* for each block row ... */
302
/* Scatter the triangular solves into SPA dense[*] */
307
for (jj = jcol; jj < jcol + w; jj++,
308
repfnz_col += m, dense_col += m, TriTmp += ldaTmp) {
309
kfnz = repfnz_col[krep];
310
if ( kfnz == EMPTY ) continue; /* Skip any zero segment */
312
segsze = krep - kfnz + 1;
313
if ( segsze <= 3 ) continue; /* skip unrolled cases */
315
no_zeros = kfnz - fsupc;
316
isub = lptr + no_zeros;
317
for (i = 0; i < segsze; i++) {
319
dense_col[irow] = TriTmp[i];
326
} else { /* 1-D block modification */
329
/* Sequence through each column in the panel */
330
for (jj = jcol; jj < jcol + w; jj++,
331
repfnz_col += m, dense_col += m) {
333
kfnz = repfnz_col[krep];
334
if ( kfnz == EMPTY ) continue; /* Skip any zero segment */
336
segsze = krep - kfnz + 1;
337
luptr = xlusup[fsupc];
339
ops[TRSV] += 4 * segsze * (segsze - 1);
340
ops[GEMV] += 8 * nrow * segsze;
342
/* Case 1: Update U-segment of size 1 -- col-col update */
344
ukj = dense_col[lsub[krep_ind]];
345
luptr += nsupr*(nsupc-1) + nsupc;
347
for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) {
349
cc_mult(&comp_temp, &ukj, &lusup[luptr]);
350
c_sub(&dense_col[irow], &dense_col[irow], &comp_temp);
354
} else if ( segsze <= 3 ) {
355
ukj = dense_col[lsub[krep_ind]];
356
luptr += nsupr*(nsupc-1) + nsupc-1;
357
ukj1 = dense_col[lsub[krep_ind - 1]];
358
luptr1 = luptr - nsupr;
361
cc_mult(&comp_temp, &ukj1, &lusup[luptr1]);
362
c_sub(&ukj, &ukj, &comp_temp);
363
dense_col[lsub[krep_ind]] = ukj;
364
for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
367
cc_mult(&comp_temp, &ukj, &lusup[luptr]);
368
cc_mult(&comp_temp1, &ukj1, &lusup[luptr1]);
369
c_add(&comp_temp, &comp_temp, &comp_temp1);
370
c_sub(&dense_col[irow], &dense_col[irow], &comp_temp);
373
ukj2 = dense_col[lsub[krep_ind - 2]];
374
luptr2 = luptr1 - nsupr;
375
cc_mult(&comp_temp, &ukj2, &lusup[luptr2-1]);
376
c_sub(&ukj1, &ukj1, &comp_temp);
378
cc_mult(&comp_temp, &ukj1, &lusup[luptr1]);
379
cc_mult(&comp_temp1, &ukj2, &lusup[luptr2]);
380
c_add(&comp_temp, &comp_temp, &comp_temp1);
381
c_sub(&ukj, &ukj, &comp_temp);
382
dense_col[lsub[krep_ind]] = ukj;
383
dense_col[lsub[krep_ind-1]] = ukj1;
384
for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
386
++luptr; ++luptr1; ++luptr2;
387
cc_mult(&comp_temp, &ukj, &lusup[luptr]);
388
cc_mult(&comp_temp1, &ukj1, &lusup[luptr1]);
389
c_add(&comp_temp, &comp_temp, &comp_temp1);
390
cc_mult(&comp_temp1, &ukj2, &lusup[luptr2]);
391
c_add(&comp_temp, &comp_temp, &comp_temp1);
392
c_sub(&dense_col[irow], &dense_col[irow], &comp_temp);
396
} else { /* segsze >= 4 */
398
* Perform a triangular solve and block update,
399
* then scatter the result of sup-col update to dense[].
401
no_zeros = kfnz - fsupc;
403
/* Copy U[*,j] segment from dense[*] to tempv[*]:
404
* The result of triangular solve is in tempv[*];
405
* The result of matrix vector update is in dense_col[*]
407
isub = lptr + no_zeros;
408
for (i = 0; i < segsze; ++i) {
410
tempv[i] = dense_col[irow]; /* Gather */
414
/* start effective triangle */
415
luptr += nsupr * no_zeros + no_zeros;
417
#ifdef USE_VENDOR_BLAS
419
CTRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr],
420
&nsupr, tempv, &incx );
422
ctrsv_( "L", "N", "U", &segsze, &lusup[luptr],
423
&nsupr, tempv, &incx );
426
luptr += segsze; /* Dense matrix-vector */
427
tempv1 = &tempv[segsze];
431
CGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr],
432
&nsupr, tempv, &incx, &beta, tempv1, &incy );
434
cgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr],
435
&nsupr, tempv, &incx, &beta, tempv1, &incy );
438
clsolve ( nsupr, segsze, &lusup[luptr], tempv );
440
luptr += segsze; /* Dense matrix-vector */
441
tempv1 = &tempv[segsze];
442
cmatvec (nsupr, nrow, segsze, &lusup[luptr], tempv, tempv1);
445
/* Scatter tempv[*] into SPA dense[*] temporarily, such
446
* that tempv[*] can be used for the triangular solve of
447
* the next column of the panel. They will be copied into
448
* ucol[*] after the whole panel has been finished.
450
isub = lptr + no_zeros;
451
for (i = 0; i < segsze; i++) {
453
dense_col[irow] = tempv[i];
458
/* Scatter the update from tempv1[*] into SPA dense[*] */
459
/* Start dense rectangular L */
460
for (i = 0; i < nrow; i++) {
462
c_sub(&dense_col[irow], &dense_col[irow], &tempv1[i]);
467
} /* else segsze>=4 ... */
469
} /* for each column in the panel... */
471
} /* else 1-D update ... */
473
} /* for each updating supernode ... */