3
* -- SuperLU routine (version 3.0) --
4
* Univ. of California Berkeley, Xerox Palo Alto Research Center,
5
* and Lawrence Berkeley National Lab.
10
Copyright (c) 1994 by Xerox Corporation. All rights reserved.
12
THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
13
EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK.
15
Permission is hereby granted to use or copy this program for any
16
purpose, provided the above notices are retained on all copies.
17
Permission to modify the code and to distribute modified code is
18
granted, provided the above notices are retained, and a notice that
19
the code was modified is included with the above copyright notice.
29
void zlsolve(int, int, doublecomplex *, doublecomplex *);
30
void zmatvec(int, int, int, doublecomplex *, doublecomplex *, doublecomplex *);
31
extern void zcheck_tempv();
35
const int m, /* in - number of rows in the matrix */
37
const int jcol, /* in */
38
const int nseg, /* in */
39
doublecomplex *dense, /* out, of size n by w */
40
doublecomplex *tempv, /* working array */
42
int *repfnz, /* in, of size n by w */
43
GlobalLU_t *Glu, /* modified */
44
SuperLUStat_t *stat /* output */
51
* Performs numeric block updates (sup-panel) in topological order.
52
* It features: col-col, 2cols-col, 3cols-col, and sup-col updates.
53
* Special processing on the supernodal portion of L\U[*,j]
55
* Before entering this routine, the original nonzeros in the panel
56
* were already copied into the spa[m,w].
58
* Updated/Output parameters-
59
* dense[0:m-1,w]: L[*,j:j+w-1] and U[*,j:j+w-1] are returned
60
* collectively in the m-by-w vector dense[*].
64
#ifdef USE_VENDOR_BLAS
66
_fcd ftcs1 = _cptofcd("L", strlen("L")),
67
ftcs2 = _cptofcd("N", strlen("N")),
68
ftcs3 = _cptofcd("U", strlen("U"));
70
int incx = 1, incy = 1;
71
doublecomplex alpha, beta;
75
int fsupc, nsupc, nsupr, nrow;
77
doublecomplex ukj, ukj1, ukj2;
78
int luptr, luptr1, luptr2;
80
int block_nrow; /* no of rows in a block row */
81
register int lptr; /* Points to the row subscripts of a supernode */
82
int kfnz, irow, no_zeros;
83
register int isub, isub1, i;
84
register int jj; /* Index through each column in the panel */
89
int *repfnz_col; /* repfnz[] for a column in the panel */
90
doublecomplex *dense_col; /* dense[] for a column in the panel */
91
doublecomplex *tempv1; /* Used in 1-D update */
92
doublecomplex *TriTmp, *MatvecTmp; /* used in 2-D update */
93
doublecomplex zero = {0.0, 0.0};
94
doublecomplex one = {1.0, 0.0};
95
doublecomplex comp_temp, comp_temp1;
97
register int r_ind, r_hi;
98
static int first = 1, maxsuper, rowblk, colblk;
99
flops_t *ops = stat->ops;
106
xlusup = Glu->xlusup;
109
maxsuper = sp_ienv(3);
114
ldaTmp = maxsuper + rowblk;
117
* For each nonz supernode segment of U[*,j] in topological order
120
for (ksub = 0; ksub < nseg; ksub++) { /* for each updating supernode */
122
/* krep = representative of current k-th supernode
123
* fsupc = first supernodal column
124
* nsupc = no of columns in a supernode
125
* nsupr = no of rows in a supernode
128
fsupc = xsup[supno[krep]];
129
nsupc = krep - fsupc + 1;
130
nsupr = xlsub[fsupc+1] - xlsub[fsupc];
131
nrow = nsupr - nsupc;
133
krep_ind = lptr + nsupc - 1;
138
if ( nsupc >= colblk && nrow > rowblk ) { /* 2-D block update */
142
/* Sequence through each column in panel -- triangular solves */
143
for (jj = jcol; jj < jcol + w; jj++,
144
repfnz_col += m, dense_col += m, TriTmp += ldaTmp ) {
146
kfnz = repfnz_col[krep];
147
if ( kfnz == EMPTY ) continue; /* Skip any zero segment */
149
segsze = krep - kfnz + 1;
150
luptr = xlusup[fsupc];
152
ops[TRSV] += 4 * segsze * (segsze - 1);
153
ops[GEMV] += 8 * nrow * segsze;
155
/* Case 1: Update U-segment of size 1 -- col-col update */
157
ukj = dense_col[lsub[krep_ind]];
158
luptr += nsupr*(nsupc-1) + nsupc;
160
for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) {
162
zz_mult(&comp_temp, &ukj, &lusup[luptr]);
163
z_sub(&dense_col[irow], &dense_col[irow], &comp_temp);
167
} else if ( segsze <= 3 ) {
168
ukj = dense_col[lsub[krep_ind]];
169
ukj1 = dense_col[lsub[krep_ind - 1]];
170
luptr += nsupr*(nsupc-1) + nsupc-1;
171
luptr1 = luptr - nsupr;
174
zz_mult(&comp_temp, &ukj1, &lusup[luptr1]);
175
z_sub(&ukj, &ukj, &comp_temp);
176
dense_col[lsub[krep_ind]] = ukj;
177
for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
180
zz_mult(&comp_temp, &ukj, &lusup[luptr]);
181
zz_mult(&comp_temp1, &ukj1, &lusup[luptr1]);
182
z_add(&comp_temp, &comp_temp, &comp_temp1);
183
z_sub(&dense_col[irow], &dense_col[irow], &comp_temp);
186
ukj2 = dense_col[lsub[krep_ind - 2]];
187
luptr2 = luptr1 - nsupr;
188
zz_mult(&comp_temp, &ukj2, &lusup[luptr2-1]);
189
z_sub(&ukj1, &ukj1, &comp_temp);
191
zz_mult(&comp_temp, &ukj1, &lusup[luptr1]);
192
zz_mult(&comp_temp1, &ukj2, &lusup[luptr2]);
193
z_add(&comp_temp, &comp_temp, &comp_temp1);
194
z_sub(&ukj, &ukj, &comp_temp);
195
dense_col[lsub[krep_ind]] = ukj;
196
dense_col[lsub[krep_ind-1]] = ukj1;
197
for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
199
luptr++; luptr1++; luptr2++;
200
zz_mult(&comp_temp, &ukj, &lusup[luptr]);
201
zz_mult(&comp_temp1, &ukj1, &lusup[luptr1]);
202
z_add(&comp_temp, &comp_temp, &comp_temp1);
203
zz_mult(&comp_temp1, &ukj2, &lusup[luptr2]);
204
z_add(&comp_temp, &comp_temp, &comp_temp1);
205
z_sub(&dense_col[irow], &dense_col[irow], &comp_temp);
209
} else { /* segsze >= 4 */
211
/* Copy U[*,j] segment from dense[*] to TriTmp[*], which
212
holds the result of triangular solves. */
213
no_zeros = kfnz - fsupc;
214
isub = lptr + no_zeros;
215
for (i = 0; i < segsze; ++i) {
217
TriTmp[i] = dense_col[irow]; /* Gather */
221
/* start effective triangle */
222
luptr += nsupr * no_zeros + no_zeros;
224
#ifdef USE_VENDOR_BLAS
226
CTRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr],
227
&nsupr, TriTmp, &incx );
229
ztrsv_( "L", "N", "U", &segsze, &lusup[luptr],
230
&nsupr, TriTmp, &incx );
233
zlsolve ( nsupr, segsze, &lusup[luptr], TriTmp );
239
} /* for jj ... end tri-solves */
241
/* Block row updates; push all the way into dense[*] block */
242
for ( r_ind = 0; r_ind < nrow; r_ind += rowblk ) {
244
r_hi = SUPERLU_MIN(nrow, r_ind + rowblk);
245
block_nrow = SUPERLU_MIN(rowblk, r_hi - r_ind);
246
luptr = xlusup[fsupc] + nsupc + r_ind;
247
isub1 = lptr + nsupc + r_ind;
253
/* Sequence through each column in panel -- matrix-vector */
254
for (jj = jcol; jj < jcol + w; jj++,
255
repfnz_col += m, dense_col += m, TriTmp += ldaTmp) {
257
kfnz = repfnz_col[krep];
258
if ( kfnz == EMPTY ) continue; /* Skip any zero segment */
260
segsze = krep - kfnz + 1;
261
if ( segsze <= 3 ) continue; /* skip unrolled cases */
263
/* Perform a block update, and scatter the result of
264
matrix-vector to dense[]. */
265
no_zeros = kfnz - fsupc;
266
luptr1 = luptr + nsupr * no_zeros;
267
MatvecTmp = &TriTmp[maxsuper];
269
#ifdef USE_VENDOR_BLAS
273
CGEMV(ftcs2, &block_nrow, &segsze, &alpha, &lusup[luptr1],
274
&nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy);
276
zgemv_("N", &block_nrow, &segsze, &alpha, &lusup[luptr1],
277
&nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy);
280
zmatvec(nsupr, block_nrow, segsze, &lusup[luptr1],
284
/* Scatter MatvecTmp[*] into SPA dense[*] temporarily
285
* such that MatvecTmp[*] can be re-used for the
286
* the next blok row update. dense[] will be copied into
287
* global store after the whole panel has been finished.
290
for (i = 0; i < block_nrow; i++) {
292
z_sub(&dense_col[irow], &dense_col[irow],
300
} /* for each block row ... */
302
/* Scatter the triangular solves into SPA dense[*] */
307
for (jj = jcol; jj < jcol + w; jj++,
308
repfnz_col += m, dense_col += m, TriTmp += ldaTmp) {
309
kfnz = repfnz_col[krep];
310
if ( kfnz == EMPTY ) continue; /* Skip any zero segment */
312
segsze = krep - kfnz + 1;
313
if ( segsze <= 3 ) continue; /* skip unrolled cases */
315
no_zeros = kfnz - fsupc;
316
isub = lptr + no_zeros;
317
for (i = 0; i < segsze; i++) {
319
dense_col[irow] = TriTmp[i];
326
} else { /* 1-D block modification */
329
/* Sequence through each column in the panel */
330
for (jj = jcol; jj < jcol + w; jj++,
331
repfnz_col += m, dense_col += m) {
333
kfnz = repfnz_col[krep];
334
if ( kfnz == EMPTY ) continue; /* Skip any zero segment */
336
segsze = krep - kfnz + 1;
337
luptr = xlusup[fsupc];
339
ops[TRSV] += 4 * segsze * (segsze - 1);
340
ops[GEMV] += 8 * nrow * segsze;
342
/* Case 1: Update U-segment of size 1 -- col-col update */
344
ukj = dense_col[lsub[krep_ind]];
345
luptr += nsupr*(nsupc-1) + nsupc;
347
for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) {
349
zz_mult(&comp_temp, &ukj, &lusup[luptr]);
350
z_sub(&dense_col[irow], &dense_col[irow], &comp_temp);
354
} else if ( segsze <= 3 ) {
355
ukj = dense_col[lsub[krep_ind]];
356
luptr += nsupr*(nsupc-1) + nsupc-1;
357
ukj1 = dense_col[lsub[krep_ind - 1]];
358
luptr1 = luptr - nsupr;
361
zz_mult(&comp_temp, &ukj1, &lusup[luptr1]);
362
z_sub(&ukj, &ukj, &comp_temp);
363
dense_col[lsub[krep_ind]] = ukj;
364
for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
367
zz_mult(&comp_temp, &ukj, &lusup[luptr]);
368
zz_mult(&comp_temp1, &ukj1, &lusup[luptr1]);
369
z_add(&comp_temp, &comp_temp, &comp_temp1);
370
z_sub(&dense_col[irow], &dense_col[irow], &comp_temp);
373
ukj2 = dense_col[lsub[krep_ind - 2]];
374
luptr2 = luptr1 - nsupr;
375
zz_mult(&comp_temp, &ukj2, &lusup[luptr2-1]);
376
z_sub(&ukj1, &ukj1, &comp_temp);
378
zz_mult(&comp_temp, &ukj1, &lusup[luptr1]);
379
zz_mult(&comp_temp1, &ukj2, &lusup[luptr2]);
380
z_add(&comp_temp, &comp_temp, &comp_temp1);
381
z_sub(&ukj, &ukj, &comp_temp);
382
dense_col[lsub[krep_ind]] = ukj;
383
dense_col[lsub[krep_ind-1]] = ukj1;
384
for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
386
++luptr; ++luptr1; ++luptr2;
387
zz_mult(&comp_temp, &ukj, &lusup[luptr]);
388
zz_mult(&comp_temp1, &ukj1, &lusup[luptr1]);
389
z_add(&comp_temp, &comp_temp, &comp_temp1);
390
zz_mult(&comp_temp1, &ukj2, &lusup[luptr2]);
391
z_add(&comp_temp, &comp_temp, &comp_temp1);
392
z_sub(&dense_col[irow], &dense_col[irow], &comp_temp);
396
} else { /* segsze >= 4 */
398
* Perform a triangular solve and block update,
399
* then scatter the result of sup-col update to dense[].
401
no_zeros = kfnz - fsupc;
403
/* Copy U[*,j] segment from dense[*] to tempv[*]:
404
* The result of triangular solve is in tempv[*];
405
* The result of matrix vector update is in dense_col[*]
407
isub = lptr + no_zeros;
408
for (i = 0; i < segsze; ++i) {
410
tempv[i] = dense_col[irow]; /* Gather */
414
/* start effective triangle */
415
luptr += nsupr * no_zeros + no_zeros;
417
#ifdef USE_VENDOR_BLAS
419
CTRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr],
420
&nsupr, tempv, &incx );
422
ztrsv_( "L", "N", "U", &segsze, &lusup[luptr],
423
&nsupr, tempv, &incx );
426
luptr += segsze; /* Dense matrix-vector */
427
tempv1 = &tempv[segsze];
431
CGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr],
432
&nsupr, tempv, &incx, &beta, tempv1, &incy );
434
zgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr],
435
&nsupr, tempv, &incx, &beta, tempv1, &incy );
438
zlsolve ( nsupr, segsze, &lusup[luptr], tempv );
440
luptr += segsze; /* Dense matrix-vector */
441
tempv1 = &tempv[segsze];
442
zmatvec (nsupr, nrow, segsze, &lusup[luptr], tempv, tempv1);
445
/* Scatter tempv[*] into SPA dense[*] temporarily, such
446
* that tempv[*] can be used for the triangular solve of
447
* the next column of the panel. They will be copied into
448
* ucol[*] after the whole panel has been finished.
450
isub = lptr + no_zeros;
451
for (i = 0; i < segsze; i++) {
453
dense_col[irow] = tempv[i];
458
/* Scatter the update from tempv1[*] into SPA dense[*] */
459
/* Start dense rectangular L */
460
for (i = 0; i < nrow; i++) {
462
z_sub(&dense_col[irow], &dense_col[irow], &tempv1[i]);
467
} /* else segsze>=4 ... */
469
} /* for each column in the panel... */
471
} /* else 1-D update ... */
473
} /* for each updating supernode ... */