3
* -- SuperLU routine (version 3.0) --
4
* Univ. of California Berkeley, Xerox Palo Alto Research Center,
5
* and Lawrence Berkeley National Lab.
10
Copyright (c) 1994 by Xerox Corporation. All rights reserved.
12
THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
13
EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK.
15
Permission is hereby granted to use or copy this program for any
16
purpose, provided the above notices are retained on all copies.
17
Permission to modify the code and to distribute modified code is
18
granted, provided the above notices are retained, and a notice that
19
the code was modified is included with the above copyright notice.
25
sgstrf (superlu_options_t *options, SuperMatrix *A, float drop_tol,
26
int relax, int panel_size, int *etree, void *work, int lwork,
27
int *perm_c, int *perm_r, SuperMatrix *L, SuperMatrix *U,
28
SuperLUStat_t *stat, int *info)
34
* SGSTRF computes an LU factorization of a general sparse m-by-n
35
* matrix A using partial pivoting with row interchanges.
36
* The factorization has the form
38
* where Pr is a row permutation matrix, L is lower triangular with unit
39
* diagonal elements (lower trapezoidal if A->nrow > A->ncol), and U is upper
40
* triangular (upper trapezoidal if A->nrow < A->ncol).
42
* See supermatrix.h for the definition of 'SuperMatrix' structure.
47
* options (input) superlu_options_t*
48
* The structure defines the input parameters to control
49
* how the LU decomposition will be performed.
51
* A (input) SuperMatrix*
52
* Original matrix A, permuted by columns, of dimension
53
* (A->nrow, A->ncol). The type of A can be:
54
* Stype = SLU_NCP; Dtype = SLU_S; Mtype = SLU_GE.
56
* drop_tol (input) float (NOT IMPLEMENTED)
57
* Drop tolerance parameter. At step j of the Gaussian elimination,
58
* if abs(A_ij)/(max_i abs(A_ij)) < drop_tol, drop entry A_ij.
59
* 0 <= drop_tol <= 1. The default value of drop_tol is 0.
62
* To control degree of relaxing supernodes. If the number
63
* of nodes (columns) in a subtree of the elimination tree is less
64
* than relax, this subtree is considered as one supernode,
65
* regardless of the row structures of those columns.
67
* panel_size (input) int
68
* A panel consists of at most panel_size consecutive columns.
70
* etree (input) int*, dimension (A->ncol)
71
* Elimination tree of A'*A.
72
* Note: etree is a vector of parent pointers for a forest whose
73
* vertices are the integers 0 to A->ncol-1; etree[root]==A->ncol.
74
* On input, the columns of A should be permuted so that the
75
* etree is in a certain postorder.
77
* work (input/output) void*, size (lwork) (in bytes)
78
* User-supplied work space and space for the output data structures.
79
* Not referenced if lwork = 0;
82
* Specifies the size of work array in bytes.
83
* = 0: allocate space internally by system malloc;
84
* > 0: use user-supplied work array of length lwork in bytes,
85
* returns error if space runs out.
86
* = -1: the routine guesses the amount of space needed without
87
* performing the factorization, and returns it in
88
* *info; no other side effects.
90
* perm_c (input) int*, dimension (A->ncol)
91
* Column permutation vector, which defines the
92
* permutation matrix Pc; perm_c[i] = j means column i of A is
93
* in position j in A*Pc.
94
* When searching for diagonal, perm_c[*] is applied to the
95
* row subscripts of A, so that diagonal threshold pivoting
96
* can find the diagonal of A, rather than that of A*Pc.
98
* perm_r (input/output) int*, dimension (A->nrow)
99
* Row permutation vector which defines the permutation matrix Pr,
100
* perm_r[i] = j means row i of A is in position j in Pr*A.
101
* If options->Fact = SamePattern_SameRowPerm, the pivoting routine
102
* will try to use the input perm_r, unless a certain threshold
103
* criterion is violated. In that case, perm_r is overwritten by
104
* a new permutation determined by partial pivoting or diagonal
105
* threshold pivoting.
106
* Otherwise, perm_r is output argument;
108
* L (output) SuperMatrix*
109
* The factor L from the factorization Pr*A=L*U; use compressed row
110
* subscripts storage for supernodes, i.e., L has type:
111
* Stype = SLU_SC, Dtype = SLU_S, Mtype = SLU_TRLU.
113
* U (output) SuperMatrix*
114
* The factor U from the factorization Pr*A*Pc=L*U. Use column-wise
115
* storage scheme, i.e., U has types: Stype = SLU_NC,
116
* Dtype = SLU_S, Mtype = SLU_TRU.
118
* stat (output) SuperLUStat_t*
119
* Record the statistics on runtime and floating-point operation count.
120
* See util.h for the definition of 'SuperLUStat_t'.
123
* = 0: successful exit
124
* < 0: if info = -i, the i-th argument had an illegal value
125
* > 0: if info = i, and i is
126
* <= A->ncol: U(i,i) is exactly zero. The factorization has
127
* been completed, but the factor U is exactly singular,
128
* and division by zero will occur if it is used to solve a
129
* system of equations.
130
* > A->ncol: number of bytes allocated when memory allocation
131
* failure occurred, plus A->ncol. If lwork = -1, it is
132
* the estimated amount of space needed, plus A->ncol.
134
* ======================================================================
136
* Local Working Arrays:
137
* ======================
138
* m = number of rows in the matrix
139
* n = number of columns in the matrix
141
* xprune[0:n-1]: xprune[*] points to locations in subscript
142
* vector lsub[*]. For column i, xprune[i] denotes the point where
143
* structural pruning begins. I.e. only xlsub[i],..,xprune[i]-1 need
144
* to be traversed for symbolic factorization.
146
* marker[0:3*m-1]: marker[i] = j means that node i has been
147
* reached when working on column j.
148
* Storage: relative to original row subscripts
149
* NOTE: There are 3 of them: marker/marker1 are used for panel dfs,
150
* see spanel_dfs.c; marker2 is used for inner-factorization,
153
* parent[0:m-1]: parent vector used during dfs
154
* Storage: relative to new row subscripts
156
* xplore[0:m-1]: xplore[i] gives the location of the next (dfs)
157
* unexplored neighbor of i in lsub[*]
159
* segrep[0:nseg-1]: contains the list of supernodal representatives
160
* in topological order of the dfs. A supernode representative is the
161
* last column of a supernode.
162
* The maximum size of segrep[] is n.
164
* repfnz[0:W*m-1]: for a nonzero segment U[*,j] that ends at a
165
* supernodal representative r, repfnz[r] is the location of the first
166
* nonzero in this segment. It is also used during the dfs: repfnz[r]>0
167
* indicates the supernode r has been explored.
168
* NOTE: There are W of them, each used for one column of a panel.
170
* panel_lsub[0:W*m-1]: temporary for the nonzeros row indices below
171
* the panel diagonal. These are filled in during spanel_dfs(), and are
172
* used later in the inner LU factorization within the panel.
173
* panel_lsub[]/dense[] pair forms the SPA data structure.
174
* NOTE: There are W of them.
176
* dense[0:W*m-1]: sparse accumulating (SPA) vector for intermediate values;
177
* NOTE: there are W of them.
179
* tempv[0:*]: real temporary used for dense numeric kernels;
180
* The size of this array is defined by NUM_TEMPV() in ssp_defs.h.
183
/* Local working arrays */
185
int *iperm_r; /* inverse of perm_r;
186
used when options->Fact == SamePattern_SameRowPerm */
187
int *iperm_c; /* inverse of perm_c */
190
int *segrep, *repfnz, *parent, *xplore;
191
int *panel_lsub; /* dense[]/panel_lsub[] pair forms a w-wide SPA */
194
float *dense, *tempv;
198
int *xa_begin, *xa_end;
200
int *xlsub, *xlusup, *xusub;
202
static GlobalLU_t Glu; /* persistent to facilitate multiple factors. */
205
fact_t fact = options->Fact;
206
double diag_pivot_thresh = options->DiagPivotThresh;
207
int pivrow; /* pivotal row number in the original matrix A */
208
int nseg1; /* no of segments in U-column above panel row jcol */
209
int nseg; /* no of segments in each U-column */
211
register int kcol; /* end column of a relaxed snode */
213
register int i, k, jj, new_next, iinfo;
214
int m, n, min_mn, jsupno, fsupc, nextlu, nextu;
215
int w_def; /* upper bound on panel width */
216
int usepr, iperm_r_allocated = 0;
218
int *panel_histo = stat->panel_histo;
219
flops_t *ops = stat->ops;
224
min_mn = SUPERLU_MIN(m, n);
227
asub = Astore->rowind;
228
xa_begin = Astore->colbeg;
229
xa_end = Astore->colend;
231
/* Allocate storage common to the factor routines */
232
*info = sLUMemInit(fact, work, lwork, m, n, Astore->nnz,
233
panel_size, L, U, &Glu, &iwork, &swork);
242
SetIWork(m, n, panel_size, iwork, &segrep, &parent, &xplore,
243
&repfnz, &panel_lsub, &xprune, &marker);
244
sSetRWork(m, panel_size, swork, &dense, &tempv);
246
usepr = (fact == SamePattern_SameRowPerm);
248
/* Compute the inverse of perm_r */
249
iperm_r = (int *) intMalloc(m);
250
for (k = 0; k < m; ++k) iperm_r[perm_r[k]] = k;
251
iperm_r_allocated = 1;
253
iperm_c = (int *) intMalloc(n);
254
for (k = 0; k < n; ++k) iperm_c[perm_c[k]] = k;
256
/* Identify relaxed snodes */
257
relax_end = (int *) intMalloc(n);
258
if ( options->SymmetricMode == YES ) {
259
heap_relax_snode(n, etree, relax, marker, relax_end);
261
relax_snode(n, etree, relax, marker, relax_end);
264
ifill (perm_r, m, EMPTY);
265
ifill (marker, m * NO_MARKER, EMPTY);
267
xsup[0] = xlsub[0] = xusub[0] = xlusup[0] = 0;
271
* Work on one "panel" at a time. A panel is one of the following:
272
* (a) a relaxed supernode at the bottom of the etree, or
273
* (b) panel_size contiguous columns, defined by the user
275
for (jcol = 0; jcol < min_mn; ) {
277
if ( relax_end[jcol] != EMPTY ) { /* start of a relaxed snode */
278
kcol = relax_end[jcol]; /* end of the relaxed snode */
279
panel_histo[kcol-jcol+1]++;
281
/* --------------------------------------
282
* Factorize the relaxed supernode(jcol:kcol)
283
* -------------------------------------- */
284
/* Determine the union of the row structure of the snode */
285
if ( (*info = ssnode_dfs(jcol, kcol, asub, xa_begin, xa_end,
286
xprune, marker, &Glu)) != 0 )
290
nextlu = xlusup[jcol];
291
jsupno = supno[jcol];
292
fsupc = xsup[jsupno];
293
new_next = nextlu + (xlsub[fsupc+1]-xlsub[fsupc])*(kcol-jcol+1);
294
nzlumax = Glu.nzlumax;
295
while ( new_next > nzlumax ) {
296
if ( (*info = sLUMemXpand(jcol, nextlu, LUSUP, &nzlumax, &Glu)) )
300
for (icol = jcol; icol<= kcol; icol++) {
301
xusub[icol+1] = nextu;
303
/* Scatter into SPA dense[*] */
304
for (k = xa_begin[icol]; k < xa_end[icol]; k++)
305
dense[asub[k]] = a[k];
307
/* Numeric update within the snode */
308
ssnode_bmod(icol, jsupno, fsupc, dense, tempv, &Glu, stat);
310
if ( (*info = spivotL(icol, diag_pivot_thresh, &usepr, perm_r,
311
iperm_r, iperm_c, &pivrow, &Glu, stat)) )
312
if ( iinfo == 0 ) iinfo = *info;
315
sprint_lu_col("[1]: ", icol, pivrow, xprune, &Glu);
322
} else { /* Work on one panel of panel_size columns */
324
/* Adjust panel_size so that a panel won't overlap with the next
328
for (k = jcol + 1; k < SUPERLU_MIN(jcol+panel_size, min_mn); k++)
329
if ( relax_end[k] != EMPTY ) {
330
panel_size = k - jcol;
333
if ( k == min_mn ) panel_size = min_mn - jcol;
334
panel_histo[panel_size]++;
336
/* symbolic factor on a panel of columns */
337
spanel_dfs(m, panel_size, jcol, A, perm_r, &nseg1,
338
dense, panel_lsub, segrep, repfnz, xprune,
339
marker, parent, xplore, &Glu);
341
/* numeric sup-panel updates in topological order */
342
spanel_bmod(m, panel_size, jcol, nseg1, dense,
343
tempv, segrep, repfnz, &Glu, stat);
345
/* Sparse LU within the panel, and below panel diagonal */
346
for ( jj = jcol; jj < jcol + panel_size; jj++) {
347
k = (jj - jcol) * m; /* column index for w-wide arrays */
349
nseg = nseg1; /* Begin after all the panel segments */
351
if ((*info = scolumn_dfs(m, jj, perm_r, &nseg, &panel_lsub[k],
352
segrep, &repfnz[k], xprune, marker,
353
parent, xplore, &Glu)) != 0) return;
355
/* Numeric updates */
356
if ((*info = scolumn_bmod(jj, (nseg - nseg1), &dense[k],
357
tempv, &segrep[nseg1], &repfnz[k],
358
jcol, &Glu, stat)) != 0) return;
360
/* Copy the U-segments to ucol[*] */
361
if ((*info = scopy_to_ucol(jj, nseg, segrep, &repfnz[k],
362
perm_r, &dense[k], &Glu)) != 0)
365
if ( (*info = spivotL(jj, diag_pivot_thresh, &usepr, perm_r,
366
iperm_r, iperm_c, &pivrow, &Glu, stat)) )
367
if ( iinfo == 0 ) iinfo = *info;
369
/* Prune columns (0:jj-1) using column jj */
370
spruneL(jj, perm_r, pivrow, nseg, segrep,
371
&repfnz[k], xprune, &Glu);
373
/* Reset repfnz[] for this column */
374
resetrep_col (nseg, segrep, &repfnz[k]);
377
sprint_lu_col("[2]: ", jj, pivrow, xprune, &Glu);
382
jcol += panel_size; /* Move to the next panel */
392
for (i = 0; i < m; ++i)
393
if ( perm_r[i] == EMPTY ) {
399
countnz(min_mn, xprune, &nnzL, &nnzU, &Glu);
400
fixupL(min_mn, perm_r, &Glu);
402
sLUWorkFree(iwork, swork, &Glu); /* Free work space and compress storage */
404
if ( fact == SamePattern_SameRowPerm ) {
405
/* L and U structures may have changed due to possibly different
406
pivoting, even though the storage is available.
407
There could also be memory expansions, so the array locations
409
((SCformat *)L->Store)->nnz = nnzL;
410
((SCformat *)L->Store)->nsuper = Glu.supno[n];
411
((SCformat *)L->Store)->nzval = Glu.lusup;
412
((SCformat *)L->Store)->nzval_colptr = Glu.xlusup;
413
((SCformat *)L->Store)->rowind = Glu.lsub;
414
((SCformat *)L->Store)->rowind_colptr = Glu.xlsub;
415
((NCformat *)U->Store)->nnz = nnzU;
416
((NCformat *)U->Store)->nzval = Glu.ucol;
417
((NCformat *)U->Store)->rowind = Glu.usub;
418
((NCformat *)U->Store)->colptr = Glu.xusub;
420
sCreate_SuperNode_Matrix(L, A->nrow, A->ncol, nnzL, Glu.lusup,
421
Glu.xlusup, Glu.lsub, Glu.xlsub, Glu.supno,
422
Glu.xsup, SLU_SC, SLU_S, SLU_TRLU);
423
sCreate_CompCol_Matrix(U, min_mn, min_mn, nnzU, Glu.ucol,
424
Glu.usub, Glu.xusub, SLU_NC, SLU_S, SLU_TRU);
427
ops[FACT] += ops[TRSV] + ops[GEMV];
429
if ( iperm_r_allocated ) SUPERLU_FREE (iperm_r);
430
SUPERLU_FREE (iperm_c);
431
SUPERLU_FREE (relax_end);