~ubuntu-branches/ubuntu/saucy/python-scipy/saucy

1.2.3 by Ondrej Certik
Import upstream version 0.6.0
1
/*
2
 * -- SuperLU routine (version 3.0) --
3
 * Univ. of California Berkeley, Xerox Palo Alto Research Center,
4
 * and Lawrence Berkeley National Lab.
5
 * October 15, 2003
6
 *
7
 */
8
/*
9
  Copyright (c) 1994 by Xerox Corporation.  All rights reserved.
10
 
11
  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
12
  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
13
 
14
  Permission is hereby granted to use or copy this program for any
15
  purpose, provided the above notices are retained on all copies.
16
  Permission to modify the code and to distribute modified code is
17
  granted, provided the above notices are retained, and a notice that
18
  the code was modified is included with the above copyright notice.
19
*/
20
21
#include <stdio.h>
22
#include <stdlib.h>
23
#include "zsp_defs.h"
24
25
/* 
26
 * Function prototypes 
27
 */
28
void zlsolve(int, int, doublecomplex *, doublecomplex *);
29
void zmatvec(int, int, int, doublecomplex *, doublecomplex *, doublecomplex *);
30
extern void zcheck_tempv();
31
32
void
33
zpanel_bmod (
34
	    const int  m,          /* in - number of rows in the matrix */
35
	    const int  w,          /* in */
36
	    const int  jcol,       /* in */
37
	    const int  nseg,       /* in */
38
	    doublecomplex     *dense,     /* out, of size n by w */
39
	    doublecomplex     *tempv,     /* working array */
40
	    int        *segrep,    /* in */
41
	    int        *repfnz,    /* in, of size n by w */
42
	    GlobalLU_t *Glu,       /* modified */
43
	    SuperLUStat_t *stat    /* output */
44
	    )
45
{
46
/* 
47
 * Purpose
48
 * =======
49
 *
50
 *    Performs numeric block updates (sup-panel) in topological order.
51
 *    It features: col-col, 2cols-col, 3cols-col, and sup-col updates.
52
 *    Special processing on the supernodal portion of L\U[*,j]
53
 *
54
 *    Before entering this routine, the original nonzeros in the panel 
55
 *    were already copied into the spa[m,w].
56
 *
57
 *    Updated/Output parameters-
58
 *	dense[0:m-1,w]: L[*,j:j+w-1] and U[*,j:j+w-1] are returned 
59
 *      collectively in the m-by-w vector dense[*]. 
60
 *
61
 */
62
63
#ifdef USE_VENDOR_BLAS
64
#ifdef _CRAY
65
    _fcd ftcs1 = _cptofcd("L", strlen("L")),
66
         ftcs2 = _cptofcd("N", strlen("N")),
67
         ftcs3 = _cptofcd("U", strlen("U"));
68
#endif
69
    int          incx = 1, incy = 1;
70
    doublecomplex       alpha, beta;
71
#endif
72
73
    register int k, ksub;
74
    int          fsupc, nsupc, nsupr, nrow;
75
    int          krep, krep_ind;
76
    doublecomplex       ukj, ukj1, ukj2;
77
    int          luptr, luptr1, luptr2;
78
    int          segsze;
79
    int          block_nrow;  /* no of rows in a block row */
80
    register int lptr;	      /* Points to the row subscripts of a supernode */
81
    int          kfnz, irow, no_zeros; 
82
    register int isub, isub1, i;
83
    register int jj;	      /* Index through each column in the panel */
84
    int          *xsup, *supno;
85
    int          *lsub, *xlsub;
86
    doublecomplex       *lusup;
87
    int          *xlusup;
88
    int          *repfnz_col; /* repfnz[] for a column in the panel */
89
    doublecomplex       *dense_col;  /* dense[] for a column in the panel */
90
    doublecomplex       *tempv1;             /* Used in 1-D update */
91
    doublecomplex       *TriTmp, *MatvecTmp; /* used in 2-D update */
92
    doublecomplex      zero = {0.0, 0.0};
93
    doublecomplex      one = {1.0, 0.0};
94
    doublecomplex      comp_temp, comp_temp1;
95
    register int ldaTmp;
96
    register int r_ind, r_hi;
97
    static   int first = 1, maxsuper, rowblk, colblk;
98
    flops_t  *ops = stat->ops;
99
    
100
    xsup    = Glu->xsup;
101
    supno   = Glu->supno;
102
    lsub    = Glu->lsub;
103
    xlsub   = Glu->xlsub;
104
    lusup   = Glu->lusup;
105
    xlusup  = Glu->xlusup;
106
    
107
    if ( first ) {
108
	maxsuper = sp_ienv(3);
109
	rowblk   = sp_ienv(4);
110
	colblk   = sp_ienv(5);
111
	first = 0;
112
    }
113
    ldaTmp = maxsuper + rowblk;
114
115
    /* 
116
     * For each nonz supernode segment of U[*,j] in topological order 
117
     */
118
    k = nseg - 1;
119
    for (ksub = 0; ksub < nseg; ksub++) { /* for each updating supernode */
120
121
	/* krep = representative of current k-th supernode
122
	 * fsupc = first supernodal column
123
	 * nsupc = no of columns in a supernode
124
	 * nsupr = no of rows in a supernode
125
	 */
126
        krep = segrep[k--];
127
	fsupc = xsup[supno[krep]];
128
	nsupc = krep - fsupc + 1;
129
	nsupr = xlsub[fsupc+1] - xlsub[fsupc];
130
	nrow = nsupr - nsupc;
131
	lptr = xlsub[fsupc];
132
	krep_ind = lptr + nsupc - 1;
133
134
	repfnz_col = repfnz;
135
	dense_col = dense;
136
	
137
	if ( nsupc >= colblk && nrow > rowblk ) { /* 2-D block update */
138
139
	    TriTmp = tempv;
140
	
141
	    /* Sequence through each column in panel -- triangular solves */
142
	    for (jj = jcol; jj < jcol + w; jj++,
143
		 repfnz_col += m, dense_col += m, TriTmp += ldaTmp ) {
144
145
		kfnz = repfnz_col[krep];
146
		if ( kfnz == EMPTY ) continue;	/* Skip any zero segment */
147
	    
148
		segsze = krep - kfnz + 1;
149
		luptr = xlusup[fsupc];
150
151
		ops[TRSV] += 4 * segsze * (segsze - 1);
152
		ops[GEMV] += 8 * nrow * segsze;
153
	
154
		/* Case 1: Update U-segment of size 1 -- col-col update */
155
		if ( segsze == 1 ) {
156
		    ukj = dense_col[lsub[krep_ind]];
157
		    luptr += nsupr*(nsupc-1) + nsupc;
158
159
		    for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) {
160
			irow = lsub[i];
161
	    	        zz_mult(&comp_temp, &ukj, &lusup[luptr]);
162
		        z_sub(&dense_col[irow], &dense_col[irow], &comp_temp);
163
			++luptr;
164
		    }
165
166
		} else if ( segsze <= 3 ) {
167
		    ukj = dense_col[lsub[krep_ind]];
168
		    ukj1 = dense_col[lsub[krep_ind - 1]];
169
		    luptr += nsupr*(nsupc-1) + nsupc-1;
170
		    luptr1 = luptr - nsupr;
171
172
		    if ( segsze == 2 ) {
173
		        zz_mult(&comp_temp, &ukj1, &lusup[luptr1]);
174
		        z_sub(&ukj, &ukj, &comp_temp);
175
			dense_col[lsub[krep_ind]] = ukj;
176
			for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
177
			    irow = lsub[i];
178
			    luptr++; luptr1++;
179
			    zz_mult(&comp_temp, &ukj, &lusup[luptr]);
180
			    zz_mult(&comp_temp1, &ukj1, &lusup[luptr1]);
181
			    z_add(&comp_temp, &comp_temp, &comp_temp1);
182
			    z_sub(&dense_col[irow], &dense_col[irow], &comp_temp);
183
			}
184
		    } else {
185
			ukj2 = dense_col[lsub[krep_ind - 2]];
186
			luptr2 = luptr1 - nsupr;
187
  		        zz_mult(&comp_temp, &ukj2, &lusup[luptr2-1]);
188
		        z_sub(&ukj1, &ukj1, &comp_temp);
189
190
		        zz_mult(&comp_temp, &ukj1, &lusup[luptr1]);
191
        		zz_mult(&comp_temp1, &ukj2, &lusup[luptr2]);
192
		        z_add(&comp_temp, &comp_temp, &comp_temp1);
193
		        z_sub(&ukj, &ukj, &comp_temp);
194
			dense_col[lsub[krep_ind]] = ukj;
195
			dense_col[lsub[krep_ind-1]] = ukj1;
196
			for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
197
			    irow = lsub[i];
198
			    luptr++; luptr1++; luptr2++;
199
			    zz_mult(&comp_temp, &ukj, &lusup[luptr]);
200
			    zz_mult(&comp_temp1, &ukj1, &lusup[luptr1]);
201
			    z_add(&comp_temp, &comp_temp, &comp_temp1);
202
			    zz_mult(&comp_temp1, &ukj2, &lusup[luptr2]);
203
			    z_add(&comp_temp, &comp_temp, &comp_temp1);
204
			    z_sub(&dense_col[irow], &dense_col[irow], &comp_temp);
205
			}
206
		    }
207
208
		} else  {	/* segsze >= 4 */
209
		    
210
		    /* Copy U[*,j] segment from dense[*] to TriTmp[*], which
211
		       holds the result of triangular solves.    */
212
		    no_zeros = kfnz - fsupc;
213
		    isub = lptr + no_zeros;
214
		    for (i = 0; i < segsze; ++i) {
215
			irow = lsub[isub];
216
			TriTmp[i] = dense_col[irow]; /* Gather */
217
			++isub;
218
		    }
219
		    
220
		    /* start effective triangle */
221
		    luptr += nsupr * no_zeros + no_zeros;
222
223
#ifdef USE_VENDOR_BLAS
224
#ifdef _CRAY
225
		    CTRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], 
226
			   &nsupr, TriTmp, &incx );
227
#else
228
		    ztrsv_( "L", "N", "U", &segsze, &lusup[luptr], 
229
			   &nsupr, TriTmp, &incx );
230
#endif
231
#else		
232
		    zlsolve ( nsupr, segsze, &lusup[luptr], TriTmp );
233
#endif
234
		    
235
236
		} /* else ... */
237
	    
238
	    }  /* for jj ... end tri-solves */
239
240
	    /* Block row updates; push all the way into dense[*] block */
241
	    for ( r_ind = 0; r_ind < nrow; r_ind += rowblk ) {
242
		
243
		r_hi = SUPERLU_MIN(nrow, r_ind + rowblk);
244
		block_nrow = SUPERLU_MIN(rowblk, r_hi - r_ind);
245
		luptr = xlusup[fsupc] + nsupc + r_ind;
246
		isub1 = lptr + nsupc + r_ind;
247
		
248
		repfnz_col = repfnz;
249
		TriTmp = tempv;
250
		dense_col = dense;
251
		
252
		/* Sequence through each column in panel -- matrix-vector */
253
		for (jj = jcol; jj < jcol + w; jj++,
254
		     repfnz_col += m, dense_col += m, TriTmp += ldaTmp) {
255
		    
256
		    kfnz = repfnz_col[krep];
257
		    if ( kfnz == EMPTY ) continue; /* Skip any zero segment */
258
		    
259
		    segsze = krep - kfnz + 1;
260
		    if ( segsze <= 3 ) continue;   /* skip unrolled cases */
261
		    
262
		    /* Perform a block update, and scatter the result of
263
		       matrix-vector to dense[].		 */
264
		    no_zeros = kfnz - fsupc;
265
		    luptr1 = luptr + nsupr * no_zeros;
266
		    MatvecTmp = &TriTmp[maxsuper];
267
		    
268
#ifdef USE_VENDOR_BLAS
269
		    alpha = one; 
270
                    beta = zero;
271
#ifdef _CRAY
272
		    CGEMV(ftcs2, &block_nrow, &segsze, &alpha, &lusup[luptr1], 
273
			   &nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy);
274
#else
275
		    zgemv_("N", &block_nrow, &segsze, &alpha, &lusup[luptr1], 
276
			   &nsupr, TriTmp, &incx, &beta, MatvecTmp, &incy);
277
#endif
278
#else
279
		    zmatvec(nsupr, block_nrow, segsze, &lusup[luptr1],
280
			   TriTmp, MatvecTmp);
281
#endif
282
		    
283
		    /* Scatter MatvecTmp[*] into SPA dense[*] temporarily
284
		     * such that MatvecTmp[*] can be re-used for the
285
		     * the next blok row update. dense[] will be copied into 
286
		     * global store after the whole panel has been finished.
287
		     */
288
		    isub = isub1;
289
		    for (i = 0; i < block_nrow; i++) {
290
			irow = lsub[isub];
291
		        z_sub(&dense_col[irow], &dense_col[irow], 
292
                              &MatvecTmp[i]);
293
			MatvecTmp[i] = zero;
294
			++isub;
295
		    }
296
		    
297
		} /* for jj ... */
298
		
299
	    } /* for each block row ... */
300
	    
301
	    /* Scatter the triangular solves into SPA dense[*] */
302
	    repfnz_col = repfnz;
303
	    TriTmp = tempv;
304
	    dense_col = dense;
305
	    
306
	    for (jj = jcol; jj < jcol + w; jj++,
307
		 repfnz_col += m, dense_col += m, TriTmp += ldaTmp) {
308
		kfnz = repfnz_col[krep];
309
		if ( kfnz == EMPTY ) continue; /* Skip any zero segment */
310
		
311
		segsze = krep - kfnz + 1;
312
		if ( segsze <= 3 ) continue; /* skip unrolled cases */
313
		
314
		no_zeros = kfnz - fsupc;		
315
		isub = lptr + no_zeros;
316
		for (i = 0; i < segsze; i++) {
317
		    irow = lsub[isub];
318
		    dense_col[irow] = TriTmp[i];
319
		    TriTmp[i] = zero;
320
		    ++isub;
321
		}
322
		
323
	    } /* for jj ... */
324
	    
325
	} else { /* 1-D block modification */
326
	    
327
	    
328
	    /* Sequence through each column in the panel */
329
	    for (jj = jcol; jj < jcol + w; jj++,
330
		 repfnz_col += m, dense_col += m) {
331
		
332
		kfnz = repfnz_col[krep];
333
		if ( kfnz == EMPTY ) continue;	/* Skip any zero segment */
334
		
335
		segsze = krep - kfnz + 1;
336
		luptr = xlusup[fsupc];
337
338
		ops[TRSV] += 4 * segsze * (segsze - 1);
339
		ops[GEMV] += 8 * nrow * segsze;
340
		
341
		/* Case 1: Update U-segment of size 1 -- col-col update */
342
		if ( segsze == 1 ) {
343
		    ukj = dense_col[lsub[krep_ind]];
344
		    luptr += nsupr*(nsupc-1) + nsupc;
345
346
		    for (i = lptr + nsupc; i < xlsub[fsupc+1]; i++) {
347
			irow = lsub[i];
348
	    	        zz_mult(&comp_temp, &ukj, &lusup[luptr]);
349
		        z_sub(&dense_col[irow], &dense_col[irow], &comp_temp);
350
			++luptr;
351
		    }
352
353
		} else if ( segsze <= 3 ) {
354
		    ukj = dense_col[lsub[krep_ind]];
355
		    luptr += nsupr*(nsupc-1) + nsupc-1;
356
		    ukj1 = dense_col[lsub[krep_ind - 1]];
357
		    luptr1 = luptr - nsupr;
358
359
		    if ( segsze == 2 ) {
360
		        zz_mult(&comp_temp, &ukj1, &lusup[luptr1]);
361
		        z_sub(&ukj, &ukj, &comp_temp);
362
			dense_col[lsub[krep_ind]] = ukj;
363
			for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
364
			    irow = lsub[i];
365
			    ++luptr;  ++luptr1;
366
			    zz_mult(&comp_temp, &ukj, &lusup[luptr]);
367
			    zz_mult(&comp_temp1, &ukj1, &lusup[luptr1]);
368
			    z_add(&comp_temp, &comp_temp, &comp_temp1);
369
			    z_sub(&dense_col[irow], &dense_col[irow], &comp_temp);
370
			}
371
		    } else {
372
			ukj2 = dense_col[lsub[krep_ind - 2]];
373
			luptr2 = luptr1 - nsupr;
374
  		        zz_mult(&comp_temp, &ukj2, &lusup[luptr2-1]);
375
		        z_sub(&ukj1, &ukj1, &comp_temp);
376
377
		        zz_mult(&comp_temp, &ukj1, &lusup[luptr1]);
378
        		zz_mult(&comp_temp1, &ukj2, &lusup[luptr2]);
379
		        z_add(&comp_temp, &comp_temp, &comp_temp1);
380
		        z_sub(&ukj, &ukj, &comp_temp);
381
			dense_col[lsub[krep_ind]] = ukj;
382
			dense_col[lsub[krep_ind-1]] = ukj1;
383
			for (i = lptr + nsupc; i < xlsub[fsupc+1]; ++i) {
384
			    irow = lsub[i];
385
			    ++luptr; ++luptr1; ++luptr2;
386
			    zz_mult(&comp_temp, &ukj, &lusup[luptr]);
387
			    zz_mult(&comp_temp1, &ukj1, &lusup[luptr1]);
388
			    z_add(&comp_temp, &comp_temp, &comp_temp1);
389
			    zz_mult(&comp_temp1, &ukj2, &lusup[luptr2]);
390
			    z_add(&comp_temp, &comp_temp, &comp_temp1);
391
			    z_sub(&dense_col[irow], &dense_col[irow], &comp_temp);
392
			}
393
		    }
394
395
		} else  { /* segsze >= 4 */
396
		    /* 
397
		     * Perform a triangular solve and block update,
398
		     * then scatter the result of sup-col update to dense[].
399
		     */
400
		    no_zeros = kfnz - fsupc;
401
		    
402
		    /* Copy U[*,j] segment from dense[*] to tempv[*]: 
403
		     *    The result of triangular solve is in tempv[*];
404
		     *    The result of matrix vector update is in dense_col[*]
405
		     */
406
		    isub = lptr + no_zeros;
407
		    for (i = 0; i < segsze; ++i) {
408
			irow = lsub[isub];
409
			tempv[i] = dense_col[irow]; /* Gather */
410
			++isub;
411
		    }
412
		    
413
		    /* start effective triangle */
414
		    luptr += nsupr * no_zeros + no_zeros;
415
		    
416
#ifdef USE_VENDOR_BLAS
417
#ifdef _CRAY
418
		    CTRSV( ftcs1, ftcs2, ftcs3, &segsze, &lusup[luptr], 
419
			   &nsupr, tempv, &incx );
420
#else
421
		    ztrsv_( "L", "N", "U", &segsze, &lusup[luptr], 
422
			   &nsupr, tempv, &incx );
423
#endif
424
		    
425
		    luptr += segsze;	/* Dense matrix-vector */
426
		    tempv1 = &tempv[segsze];
427
                    alpha = one;
428
                    beta = zero;
429
#ifdef _CRAY
430
		    CGEMV( ftcs2, &nrow, &segsze, &alpha, &lusup[luptr], 
431
			   &nsupr, tempv, &incx, &beta, tempv1, &incy );
432
#else
433
		    zgemv_( "N", &nrow, &segsze, &alpha, &lusup[luptr], 
434
			   &nsupr, tempv, &incx, &beta, tempv1, &incy );
435
#endif
436
#else
437
		    zlsolve ( nsupr, segsze, &lusup[luptr], tempv );
438
		    
439
		    luptr += segsze;        /* Dense matrix-vector */
440
		    tempv1 = &tempv[segsze];
441
		    zmatvec (nsupr, nrow, segsze, &lusup[luptr], tempv, tempv1);
442
#endif
443
		    
444
		    /* Scatter tempv[*] into SPA dense[*] temporarily, such
445
		     * that tempv[*] can be used for the triangular solve of
446
		     * the next column of the panel. They will be copied into 
447
		     * ucol[*] after the whole panel has been finished.
448
		     */
449
		    isub = lptr + no_zeros;
450
		    for (i = 0; i < segsze; i++) {
451
			irow = lsub[isub];
452
			dense_col[irow] = tempv[i];
453
			tempv[i] = zero;
454
			isub++;
455
		    }
456
		    
457
		    /* Scatter the update from tempv1[*] into SPA dense[*] */
458
		    /* Start dense rectangular L */
459
		    for (i = 0; i < nrow; i++) {
460
			irow = lsub[isub];
461
		        z_sub(&dense_col[irow], &dense_col[irow], &tempv1[i]);
462
			tempv1[i] = zero;
463
			++isub;	
464
		    }
465
		    
466
		} /* else segsze>=4 ... */
467
		
468
	    } /* for each column in the panel... */
469
	    
470
	} /* else 1-D update ... */
471
472
    } /* for each updating supernode ... */
473
474
}
475
476
477
478