2
* Automatically Tuned Linear Algebra Software v3.10.1
3
* Copyright (C) 2009 Siju Samuel
5
* Code contributers : Siju Samuel, Anthony M. Castaldo, R. Clint Whaley
7
* Redistribution and use in source and binary forms, with or without
8
* modification, are permitted provided that the following conditions
10
* 1. Redistributions of source code must retain the above copyright
11
* notice, this list of conditions and the following disclaimer.
12
* 2. Redistributions in binary form must reproduce the above copyright
13
* notice, this list of conditions, and the following disclaimer in the
14
* documentation and/or other materials provided with the distribution.
15
* 3. The name of the ATLAS group or the names of its contributers may
16
* not be used to endorse or promote products derived from this
17
* software without specific written permission.
19
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
23
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29
* POSSIBILITY OF SUCH DAMAGE.
33
* This is the C translation of the standard LAPACK Fortran routine:
34
* SUBROUTINE DGEQR2( M, N, A, LDA, TAU, WORK, INFO )
37
* Scaling LAPACK Panel Operations Using Parallel Cache Assignment
38
* Principles and Practice of Parallel Programming (PPoPP)' 10
39
* Jan 9-14, 2010, Bangalore, India
40
* by Anthony M. Castaldo, R.Clint Whaley
44
* int ATL_geql2_t( const int M, const int N, TYPE *A, int LDA,
45
* TYPE *TAU, TYPE *WORK)
46
* NOTE :a) ATL_tgeqr2.c will get compiled to four precisions
47
* single precision real, double precision real
48
* single precision complex, double precision complex
50
* b) This routine will not validate the input parameters.
62
* The number of rows of the matrix A. M >= 0.
65
* The number of columns of the matrix A. N >= 0.
67
* A (input/output) array, dimension (LDA,N)
68
* On entry, the m by n matrix A.
69
* On exit, the elements on and above the diagonal of the array
70
* contain the min(m,n) by n upper trapezoidal matrix R (R is
71
* upper triangular if m >= n); the elements below the diagonal,
72
* with the array TAU, represent the orthogonal matrix Q
73
* (unitary matrix incase of complex precision ) as a
74
* product of elementary reflectors (see Further Details).
77
* The leading dimension of the array A. LDA >= max(1,M).
79
* TAU (output) array, dimension (min(M,N))
80
* The scalar factors of the elementary reflectors (see Further
83
* WORK (workspace) array, dimension (N)
85
* INFO (output) INTEGER
86
* = 0: successful exit
87
* < 0: if INFO = -i, the i-th argument had an illegal value
92
* The matrix Q is represented as a product of elementary reflectors
94
* Q = H(1) H(2) . . . H(k), where k = min(m,n).
95
* (for Real/Complex Precision)
96
* Each H(i) has the form
98
* H(i) = I - tau * v * v' (for Real Precision)
99
* H(i) = I - tau * v * conjugate(v)' (for Complex Precision)
101
* where tau is a real/complex scalar, and v is a real/complex vector with
102
* v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A(i+1:m,i),
106
* Threads are managed as below. Instead of having
107
* thread zero manage the computation by specifying tasks to workers, we
108
* have workers proceed automatically to their next task; looping through
109
* each of the columns on their own and only synchronizing when necessary.
111
* We launch and join as below. For example:
113
* Core To Thread Mapping
115
* Core Binary Reversed Decimal Busy Map
116
* 0 000 000 0 X.......
117
* 1 001 100 4 X...X...
118
* 2 010 010 2 X.X.X...
119
* 3 011 110 6 X.X.X.X.
120
* 4 100 001 1 XXX.X.X.
121
* 5 101 101 5 XXX.XXX.
122
* 6 110 011 3 XXXXXXX.
123
* 7 111 111 7 XXXXXXXX
125
* Worker Thread assigned as below : (siju check it is thread or core ? TODO)
127
* Thread Binary #ofTrailing ReplaceTrailing Waits for
128
* zeroes bits by 1 (+ core)
130
* 0 000 3 +1 +2 +4 1 2 4
139
* There is no trivial non-looping method of reversing the binary digits,
140
* so we use an integer array for the indices, and another array to keep
141
* track of working cores. The join operation for a node just skips the
142
* core if it is inactive.
144
* For example, core 0 must join 1, 2, 4, but if only three threads were
145
* engaged, it skips the join on thread 1, joins 2, and then joins 4.
146
* Thread 2 would normally need to combine 3, but since 3 is inactive, it
147
* (Thread 2) will just post itself done immediately.
149
* This approach balances the binary tree and lets us use our existing
156
#include "atlas_tlapack.h"
157
#include "atlas_lapack.h"
158
#include "atlas_threads.h"
159
#include "atlas_tlvl3.h"
160
#include "atlas_level2.h"
161
#include "atlas_lamch.h"
166
static const TYPE ONE = ATL_rone;
168
static const TYPE ONE[2] = {ATL_rone, ATL_rzero};
171
#define MY_align 64 /* ONLY powers of 2 work. */
174
#define MY_gemv ATL_sgemv /* L2 tuned dgemv */
175
#define MY_trmv ATL_strmv /* L2 tuned dgemv */
176
#define MY_gemvT ATL_sgemvT_L2 /* L2 tuned dgemv */
177
#define MY_gemvN ATL_sgemvN /* L2 tuned dgemv */
178
#define ATL_axpy ATL_saxpy
179
#define MY_ger ATL_sger_L2 /* L2 tuned dger */
180
#define MY_gecopy ATL_sgecopy /* Copy routine to use. */
183
#define MY_gemv_Tony ATL_dgemv_L2 /* L2 tuned dgemv */
184
#define MY_trmv ATL_dtrmv /* L2 tuned dgemv */
185
#define MY_gemvT ATL_dgemvT_L2 /* L2 tuned dgemv */
186
#define MY_gemvN ATL_dgemvN /* L2 tuned dgemv */
187
#define ATL_axpy ATL_daxpy
188
#define MY_ger ATL_dger_L2 /* L2 tuned dger */
189
#define MY_gecopy ATL_dgecopy /* Copy routine to use. */
192
#define MY_trmv ATL_ctrmv /* L2 tuned dgemv */
193
#define MY_gemvT ATL_cgemvCT_L2 /* L2 tuned dgemv */
194
#define MY_gemvN ATL_cgemvN /* L2 tuned dgemv */
195
#define ATL_axpy ATL_caxpy
196
#define MY_gemv ATL_cgemv /* L2 tuned dgemv */
197
#define MY_ger ATL_cgerc_L2 /* L2 tuned dger */
198
#define MY_gecopy ATL_cgecopy /* Copy routine to use. */
201
#define MY_trmv ATL_dtrmv /* L2 tuned dgemv */
202
#define MY_gemvT ATL_zgemvCT_L2 /* L2 tuned dgemv */
203
#define MY_gemvN ATL_zgemvN /* L2 tuned dgemv */
204
#define ATL_axpy ATL_zaxpy
205
#define MY_gemv ATL_zgemv /* L2 tuned dgemv */
206
#define MY_ger ATL_zgerc_L2 /* L2 tuned dger */
207
#define MY_gecopy ATL_zgecopy /* Copy routine to use. */
211
#define MY_TRANS CblasTrans
213
#define MY_TRANS CblasConjTrans
217
#define ATL_ger_L2 Mjoin(PATL,ger_L2)
218
#define ATL_geqr2_dnrm2 Mjoin(PATL,geqr2_dnrm2)
219
#define ATL_geqr2_LC_Setup Mjoin(PATL,geqr2_LC_Setup)
220
#define ATL_geqr2_DoCopy Mjoin(PATL,geqr2_DoCopy)
221
#define ATL_geqr2_UnCopy Mjoin(PATL,geqr2_UnCopy)
222
#define ATL_geqr2_Cache Mjoin(PATL,geqr2_Cache)
223
#define ATL_geqr2Worker Mjoin(PATL,geqr2Worker)
224
#define ATL_geqr2Worker_Zero Mjoin(PATL,geqr2Worker_Zero)
225
#define ATL_tgeqr2 Mjoin(PATL,tgeqr2)
226
#define ATL_geqr2_Order Mjoin(PATL,geqr2_Order)
228
double time00(void); /* prototype external rtn. */
229
/*----------------------------------------------------------------------------*/
230
/* rdtsc reads the time stamp counter into %rax, which will be aliased as a */
231
/* temp for the memory location 'time' which MUST be a long long location!!! */
232
/* This takes about 30 cycles to complete. */
233
/*----------------------------------------------------------------------------*/
234
#define rdtsc(time) \
235
__asm__ __volatile__ ( \
237
"shlq $32, %%rdx;\n\t" \
238
"addq %%rdx, %%rax;\n\t" \
239
: "=a"(time) : : "%rdx" );
242
/*----------------------------------------------------------------------------*/
243
/* Align the code. */
244
/*----------------------------------------------------------------------------*/
246
__asm__ __volatile__ ( \
250
//int ATL_geqr2_Order[ATL_NTHREADS]={0,4,2,6,1,5,3,7};
251
//int ATL_geqr2_Order[ATL_NTHREADS]={0,2,1,3};
253
typedef struct /* Threading structure. */
262
volatile ATL_INT dnrm2;
263
volatile ATL_INT dgemv;
269
volatile TYPE zDiag[2];
280
volatile ATL_INT dgemvt;
284
/* This code is used three times below, macro to prevent bug propagation. */
285
#define dnrm2_combine \
286
if (myTS->Scale > ptnr->Scale) /* If my scale is bigger, */ \
288
w = (ptnr->Scale)/(myTS->Scale); /* Must scale his SSQ. */ \
289
myTS->SSQ += (ptnr->SSQ)*w*w; /* Add scaled version. */ \
290
} else /* His scale is bigger. */ \
292
w = (myTS->Scale)/(ptnr->Scale); /* Must scale my SSQ. */ \
293
myTS->SSQ *= (w*w); /* Scale my SSQ first. */ \
294
myTS->SSQ += ptnr->SSQ; /* Add his to mine. */ \
295
myTS->Scale = ptnr->Scale; /* Replace my SSQ with his.*/ \
296
} /* END #define dnrm2_combine */
299
//double dgeqr2_SAFMIN=-1.;
301
/*-----------------------------------------------------------------------------
302
* ATL_dgeqr2_dnrm2: Compute a dnrm2, but retain scale and value.
303
*---------------------------------------------------------------------------*/
305
static void ATL_geqr2_dnrm2(ATL_DGEQR2_t *ts)
307
int i, N=ts->myM, N2;
313
TYPE aX, sX, SSQ=0., Scale=1.0;
314
TYPE *X=ts->A+(((ts->lda) SHIFT)*(ts->myK));
316
if (ts->rank != 0) /* If a standard compute, */
318
for (i=0; i<N2; i++) /* Do the full column. */
325
SSQ = 1.+SSQ*(Scale/aX)*(Scale/aX);
329
SSQ += (aX/Scale)*(aX/Scale);
333
} else /* This is the Rank 0 Operation */
335
i = ((ts->myK) SHIFT) +(1 SHIFT); /* Might as well shortcut. */
336
for (; i<N2; i++) /* 'i' already initialized. */
343
sX = Scale/aX; /* Do scale only once. */
344
SSQ = 1.+SSQ*sX*sX; /* Scale SSQ. */
345
Scale = aX; /* Use the new Scale. */
348
sX = aX/Scale; /* Scale the value. */
349
SSQ += sX*sX; /* Square and add to SSQ. */
357
// fprintf(stderr, "scale %f ssq %f \n", Scale, SSQ );
358
} /* END ATL_dgeqr2_dnrm2 */
362
* LC_Setup sets up copy.
364
static size_t ATL_geqr2_LC_Setup(ATL_DGEQR2_t *myTS)
366
int N = myTS->fullN; /* width of array. */
367
int M = myTS->myM; /* height of array. */
368
myTS->oldLDA = myTS->lda; /* Remember the old one. */
369
myTS->oldA = myTS->A; /* ... */
370
int LDA = (M+1)&(-2); /* My new LDA, multiple 2. */
371
myTS->lda = LDA; /* Store new LDA. */
372
size_t memSize = ((LDA SHIFT)*N*sizeof(TYPE));/* Room I need. */
373
memSize = (memSize+MY_align-1)&(-MY_align); /* As multiple of MY_align. */
374
return(memSize); /* Exit with amt needed. */
375
} /* ** END ATL_geqr2_LC_Setup ** */
379
* DoCopy works from LC_Setup.
381
static void ATL_geqr2_DoCopy(ATL_DGEQR2_t *myTS)
383
int N = myTS->fullN; /* width of array. */
384
int M = myTS->myM; /* height of array. */
387
Mjoin(PATL,gemoveT)(M, N, ONE, myTS->oldA, /* First arg is source, */
388
myTS->oldLDA, myTS->A, myTS->lda); /* This is destination. */
390
MY_gecopy(M, N, myTS->oldA, myTS->oldLDA, /* First arg is source, */
391
myTS->A, myTS->lda); /* This is destination. */
393
} /* END ATL_geqr2_DoCopy. */
397
* UnCopy is just a copy back, no free.
399
static void ATL_geqr2_UnCopy(ATL_DGEQR2_t *myTS)
401
int N = myTS->fullN; /* width of array. */
402
int M = myTS->myM; /* height of array. */
404
Mjoin(PATL,gemoveT)(N, M , ONE,myTS->A,
405
myTS->lda, myTS->oldA, myTS->oldLDA);
407
MY_gecopy(M, N, myTS->A, myTS->lda,
408
myTS->oldA, myTS->oldLDA);
410
myTS->lda = myTS->oldLDA; /* Restore the old one. */
411
myTS->A = myTS->oldA; /* ... */
412
} /* END ATL_geqr2_UnCopy. */
416
* Cache will force a read every 64th byte (eighth double) in an array,
419
static void ATL_geqr2_Cache(ATL_DGEQR2_t *myTS)
422
int N = myTS->fullN; /* width of array. */
423
int M = myTS->myM; /* height of array. */
428
for (c=N-1; c>=0; c--) /* Every column, backwards. */
430
for (r=((M-1) SHIFT); r>=0; r-=8) /* every eighth double. */
432
x = *(A+(LDA SHIFT)*c+r); /* read the variable. */
435
} /* ** END ATL_dgeqr2_Cache ** */
439
* Callback for thread launcher
441
static int Mjoin(PATL,StructIsInitGEQR2)(void *vp)
444
return(((ATL_DGEQR2_t*)vp)->active);
449
* ATL_geqr2Worker: Persistent for the duration of the DGEQR2 operation.
450
* Argument is pointer to a structure which contains our data section.
452
//void* ATL_geqr2Worker(void *myArg)
453
static void* ATL_geqr2Worker(ATL_LAUNCHSTRUCT_t *lp, void *vp)
455
ATL_thread_t *tp = vp;
456
ATL_DGEQR2_t *myTS = ((ATL_DGEQR2_t*) lp->opstruct)+tp->rank;
457
ATL_DGEQR2_t *zTS = myTS - (myTS->rank); /* Point at zero ts. */
458
int zees, pair, myRank; /* Log2 looping variables. */
459
int i,j,M,N,LDA,mScale,mUpdate,newN,KNT,LDT;
461
int myCopy = myTS->copy;
462
volatile ATL_DGEQR2_t *ptnr; /* partner log2 combine. */
463
TYPE *A, *scaleA, *T; /* Work variables. */
464
TYPE XNORM, BETA, BETAp;
465
TYPE *TAU = myTS->TAU; /* Short cut. */
467
TYPE w; /* Used in dnrm2_combine mac.*/
469
const TYPE ONE = ATL_rone;
475
const TYPE ZERO = ATL_rzero;
477
const TYPE ONE[2] = {ATL_rone, ATL_rzero};
487
const TYPE ZERO[2] = {ATL_rzero, ATL_rzero};
490
myRank = myTS->rank; /* Get my rank. */
495
/*----------------------------------------------------------------------------*/
496
/* Now we begin the real dgeqr2. */
497
/*----------------------------------------------------------------------------*/
498
N = myTS->fullN; /* panel width. */
502
ATL_geqr2_DoCopy(myTS); /* Execute it. */
505
LDA = myTS->lda; /* Load AFTER local copy. */
506
M = myTS->myM; /* Shortcut to M. */
508
for (i=0; i<N; i++) /* Now, for each column, */
510
if (myRank == 0) /* Zero follows diagonal. */
512
mScale = M-i-1; /* Special scaling value. */
513
mUpdate = M-i; /* Special gemv/ger size. */
514
A = myTS->A + (i SHIFT)+ (i* (LDA SHIFT)); /* Special pointer to A. */
516
myTS->zDiag = (*A); /* Get the diagonal. */
519
myTS->zDiag[0] = (*A);
520
myTS->zDiag[1] = *(A+1);
521
/* Replace with one now. */
525
scaleA = A+(1 SHIFT); /* What to scale. */
526
} else /* Others keep square. */
528
mScale = M; /* Always scale full col. */
529
mUpdate = M; /* Always gemv/ger full co. */
530
A = myTS->A + (i*(LDA SHIFT)); /* Point at column. */
531
scaleA = A; /* What to scale. */
534
myTS->myK = i; /* Set my K value. */
536
ATL_geqr2_dnrm2(myTS);
538
zees = myRank; /* Init the test flags. */
539
pair = 1; /* Starting pair. */
540
while ( (zees & 1) == 0 && /* If I must wait on pair, */
541
(myRank)+pair < ATL_NTHREADS) /* ..and pair exists, */
543
ptnr = myTS+pair; /* Point at my partner. */
545
if (ptnr->active == 1) /* If ptnr was used, */
547
while (ptnr->dnrm2 <= myTS->dnrm2); /* Wait for it. */
551
zees >>= 1; /* Shift a zero out. */
552
pair <<= 1; /* Double the pair idx. */
555
myTS->dnrm2++; /* Signal I am done. */
557
/*************************************/
558
/****** S Y N C P O I N T ********/
559
/****** S Y N C P O I N T ********/
560
/****** S Y N C P O I N T ********/
561
/*************************************/
563
/*-----------------------------------------------------------------*/
564
/* Note: We can avoid syncing immediately after the DGER, but DGER */
565
/* does use zTS->WORK as its vector. So here is the point where we */
566
/* have to make sure everybody's DGER is done, and they have done */
567
/* their norm2 work. All threads converge here for the next column */
568
/* and so will be done with their DGER, and zTS->WORK is free. */
569
/*-----------------------------------------------------------------*/
570
while (zTS->dnrm2 < myTS->dnrm2); /* Wait for zero to finish. */
572
/*-------------------------------------------------------------*/
573
/* At this point, zTS->SSQ is computed. If it is zero, then no */
574
/* rotation is needed, and TAU should be set to zero, and we */
575
/* just skip to the next column. */
576
/* HOWEVER, core zero may be fast on the next compare, and */
577
/* change zTS->SSQ before some other core gets a chance to see */
578
/* it was zero. So we must have a barrier here before we go on.*/
580
/* In Complex we require both SSQ == 0 and IMAG(A[i,i]) == 0. */
581
/*-------------------------------------------------------------*/
582
/* If H should be Identity, set TAU to zero and go to next column. */
586
if (zTS->SSQ == 0. && zTS->zDiag[1] == 0.)
587
#endif /* REAL or CPLX */
589
zees = myRank; /* Init the test flags. */
590
pair = 1; /* Starting pair. */
591
while ( (zees & 1) == 0 && /* If I must wait on pair, */
592
(myRank)+pair < ATL_NTHREADS) /* ..and pair exists, */
594
ptnr = myTS+pair; /* Point at my partner. */
596
if (ptnr->active == 1) /* If ptnr was used, */
598
while (ptnr->dnrm2 <= myTS->dnrm2); /* Wait for it. */
601
zees >>= 1; /* Shift a zero out. */
602
pair <<= 1; /* Double the pair idx. */
605
if (myRank == 0) /* Restore A[i,i] we replaced w 1.*/
609
TAU[i] = 0.; /* clear TAU so H[i]=Identity. */
611
*(A ) = zTS->zDiag[0];
612
*(A+1) = zTS->zDiag[1];
613
TAU[(i SHIFT)] = 0.; /* clear TAU so H[i]=Identity. */
614
TAU[(i SHIFT)+1] = 0.;
618
myTS->dnrm2++; /* Signal I am done. */
619
while (zTS->dnrm2 < myTS->dnrm2); /* Wait for zero to finish. */
620
continue; /* ..Proceed to next column. */
621
} /* END if H=Identity no need to process column. */
623
/*----------------------------------------------------------------------------*/
624
/* Here, H is not identity, we 'continued' the loop if it was. */
625
/*----------------------------------------------------------------------------*/
626
XNORM = (zTS->Scale)*sqrt(zTS->SSQ); /* Compute the norm. */
629
/*----------------------------------------------------------------------------*/
630
/* The following code is inlined from ATL_larfg; the main difference is */
631
/* that we use zTS->zDiag instead of ALPHA, and recompute is parallel. */
632
/*----------------------------------------------------------------------------*/
633
BETAp = ATL_lapy2((zTS->zDiag), XNORM); /* Get sqrt(a^2+b^2) */
634
BETA = BETAp; /* Assume diagonal < 0 ... */
635
if ((zTS->zDiag) >= 0.) BETA = -BETAp; /* .. If >= 0, change sign. */
637
KNT = 0; /* Init power to zero. */
638
if (BETAp < ATL_laSAFMIN)
640
RSAFMN = ATL_rone / ATL_laSAFMIN; /* Set a maximum. */
642
/*---------------------------------------------------------------*/
643
/* BETAp is the same for all cores, so this loop can be executed */
644
/* independently. However, XNORM must be computed in concert. */
645
/* The new BETAp will be at most 1, at least SAFMIN. */
646
/*---------------------------------------------------------------*/
647
while (BETAp < ATL_laSAFMIN)
649
KNT++; /* increment power. */
650
if ( myTS->active == 1) /* If I am active, */
652
cblas_scal(mScale, RSAFMN,
653
scaleA, 1); /* Scale my share. */
658
if (myRank==0) zTS->zDiag *= RSAFMN; /* Only done by core 0! */
661
ATL_geqr2_dnrm2(myTS); /* Do my share of new norm2. */
663
zees = myRank; /* Init the test flags. */
664
pair = 1; /* Starting pair. */
665
while ( (zees & 1) == 0 && /* If I must wait on pair, */
666
(myRank)+pair < ATL_NTHREADS) /* ..and pair exists, */
668
ptnr = myTS+pair; /* Point at my partner. */
670
if (ptnr->active == 1) /* If ptnr was used, */
672
while (ptnr->dnrm2 <= myTS->dnrm2); /* Wait for it. */
676
zees >>= 1; /* Shift a zero out. */
677
pair <<= 1; /* Double the pair idx. */
680
myTS->dnrm2++; /* Signal I am done. */
682
/*************************************/
683
/****** S Y N C P O I N T ********/
684
/****** S Y N C P O I N T ********/
685
/****** S Y N C P O I N T ********/
686
/*************************************/
687
while (zTS->dnrm2 < myTS->dnrm2); /* Wait on zero to finish. */
689
XNORM = (zTS->Scale)*sqrt(zTS->SSQ); /* Compute the norm. */
690
BETAp = ATL_lapy2((zTS->zDiag), XNORM); /* Get sqrt(a^2+b^2) */
691
BETA = BETAp; /* Assume diagonal < 0 ... */
692
if ((zTS->zDiag) >= 0.) BETA = 0.-BETAp; /* ..If >= 0, change sign. */
695
myTAUi = (BETA-(zTS->zDiag)) / BETA; /* Compute TAU[i]. */
696
if (myRank == 0) TAU[i] = myTAUi; /* Set if I own TAU. */
698
sc = ATL_rone/((zTS->zDiag)-BETA); /* Find scaling factor. */
699
if ( myTS->active == 1)
701
cblas_scal(mScale, sc, scaleA, 1); /* Scale the vector. */
704
if (myRank == 0) /* If I own diagonal, */
706
AII = BETA; /* Set new A[i,i] element. */
707
for (j=0; j<KNT; j++) /* Rescaling loop... */
708
AII *= ATL_laSAFMIN; /* ...Adjust it. */
711
#else /* COMPLEX VERSION of LARFG, modeled on clarfg.f. */
712
ALPHAR = zTS->zDiag[0]; /* Real portion. */
713
ALPHAI = zTS->zDiag[1]; /* Imaginary portion. */
715
BETAp = ATL_lapy3(ALPHAR, ALPHAI, XNORM); /* sqrt(a^2+b^2,c^2) */
716
BETA = BETAp; /* Assume ALPHAR < 0.*/
717
if (ALPHAR >= 0.) BETA = -BETAp; /* If >=0, Change sign. */
719
KNT = 0; /* Init power to zero. */
720
if (BETAp < ATL_laSAFMIN)
722
RSAFMN[0] = ATL_rone/ATL_laSAFMIN; /* Set a maximum. */
723
RSAFMN[1] = ATL_rzero; /* ..for scaling. */
725
/*---------------------------------------------------------------*/
726
/* BETAp is the same for all cores, so this loop can be executed */
727
/* independently. However, XNORM must be computed in concert. */
728
/* The new BETAp will be at most 1, at least SAFMIN. */
729
/*---------------------------------------------------------------*/
730
while (BETAp < ATL_laSAFMIN)
732
KNT++; /* increment power. */
733
if ( myTS->active == 1) /* If I am active, */
735
cblas_scal(mScale, RSAFMN,
736
scaleA, 1); /* Scale my share. */
745
ATL_geqr2_dnrm2(myTS); /* Do my share of new norm2. */
747
zees = myRank; /* Init the test flags. */
748
pair = 1; /* Starting pair. */
749
while ( (zees & 1) == 0 && /* If I must wait on pair, */
750
(myRank)+pair < ATL_NTHREADS) /* ..and pair exists, */
752
ptnr = myTS+pair; /* Point at my partner. */
754
if (ptnr->active == 1) /* If ptnr was used, */
756
while (ptnr->dnrm2 <= myTS->dnrm2); /* Wait for it. */
760
zees >>= 1; /* Shift a zero out. */
761
pair <<= 1; /* Double the pair idx. */
764
myTS->dnrm2++; /* Signal I am done. */
766
/*************************************/
767
/****** S Y N C P O I N T ********/
768
/****** S Y N C P O I N T ********/
769
/****** S Y N C P O I N T ********/
770
/*************************************/
771
while (zTS->dnrm2 < myTS->dnrm2); /* Wait on zero to finish.*/
773
XNORM = (zTS->Scale)*sqrt(zTS->SSQ); /* Compute the norm. */
774
BETAp = ATL_lapy3(ALPHAR, ALPHAI, XNORM); /* sqrt(a^2+b^2,c^2) */
775
BETA = BETAp; /* Assume ALPHAR < 0.*/
776
if (ALPHAR >= 0.) BETA = 0.-BETAp; /* If >=0, Change sign. */
779
myTAUi[0] = (BETA-ALPHAR) / BETA; /* Compute real part. */
780
myTAUi[1] = (0.-ALPHAI) / BETA; /* Compute imag part. */
782
ALPHADIV[0] = ALPHAR - BETA; /* prepare for 1/(alpha-beta) */
783
ALPHADIV[1] = ALPHAI; /* ... */
784
ATL_ladiv(ONE, ALPHADIV, sc); /* compute scaling factor. */
785
if (myTS->active == 1) /* If I have some of vector, */
787
cblas_scal(mScale, sc, scaleA, 1); /* ..scale it. */
790
if (myRank == 0) /* If I own TAU, */
792
TAU[(i SHIFT)] = myTAUi[0];
794
TAU[(i SHIFT) +1] = -myTAUi[1]; /* LQ2 needs conjugate. */
796
TAU[(i SHIFT) +1] = myTAUi[1]; /* otherwise use normal. */
800
if (myRank == 0) /* If I own diagonal, */
802
for (j=0; j<KNT; j++) BETA *= ATL_laSAFMIN; /* Rescale BETA. */
803
AII[0] = BETA; /* save for later. */
806
#endif /* COMPLEX version of larfg. */
808
/*----------------------------------------------------------------------------*/
809
/* Now we apply dlarf, if we are not on the last column. This is */
810
/* a dgemv, followed by a dger, all presuming TAU is non-zero. */
811
/* The DGEMV: Column major, transpose A. */
813
/* We must compute H(i)*C, where C is the trailing part of our panel */
814
/* at A[i..M-1, (i+1)..N-1]. So, C is (M-i) x (N-i-1). */
815
/* H(i) = (I-TAU[i]* u * (transpose u)), by definition. */
816
/* Where 'u' = A[i..M-1, i]. So, u is (M-i) x 1. We compute H(i)*C */
817
/* as C - TAU[i] * u * (transpose w), where w = (transpose C) * u */
818
/* (so that (transpose w) = (transpose u) * C.) */
819
/* Thus, w is (N-i-1) x 1. */
821
/* Now, (transpose C) * u is just a GEMV. It produces a vector of */
822
/* (N-i-1) elements. Every core will produce its own copy, and they */
823
/* must be added together. Pictorially, the pieces look like this: */
825
/* B R R R R R R R R Q = finished part of column */
826
/* Q B R R R R R R R R = upper triangular part for column */
827
/* Q Q B R R R R R R B = Betas (norm 2) stored on diagonal, */
828
/* Q Q Q 1 C C C C C 1 = forced unit (normally assumed) */
829
/* Q Q Q u C C C C C u = vector that H(i) is computed by, */
830
/* Q Q Q u C C C C C C = portion of panel to be updated. */
832
/* . . . The '1' will later be replaced by BETA for */
833
/* . . . the column 'u'. */
834
/* Q Q Q u C C C C C */
836
/* The second part, C += -TAU[i] * u * (transpose w), is just a GER. */
837
/* Each core can do its part independently. We are essentially */
838
/* dividing on M, not N, so every core needs (transpose w). */
840
/*----------------------------------------------------------------------------*/
843
if (i < (N-1) && myTAUi != 0.) /* If dlarf necessary, */
845
if (i < (N-1) ) /* If dlarf necessary, */
848
newN = N-i-1; /* Width of update array & vect w.*/
849
MY_gemvT(mUpdate, newN,
850
ONE, A+(LDA SHIFT), LDA, A, 1, ZERO, myTS->WORK, 1);
852
/*----------------------------------------------------------------------------*/
853
/* Now combine with other threads. */
854
/*----------------------------------------------------------------------------*/
855
zees = myRank; /* Init the test flags. */
856
pair = 1; /* Starting pair. */
857
while ( (zees & 1) == 0 && /* If I must wait on pair, */
859
ATL_NTHREADS) /* ..and pair exists, */
861
ptnr = myTS+pair; /* Point at my partner. */
862
if (ptnr->active == 1) /* If partner was used, */
864
while (ptnr->dgemv < i); /* Wait for it. */
865
ATL_axpy(newN, ONE, ptnr->WORK, 1, myTS->WORK, 1);
868
zees >>= 1; /* Shift a zero out. */
869
pair <<= 1; /* Double the pair idx. */
872
myTS->dgemv = i; /* Say I finished dgemv. */
874
/***********************************/
875
/****** S Y N C P O I N T ******/
876
/****** S Y N C P O I N T ******/
877
/****** S Y N C P O I N T ******/
878
/***********************************/
879
/*---------------------------------*/
880
/* We can't start GER until all of */
881
/* 'w' is finished. Wait for zero. */
882
/*---------------------------------*/
883
while (zTS->dgemv < i); /* Wait for zero to build 'w'. */
885
/*---------------------------------*/
886
/* 'w' now in WORK. Use for GER. */
887
/*---------------------------------*/
889
MY_ger(mUpdate, newN, 0.-myTAUi, A, 1, zTS->WORK, 1,
892
negTAUi[0]= 0.0 - myTAUi[0];
893
negTAUi[1]= 0.0 + myTAUi[1]; /* conjugate for complex */
895
MY_ger(mUpdate, newN, negTAUi, A, 1, zTS->WORK, 1,
899
/*-----------------------------------------------*/
900
/* Once we finish it is safe for us to start our */
901
/* next column and dnrm2 on our share. We will */
902
/* sync up with other threads to complete that. */
903
/*-----------------------------------------------*/
904
} /* END IF we needed to apply dlarf. */
907
if (myRank == 0) *A = AII; /* Core 0, restore diag now. */
911
*(A ) = AII[0]; /* Core 0, restore diag now. */
912
*(A + 1) = AII[1]; /* Core 0, restore diag now. */
917
* for computing T, for LQ replace myTAUi with correct TAU for the
923
myTAUi[1] = 0.0 - myTAUi[1];
927
#ifdef TREAL /* TODO change later */
928
if (myTS->buildT && i == 0) /* Simple store will work. */
929
*T = myTAUi; /* Just store it. */
931
if(myTS->buildT && i == 0) /* Simple store will work. */
938
if (myTS->buildT && i > 0) /* If I must work for T, */
940
/*----------------------------------------------------------------------------*/
941
/* Building T is very similar to DLARF, except we use the other */
942
/* side of A. Here is the picture: */
944
/* B R R R R R R R R Q = finished part of column */
945
/* Q B R R R R R R R R = upper triangular part for column */
946
/* Q Q B R R R R R R B = Betas (norm 2) stored on diagonal, */
947
/* Q Q Q 1 C C C C C 1 = forced unit (normally assumed) */
948
/* Q Q Q u C C C C C u = vector that H(i) is computed by, */
949
/* Q Q Q u C C C C C C = portion of panel to be updated. */
953
/* Q Q Q u C C C C C */
955
/* We must compute Q^T times u, so each thread does its part, and */
956
/* then we add them together. We don't know at this point if all */
957
/* threads have completed their dger, so we can't use WORK. But */
958
/* we can use WORK+N-i, because it is not in use at this time. So */
959
/* we build the vector in zTS->WORK+N-i, it is 'i' elements long, */
960
/* and then zero will copy that into the column T[0,i]. */
962
/* From there, we let thread 0 (alone) do a DTRMV to update that */
963
/* vector with the previous T, and then store TAU[i] at T[i,i]. */
965
/* This presumes that the DTRMV is too small to parallelize; but */
966
/* if that assumption is wrong the DTRMV could be parallelized as */
967
/* well, in future work. */
969
/* We must sync to add up the DGEMV for T, not after that. */
971
/* Rank 0: A points at A[i,i] mUpdate=M-i A-i*LDA=A[i,0] */
972
/* Rank X: A points at A[0,i] mUpdate=M A-i*LDA=A[0,0] */
973
/*----------------------------------------------------------------------------*/
975
if (myRank == 0) /* Special case... */
978
AII = *A; /* Save diagonal element. */
979
*A = 1.0; /* Force to 1. */
983
*(A ) = 1.0; /* Force to 1. */
984
*(A + 1) = 0.0; /* Force to 1. */
988
int os=(N+3)&(-4); /* Find even offset into work. */
991
MY_gemvT(mUpdate, i, 0.-myTAUi,
992
A-i*LDA, LDA, A, 1, 0.0, myTS->WORK+os, 1);
995
negTAUi[0]= 0.0 + myTAUi[0];
996
negTAUi[1]= 0.0 - myTAUi[1]; /* conj for cplx Not required */
998
negTAUi[0]= 0.0 - myTAUi[0];
999
negTAUi[1]= 0.0 - myTAUi[1]; /* conj for cplx Not required */
1001
MY_gemvT(mUpdate, i, negTAUi, A-(i*(LDA SHIFT)), LDA,
1002
A, 1, ZERO, myTS->WORK+(os SHIFT), 1);
1005
for (i_loop = 0; i_loop < i; i_loop++)
1007
(myTS->WORK+(os SHIFT))[(i_loop SHIFT) + 0] =
1008
0.0 - (myTS->WORK+(os SHIFT))[(i_loop SHIFT) + 0];
1013
/*----------------------------------------------------------------------------*/
1014
/* Now combine with other threads. */
1015
/*----------------------------------------------------------------------------*/
1016
zees = myRank; /* Init the test flags. */
1017
pair = 1; /* Starting pair. */
1018
while ( (zees & 1) == 0 && /* If I must wait on pair, */
1020
ATL_NTHREADS) /* ..and pair exists, */
1022
ptnr = myTS+pair; /* Point at my partner. */
1023
if (ptnr->active == 1) /* If partner was used, */
1025
while (ptnr->dgemvt < i); /* Wait for it. */
1026
ATL_axpy(i, ONE, ptnr->WORK+(os SHIFT), 1,
1027
myTS->WORK+(os SHIFT), 1);
1030
zees >>= 1; /* Shift a zero out. */
1031
pair <<= 1; /* Double the pair idx. */
1034
myTS->dgemvt = i; /* Post my completion. */
1036
/*----------------------------------------------------------------------------*/
1037
/* Done with dgemv part, rest is */
1038
/* all for thread 0 to get done. */
1039
/*----------------------------------------------------------------------------*/
1042
TYPE *src=zTS->WORK+(os SHIFT); /* Source vector. */
1043
TYPE *dst=T+(i*(LDT SHIFT)); /* Destination vector. */
1045
*A = AII; /* Restore saved value. */
1047
*(A) = AII[0]; /* Restore saved value. */
1048
*(A +1) = AII[1]; /* Restore saved value. */
1050
for (j=0; j<(i SHIFT); j++)
1052
*dst++ = *src++; /* Copy value. */
1055
cblas_trmv(CblasColMajor, CblasUpper, CblasNoTrans, CblasNonUnit,
1056
i, T, LDT, T+(i*(LDT SHIFT)), 1);
1058
/* Force TAU[i] on diagonal. */
1060
*(T+i+i*LDT)=myTAUi;
1062
*(T+(i SHIFT)+(i*(LDT SHIFT)) ) = myTAUi[0];
1063
*(T+(i SHIFT)+(i*(LDT SHIFT)) + 1) = myTAUi[1];
1066
} /* END IF zero must update T. */
1067
} /* END IF building T. */
1068
} /* END FOR each column. */
1071
/*----------------------------------------------------------------------------*/
1072
/* If we copied, this will copy back. */
1073
/*----------------------------------------------------------------------------*/
1076
ATL_geqr2_UnCopy(myTS); /* Do my copy back. */
1079
return(NULL); /* Implicit thread exit. */
1080
} /* END ATL_geqr2Worker() */
1083
/**************************************************************************** */
1084
/********************** M A S T E R C O N T R O L L E R ******************* */
1085
/**************************************************************************** */
1087
/*----------------------------------------------------------------------------*/
1088
/* The break-up: We divide on M, simply enough, in multiples of 4. If there */
1089
/* are less than 32*ATL_NTHREADS elements, we will not divide at all. */
1091
/* Processor 0 is our master combiner. The jobs are to compute the norm, */
1092
/* and then apply the reflector to the array. */
1094
/* To compute the norm, threads need SAFMIN. We compute it before any thread */
1095
/* launches if it is not already computed. */
1097
/*----------------------------------------------------------------------------*/
1099
#define ATL_tgexx2 Mjoin(PATL,tgelq2)
1101
#define ATL_tgexx2 Mjoin(PATL,tgeqr2)
1104
/*----------------------------------------------------------------------------*/
/* ATL_tgexx2: master controller for the threaded GEQR2/GELQ2 panel           */
/* factorization (Parallel Cache Assignment scheme; see the Castaldo/Whaley  */
/* PPoPP'10 reference in the file header).  It splits the M rows of the      */
/* panel across up to ATL_NTHREADS workers (data-owner split), sets up one   */
/* ATL_DGEQR2_t descriptor per worker, allocates per-thread copy/work        */
/* memory, and launches ATL_geqr2Worker on each share.                       */
/*                                                                            */
/* NOTE(review): this chunk appears to be a damaged extraction: the original */
/* file's line numbers are interleaved as bare integer lines, and several    */
/* lines (guarding if-tests, braces, #ifdef arms) are missing.  Code below   */
/* is left byte-identical; only comments are added.                          */
/*                                                                            */
/* Parameters (as visible here):                                              */
/*   M, N     : panel dimensions (M rows split across threads, N columns).   */
/*   A, LDA   : the panel and its leading dimension.                          */
/*   TAU      : Householder scalar output, shared by all threads.            */
/*   WORK     : workspace passed to the serial fallback.                      */
/*   ws_T,LDT : T-factor workspace and leading dimension (used if buildT).   */
/*   WORKM    : workspace parameter; not referenced in the visible lines.    */
/*   buildT   : flag forwarded to workers: also accumulate the T factor.     */
/*   myCopy   : flag forwarded to workers: whether PCA should copy data.     */
/* Returns 0.                                                                 */
/*----------------------------------------------------------------------------*/
int ATL_tgexx2(int M, int N, TYPE *A, int LDA, TYPE *TAU, TYPE *WORK,
1105
TYPE *ws_T, int LDT, TYPE *WORKM, int buildT, int myCopy)
1108
ATL_DGEQR2_t ts[ATL_NTHREADS];
1111
// fprintf (stderr,"______in %s M=%d N=%d \n", Mstr(ATL_tgexx2), M,N);
1114
/* Argument sanity reports.  The if-tests that guard these fprintf calls     */
/* (and their early returns) are among the lines missing from this chunk.    */
fprintf(stderr, "%s: M<0 (%i)\n", Mstr(ATL_tgexx2), M);
1120
fprintf(stderr, "%s: N<0 (%i)\n", Mstr(ATL_tgexx2), N);
1125
if (LDA < N)                              /* N is the original M */
1127
fprintf(stderr, "%s: LDA<M (%i, %i)\n", Mstr(ATL_tgexx2), LDA, M);
1133
/* NOTE(review): same message appears twice; presumably the two copies sit   */
/* in the LQ/QR arms of an #ifdef that is not visible here -- confirm.       */
fprintf(stderr, "%s: LDA<M (%i, %i)\n", Mstr(ATL_tgexx2), LDA, M);
1139
/*----------------------------------------------------------------------------*/
1140
/* Preliminaries are done; now split up the problem. */
1141
/* We use a data-owner split, each thread gets 1/p of the data, */
1142
/* and does all computation related to it. */
1143
/*----------------------------------------------------------------------------*/
1144
TYPE *myA = A, *myOldA = A, *allMem, *workMem;
1145
int i, j, k, b0, b, th;
1146
long unsigned int CPU;
1147
size_t mem[ATL_NTHREADS], totmem, workSize;
1149
/* Thread count: roughly one thread per N rows (ceil(M/N)), at least 1,      */
/* capped at ATL_NTHREADS.                                                   */
th = ((M+N-1)/N); /* Max number of threads. */
1150
if (th==0) th=1; /* Avoid divide by zero. */
1151
if (th > ATL_NTHREADS) th=ATL_NTHREADS; /* Limit on top. */
1153
b0 = (M/th); /* Find part for th zero. */
1154
if (b0 < N) b0 = N; /* Take at least N. */
1156
if (th !=1) /* If multiple threads, */
1157
/* The & (-4) mask rounds each non-root share b down to a multiple of 4.     */
b = ((M-b0) / (th-1) ) & (-4); /* Split up the rest. */
1158
b0 = M - (th-1)*b; /* Leftovers go to b0. */
1159
if (b0 > b && b0 >= (N+(th-1)*4)) /* If b0 is biggest and can be smaller, */
1161
b += 4; /* Make the others slightly bigger. */
1162
b0 -= (th-1)*4; /* Core 0 has more overhead, do less. */
1166
/* Serial fallback when splitting is pointless (single thread or N > M).     */
/* Both the LQ and QR serial paths appear below; presumably an #ifdef not    */
/* visible in this chunk selects one of them -- TODO confirm.                */
if (th == 1 || (N> M)) /* If impossible to split, use serial. */
1170
ATL_gelq2(N, M, A, LDA, TAU, WORK); /* Use serial version. */
1173
ATL_larft(LAForward, LARowStore, M, N, A, LDA, TAU, ws_T, LDT);
1176
ATL_geqr2(M, N, A, LDA, TAU, WORK);
1179
ATL_larft(LAForward, LAColumnStore, M, N, A, LDA, TAU, ws_T, LDT);
1183
return(0); /* Exit after panel. */
1186
/*----------------------------------------------------------------------------*/
1187
/* Fill out the thread work areas. */
1188
/*----------------------------------------------------------------------------*/
1189
for (i=0; i<ATL_NTHREADS; i++)
1191
ts[i].active=0; /* Nobody is active yet. */
1192
mem[i] = 0; /* Nobody needs memory. */
1195
/* Core 0's descriptor: it owns the top b0 rows and (per the banner above)   */
/* acts as the master combiner for the other workers.                        */
ts[0].fullM = M; /* Need full size M. */
1196
ts[0].fullN = N; /* Need full size N. */
1197
ts[0].myM = b0; /* Core 0 is special. */
1198
ts[0].myN = N; /* Width is same for all. */
1199
ts[0].myK = 0; /* First k for dnrm2. */
1200
ts[0].lda = LDA; /* LDA is same for all. */
1201
ts[0].rank = 0; /* Rank used by core 0. */
1202
ts[0].A = myA; /* Core 0 gets top of A. */
1203
ts[0].TAU = TAU; /* TAU is same for all. */
1204
ts[0].dnrm2 = -1; /* Not done yet. */
1205
ts[0].dgemv = -1; /* Not done yet. */
1206
ts[0].active = 1; /* We are active. */
1207
ts[0].buildT = buildT; /* Pass in decision var. */
1208
ts[0].T = ws_T; /* Pass in matrix addr. */
1209
ts[0].LDT = LDT; /* And leading dimension. */
1210
ts[0].dgemvt = -1; /* Not done yet. */
1211
ts[0].copy = myCopy; /* Whether PCA should copy. */
1213
/* Advance past core 0's share: by b0 columns (LQ) or b0 rows (QR).  Only    */
/* one of these two increments can be live; the selecting #ifdef is missing  */
/* from this chunk.                                                          */
myA += (b0 SHIFT)*LDA; /* Point at next A, LQ. */
1215
myA += (b0 SHIFT); /* Point at next A, QR. */
1218
for (i=1; i < th; i++)
1220
ts[i].fullM = b; /* Remember whole M. */
1221
ts[i].fullN = N; /* Need full size N. */
1222
ts[i].myM = b; /* 'b' entries for all. */
1223
ts[i].myN = N; /* Width is same for all. */
1224
ts[i].myK = 0; /* First k for dnrm2. */
1225
ts[i].lda = LDA; /* LDA is same for all. */
1226
ts[i].rank = i; /* Rank of process. */
1227
ts[i].A = myA; /* Point at share of A. */
1228
ts[i].TAU = TAU; /* TAU is same for all. */
1229
ts[i].dnrm2 = -1; /* Not done yet. */
1230
ts[i].dgemv = -1; /* Not done yet. */
1231
ts[i].active = 1; /* Indicate active. */
1232
ts[i].buildT = buildT; /* Pass in decision var. */
1233
ts[i].dgemvt = -1; /* Not done yet. */
1234
ts[i].copy = myCopy; /* Whether PCA should copy. */
1236
/* Same LQ/QR pointer-advance pair as for core 0; see note above.            */
myA += (b SHIFT)*LDA; /* Point at next share, LQ. */
1238
myA += (b SHIFT); /* Point at next share, QR. */
1242
/*----------------------------------------------------------------------------*/
1243
/* Deal with memory. */
1244
/*----------------------------------------------------------------------------*/
1247
totmem=MY_align; /* Needed for alignment. */
1248
for (i=0; i<th; i++)
1250
mem[i] = ATL_geqr2_LC_Setup(ts+i); /* Find necessary memory. */
1251
totmem += mem[i]; /* Add to total. */
1254
/* NOTE(review): malloc result is used without a NULL check (here and for    */
/* workMem below) -- an allocation-failure check may live in missing lines.  */
allMem = malloc(totmem); /* Allocate the memory. */
1256
/* Align to MY_align; the &(-MY_align) mask presumably relies on MY_align    */
/* being a power of two -- confirm.  Note this rounding adds +MY_align while */
/* the workMem rounding below adds +MY_align-1; worth confirming the         */
/* asymmetry is intentional.                                                 */
ts[0].A = (TYPE*) (((size_t) allMem+MY_align)&(-MY_align));
1257
for (i=1; i<th; i++) /* Each thread takes.. */
1259
ts[i].A = (TYPE*) ((size_t) ts[i-1].A + mem[i-1]); /* ..next block. */
1263
workSize = ((2 SHIFT)*(N+4)*sizeof(TYPE) +
1264
MY_align-1)&(-MY_align); /* aligned. */
1265
totmem = MY_align + workSize*ATL_NTHREADS; /* Find mem to alloc. */
1266
workMem = malloc(totmem);
1267
ts[0].WORK = (TYPE*) (((size_t) workMem + MY_align-1)&(-MY_align));
1268
for (i=1; i<th; i++)
1269
ts[i].WORK = (TYPE*) ((size_t) ts[i-1].WORK + workSize);
1271
/*----------------------------------------------------------------------------*/
1272
/* Call ATL_launcher to launch thread which runs in different CPUs cores */
1273
/*----------------------------------------------------------------------------*/
1275
/* Run th copies of ATL_geqr2Worker over the ts[] descriptors; returns when  */
/* all workers are done.                                                     */
ATL_goparallel(th, ATL_geqr2Worker, ts, NULL);
1278
/* NOTE(review): allMem is freed only when local_copy is defined; if it is   */
/* allocated unconditionally (as the visible lines suggest) this leaks in    */
/* the non-copy build -- confirm against the missing #ifdef structure.       */
#if defined(local_copy)
1279
free(allMem); /* release copied area. */
1280
#endif /* defined(local_copy) */
1281
free(workMem); /* release work area. */
1282
return(0); /* Done with dgeqr2. */
1283
} /* END ATL_t_dgeqr2. */