1
/* ---------------------------------------------------------------------
3
* -- Automatically Tuned Linear Algebra Software (ATLAS)
4
* (C) Copyright 2000 All Rights Reserved
6
* -- ATLAS routine -- Version 3.2 -- December 25, 2000
8
* Author : Antoine P. Petitet
9
* Originally developed at the University of Tennessee,
10
* Innovative Computing Laboratory, Knoxville TN, 37996-1301, USA.
12
* ---------------------------------------------------------------------
14
* -- Copyright notice and Licensing terms:
16
* Redistribution and use in source and binary forms, with or without
17
* modification, are permitted provided that the following conditions
20
* 1. Redistributions of source code must retain the above copyright
21
* notice, this list of conditions and the following disclaimer.
22
* 2. Redistributions in binary form must reproduce the above copyright
23
* notice, this list of conditions, and the following disclaimer in
24
* the documentation and/or other materials provided with the distri-
26
* 3. The name of the University, the ATLAS group, or the names of its
27
* contributors may not be used to endorse or promote products deri-
28
* ved from this software without specific written permission.
32
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
33
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
34
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
35
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY
36
* OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPE-
37
* CIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
38
* TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
39
* OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEO-
40
* RY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (IN-
41
* CLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
42
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
44
* ---------------------------------------------------------------------
49
#include "atlas_ptmisc.h"
51
DIM_TZSPLIT_T ATL_tzsplit
53
const enum ATLAS_UPLO UPLO,
54
const unsigned int NT,
69
* ATL_tzsplit splits a trapezoidal matrix into two matrices of the same
70
* kind. The initial input matrix sizes are specified by M, N and K:
73
* ______________ _____________
76
* | __|__________| |____________|__\
79
* \|__________| |____________|__|
81
* UPLO = AtlasLower UPLO = AtlasUpper
83
* This function returns which dimension should be split if any, as well
84
* as the number of threads to be used for each submtrix.
89
* UPLO (input) const enum ATLAS_UPLO
90
* On entry, UPLO specifies whether the matrix is upper or
93
* NT (input) const unsigned int
94
* On entry, NT specifies the initial total number of threads.
97
* On entry, M specifies the number of complete rows of the
100
* N (input) const int
101
* On entry, N specifies the number of complete columns of the
102
* trapezoidal matrix.
104
* K (input) const int
105
* On entry, K specifies the size of the triangular submatrix.
107
* NB (input) const int
108
* On entry, NB specifies the blocking factor to be used along
111
* NT1 (output) unsigned int *
112
* On exit, NT1 specifies the number of threads to be used for
113
* the first part of the problem size MNK1.
115
* NT2 (output) unsigned int *
116
* On exit, NT2 specifies the number of threads to be used for
117
* the second part of the problem size MNK2.
119
* MNK1 (output) int *
120
* On exit, MNK1 specifies the length of the problem size to be
121
* run on the NT1 threads.
123
* MNK1 (output) int *
124
* On exit, MNK2 specifies the length of the problem size to be
125
* run on the NT2 threads.
127
* ---------------------------------------------------------------------
130
* .. Local Variables ..
133
double gacol=0.0, garow=0.0, k, k2, m, mnk1, mnk2,
134
n, nt, nt1, nt2, p, r, rcol=0.0, rrow=0.0,
135
s, tmnk1, tmnk2, work;
136
DIM_TZSPLIT_T split = AtlasTzNoSplit;
137
int kblks, mblks, nblks, nbm1, ntm1;
139
* .. Executable Statements ..
143
mblks = (M+nbm1) / NB; nblks = (N+nbm1) / NB; kblks = (K+nbm1) / NB;
144
r = (double)(mblks + kblks) * (double)(kblks + nblks);
146
if( ( r < 4.0 ) || ( NT < 2 ) ) return( split );
148
m = (double)(M); n = (double)(N); k = (double)(K);
149
k2 = k * k; nt = (double)(NT); ntm1 = NT - 1;
150
work = 2.0 * ( ( k + m ) * n + ( k * m ) ) + k2;
152
if( ( M > 0 ) || ( K > 0 ) ) /* Column split */
156
gacol = ( 2.0 * nt * n * s - work ) / ( nt * s );
158
if( ( N > 1 ) && ( gacol >= 0.0 ) )
160
mnk1 = ( ( N - (int)(gacol / 2.0) + nbm1 ) / NB ) * NB;
161
mnk1 = Mmin( mnk1, N );
162
mnk1 = Mmax( mnk1, 1 );
165
tmnk1 = (double)(mnk1);
166
tmnk2 = (double)(mnk2);
168
nt1 = (int)floor( ( ( 2 * s * tmnk1 ) / work ) * nt + 0.5 );
169
nt2 = NT - ( nt1 = Mmin( nt1, ntm1 ) );
171
rcol = ( ( 2.0 * s * tmnk1 ) / (double)(nt1) ) -
172
( ( 2.0 * ( s * tmnk2 + p ) + k2 ) / (double)(nt2) );
174
if( UPLO == AtlasLower )
176
mnks[0] = mnk1; mnks[1] = mnk2;
177
nts [0] = nt1; nts [1] = nt2;
181
mnks[0] = mnk2; mnks[1] = mnk1;
182
nts [0] = nt2; nts [1] = nt1;
187
mnk1 = ( ( (int)(s * ( 1.0 - sqrt( 1.0 + ( gacol / s ) ) )) +
189
mnk1 = Mmin( mnk1, K );
190
mnk1 = Mmax( mnk1, 1 );
193
tmnk1 = (double)(mnk1);
194
tmnk2 = (double)(mnk2);
196
r = work - ( tmnk2 * ( tmnk2 + ( 2.0 * m ) ) );
197
nt1 = (int)floor( ( r / work ) * nt + 0.5 );
198
nt2 = NT - ( nt1 = Mmin( nt1, ntm1 ) );
200
rcol = 2.0 * ( s * n + tmnk1 * ( m + tmnk2 ) ) +
202
rcol /= (double)(nt1);
203
rcol -= ( tmnk2 * ( 2.0 * m + tmnk2 ) ) / (double)(nt2);
205
if( UPLO == AtlasLower )
207
mnks[2] = mnk1; mnks[3] = mnk2;
208
nts [2] = nt1; nts [3] = nt2;
212
mnks[2] = mnk2; mnks[3] = mnk1;
213
nts [2] = nt2; nts [3] = nt1;
218
if( ( N > 0 ) || ( K > 0 ) )
222
garow = ( 2.0 * nt * m * s - work ) / ( nt * s );
224
if( ( M > 1 ) && ( garow >= 0.0 ) )
226
mnk1 = ( ( M - (int)(garow / 2.0) + nbm1 ) / NB ) * NB;
227
mnk1 = Mmin( mnk1, M );
228
mnk1 = Mmax( mnk1, 1 );
231
tmnk1 = (double)(mnk1);
232
tmnk2 = (double)(mnk2);
234
nt1 = (int)floor( ( ( 2 * s * tmnk1 ) / work ) * nt + 0.5 );
235
nt2 = NT - ( nt1 = Mmin( nt1, ntm1 ) );
237
rrow = ( ( 2.0 * s * tmnk1 ) / (double)(nt1) ) -
238
( ( 2.0 * ( s * tmnk2 + p ) + k2 ) / (double)(nt2) );
240
if( UPLO == AtlasUpper )
242
mnks[4] = mnk1; mnks[5] = mnk2;
243
nts [4] = nt1; nts [5] = nt2;
247
mnks[4] = mnk2; mnks[5] = mnk1;
248
nts [4] = nt2; nts [5] = nt1;
253
mnk1 = ( ( (int)(s * ( 1.0 - sqrt( 1.0 + ( garow / s ) ) )) +
255
mnk1 = Mmin( mnk1, K );
256
mnk1 = Mmax( mnk1, 1 );
259
tmnk1 = (double)(mnk1);
260
tmnk2 = (double)(mnk2);
262
r = work - ( tmnk2 * ( tmnk2 + ( 2.0 * n ) ) );
263
nt1 = (int)floor( ( r / work ) * nt + 0.5 );
264
nt2 = NT - ( nt1 = Mmin( nt1, ntm1 ) );
266
rrow = 2.0 * ( s * m + tmnk1 * ( n + tmnk2 ) ) +
268
rrow /= (double)(nt1);
269
rrow -= ( tmnk2 * ( 2.0 * n + tmnk2 ) ) / (double)(nt2);
271
if( UPLO == AtlasUpper )
273
mnks[6] = mnk1; mnks[7] = mnk2;
274
nts [6] = nt1; nts [7] = nt2;
278
mnks[6] = mnk2; mnks[7] = mnk1;
279
nts [6] = nt2; nts [7] = nt1;
284
if( ( ( M > 0 ) || ( K > 0 ) ) && ( ( K > 0 ) || ( N > 0 ) ) )
286
if( Mabs( rcol ) <= Mabs( rrow ) ) /* Column split */
288
if( ( N > 1 ) && ( gacol >= 0.0 ) )
290
split = AtlasTzSplitNcol;
291
*NT1 = nts [0]; *NT2 = nts [1];
292
*MNK1 = mnks[0]; *MNK2 = mnks[1];
296
split = AtlasTzSplitKcol;
297
*NT1 = nts [2]; *NT2 = nts [3];
298
*MNK1 = mnks[2]; *MNK2 = mnks[3];
303
if( ( M > 1 ) && ( garow >= 0.0 ) )
305
split = AtlasTzSplitMrow;
306
*NT1 = nts [4]; *NT2 = nts [5];
307
*MNK1 = mnks[4]; *MNK2 = mnks[5];
311
split = AtlasTzSplitKrow;
312
*NT1 = nts [6]; *NT2 = nts [7];
313
*MNK1 = mnks[6]; *MNK2 = mnks[7];
317
else if( ( M > 0 ) || ( K > 0 ) )
319
if( ( N > 1 ) && ( gacol >= 0.0 ) )
321
split = AtlasTzSplitNcol;
322
*NT1 = nts [0]; *NT2 = nts [1];
323
*MNK1 = mnks[0]; *MNK2 = mnks[1];
327
split = AtlasTzSplitKcol;
328
*NT1 = nts [2]; *NT2 = nts [3];
329
*MNK1 = mnks[2]; *MNK2 = mnks[3];
332
else if( ( N > 0 ) || ( K > 0 ) )
334
if( ( M > 1 ) && ( garow >= 0.0 ) )
336
split = AtlasTzSplitMrow;
337
*NT1 = nts [4]; *NT2 = nts [5];
338
*MNK1 = mnks[4]; *MNK2 = mnks[5];
342
split = AtlasTzSplitKrow;
343
*NT1 = nts [6]; *NT2 = nts [7];
344
*MNK1 = mnks[6]; *MNK2 = mnks[7];
349
split = AtlasTzNoSplit;