2
* Automatically Tuned Linear Algebra Software v3.10.1
3
* Copyright (C) 2010, 2009 R. Clint Whaley
5
* Redistribution and use in source and binary forms, with or without
6
* modification, are permitted provided that the following conditions
8
* 1. Redistributions of source code must retain the above copyright
9
* notice, this list of conditions and the following disclaimer.
10
* 2. Redistributions in binary form must reproduce the above copyright
11
* notice, this list of conditions, and the following disclaimer in the
12
* documentation and/or other materials provided with the distribution.
13
* 3. The name of the ATLAS group or the names of its contributers may
14
* not be used to endorse or promote products derived from this
15
* software without specific written permission.
17
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
19
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
20
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
21
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27
* POSSIBILITY OF SUCH DAMAGE.
34
#include "atlas_misc.h"
35
#include Mstr(Mjoin(Mjoin(atlas_,PRE),sysinfo.h))
37
static char *resfile=NULL;
38
static FILE *fpres=NULL;
46
struct FA_allocs *next;
49
struct FA_allocs *NewAlloc(size_t size, struct FA_allocs *next,
50
int align, int misalign)
52
* Allocates size allocation that is aligned to [align], but not aligned
53
* to [misalign]. Therefore, misalign > align. Align must minimally be sizeof
54
* while misalign may be 0 if we don't need to avoid a particular alignment.
61
const int malign = align >= misalign ? align : misalign;
63
n = size + align + align + malign;
67
cp = malloc(n + sizeof(struct FA_allocs));
69
ap = (struct FA_allocs *) (cp + n);
72
* Align to min alignment
74
ap->memA = align ? (void*) ((((size_t) cp)/align)*align + align) : cp;
76
* Misalign to misalign
77
* We often need to make sure to unaligned addresses share the same modulo
78
* so that they have the *same* degree of misalignment (so that their alignment
79
* can be fixed by simple peeling), and so in this case force the address
80
* modulo the misalign to be the exact align value.
83
ap->memA = (void*)((((size_t)ap->memA)/malign)*malign + malign + align);
89
* no-align malloc free retaining system default behavior
91
void *NA_malloc(size_t size)
95
void *NA_calloc(size_t n, size_t size)
97
return(calloc(n, size));
99
void NA_free(void *ptr)
106
* malloc/free pair that aligns data to align, but not to misalign
108
void *FA_malloc(size_t size, int align, int misalign)
110
if ((!misalign && align <= 8) || !size)
111
return(malloc(size));
114
allocQ = NewAlloc(size, allocQ, align, misalign);
115
return(allocQ->memA);
118
void *FA_calloc(size_t n, size_t size, int align, int misalign)
126
cp = FA_malloc(tsize, align, misalign);
127
if (size == sizeof(int))
128
for (ip=(int*)cp,i=0; i < n; i++)
130
else if (size == sizeof(double))
131
for (dp=(double*)cp,i=0; i < n; i++)
134
for (i=0; i < tsize; i++)
139
void FA_free(void *ptr, int align, int misalign)
141
* Part of malloc/free pair that aligns data to FALIGN
144
struct FA_allocs *ap, *prev;
147
if ((!misalign && align <= 8))
151
for (ap=allocQ; ap && ap->memA != ptr; ap = ap->next) prev = ap;
154
fprintf(stderr, "Couldn't find mem=%ld\nmemQ=\n", (size_t)ptr);
155
for (ap=allocQ; ap; ap = ap->next)
156
fprintf(stderr, " %ld, %ld\n", (size_t)ap->memA,
161
allocQ = allocQ->next;
163
prev->next = ap->next;
168
#include "atlas_mvtesttime.h"
171
void ATL_UGEMV(ATL_CINT M, ATL_CINT N, const TYPE *A, ATL_CINT lda,
172
const TYPE *X, TYPE *Y);
175
void Mjoin(PATL,gemvT)
176
(ATL_CINT M, ATL_CINT N, const SCALAR alpha, const TYPE *A,
177
ATL_CINT lda, const TYPE *X, ATL_CINT incX,
178
const SCALAR beta, TYPE *Y, ATL_CINT incY);
180
#define test_gemv(M, N, alpha, A, lda, X, incX, beta, Y, incY) \
181
Mjoin(PATL,gemvT)(M, N, *alpha, A, lda, X, incX, *beta, Y, incY)
183
#define test_gemv(M, N, alpha, A, lda, X, incX, beta, Y, incY) \
184
Mjoin(PATL,gemvT)(M, N, alpha, A, lda, X, incX, beta, Y, incY)
187
void Mjoin(PATL,gemvN)
188
(ATL_CINT M, ATL_CINT N, const SCALAR alpha, const TYPE *A,
189
ATL_CINT lda, const TYPE *X, ATL_CINT incX,
190
const SCALAR beta, TYPE *Y, ATL_CINT incY);
192
#define test_gemv(M, N, alpha, A, lda, X, incX, beta, Y, incY) \
193
Mjoin(PATL,gemvN)(M, N, *alpha, A, lda, X, incX, *beta, Y, incY)
195
#define test_gemv(M, N, alpha, A, lda, X, incX, beta, Y, incY) \
196
Mjoin(PATL,gemvN)(M, N, alpha, A, lda, X, incX, beta, Y, incY)
201
double Time2Flop(ATL_INT M, ATL_INT N, double time)
203
if (time == 0.0 || time != time)
206
return(((1.0e-6 * M)*(2.0*N+1.0))/time);
208
return((((6.0*M)*(N+1.0) + (2.0*M)*N)*1.0e-6)/time);
212
void Times2Flops(ATL_INT M, ATL_INT N, ATL_INT ntim, double *mf)
214
* Converts time to MFLOP
219
for (i=0; i < ntim; i++)
220
mf[i] = Time2Flop(M, N, mf[i]);
223
static double mysum(ATL_CINT N, double *d)
229
for (i=1; i < N; i++)
243
int verb, /* verbosity */
244
int nreps, /* number of reps to do for one timing sample */
245
size_t flushelts, /* size of area to flush to avoid cache reuse */
246
ATL_INT M, /* # of rows of array A */
247
ATL_INT N, /* # of cols of array A */
250
ATL_INT lda, /* leading dim */
251
int incX, /* ignored, assumed 1 by definition */
252
int incY, /* increment on Y; can be non-1 */
253
int FAa, /* if (FA. = 0) enforce no alignment */
254
int MAa, /* else force op to be aligned to at least FA bytes */
255
int FAx, /* if MA. != 0, disallow op to be aligned to MA. byts */
260
* This function directly calls the kernel routine on data that has been
261
* preloaded (through initialization) to any cache large enough to hold it.
269
A = FA_malloc(ATL_MulBySize(lda)*N, FAa, MAa);
270
X = FA_malloc(ATL_MulBySize(NX), FAx, MAx);
271
Y = FA_malloc(ATL_MulBySize(NY*incY), FAy, MAy);
272
ATL_assert(A && X && Y);
274
Mjoin(PATL,gegen)(1, NY, Y, incY, M);
275
Mjoin(PATL,gegen)(NX, 1, X, NX, N+127*50+77);
276
Mjoin(PATL,gegen)(M, N, A, lda, N*M+513*7+90);
278
* NOTE: if nreps too high this could lead to under/overflow
280
for (k=0; k < 8; k++) /* loop until results are believable, or give up */
283
for (i=nreps; i; i--)
285
ATL_UGEMV(M, N, A, lda, X, Y);
289
nreps = (nreps) ? nreps+nreps : 1;
291
t1 = (t1 - t0)/(1.0*nreps);
293
fprintf(stdout, " M=%d, N=%d, lda=%d, nreps=%d, time=%e, mflop=%.2f\n",
294
M, N, lda, nreps, t1, Time2Flop(M, N, t1));
295
FA_free(A, FAa, MAa);
296
FA_free(X, FAx, MAx);
297
FA_free(Y, FAy, MAy);
302
int verb, /* verbosity */
303
int nreps, /* number of reps to do for one timing sample */
304
ATL_INT flushelts, /* size of area to flush to avoid cache reuse */
305
ATL_INT M, /* # of rows of array A */
306
ATL_INT N, /* # of cols of array A */
309
ATL_INT lda, /* leading dim */
310
int mu, /* unrolling on M */
311
int nu, /* unrolling on N */
312
int FAa, /* if (FA. = 0) enforce no alignment */
313
int MAa, /* else force op to be aligned to at least FA bytes */
314
int FAx, /* if MA. != 0, disallow op to be aligned to MA. bytes*/
319
* Times the kernel for out-of-cache (where flushelts sets the cache that it
320
* is not allowed to be in) use.
321
* RETURNS: elapsed time in seconds to average repitition of indicated problem.
326
TYPE nbeta[1] = {-beta[0]}, *be=beta;
328
TYPE NONE[2] = {-1.0, 0.0};
329
TYPE nbeta[2] = {-beta[0], -beta[1]}, *be=beta;
332
TYPE *A, *X, *Y, *a, *x, *y;
334
size_t Aelts, Xelts, Yelts, setspan, ygap, xgap, agap, pregap, setsz, nsets;
346
* Find basic length of each operand in elements
357
* Map memory so that we can enforce all required alignments while moving
358
* through memory; mem starts with maxalign-aligned memory, so that we can
359
* guarantee all further alignments
361
maxalign = (MAx) ? MAx : 1;
363
maxalign = ATL_lcm(MAy,maxalign);
365
maxalign = ATL_lcm(MAa,maxalign);
367
maxalign = ATL_lcm(FAx,maxalign);
369
maxalign = ATL_lcm(FAy,maxalign);
371
maxalign = ATL_lcm(FAa,maxalign);
374
j = (FAx) ? FAx : sizeof(TYPE);
376
for (i=0; (i % j != 0 || i%MAx == 0); i += sizeof(TYPE));
378
for (i=0; i % j != 0 ; i += sizeof(TYPE));
382
xgap = ATL_MulBySize(Xelts);
385
j = (FAy) ? FAy : sizeof(TYPE);
387
for (i=pregap+xgap; (i%j != 0 || i%MAy == 0); i += sizeof(TYPE));
389
for (i=pregap+xgap; (i%j != 0); i += sizeof(TYPE));
392
ygap = ATL_MulBySize(Yelts);
395
j = (FAa) ? FAa : sizeof(TYPE);
397
for (i=pregap+xgap+ygap; (i%j != 0 || i%MAa == 0); i += sizeof(TYPE));
399
for (i=pregap+xgap+ygap; (i%j != 0); i += sizeof(TYPE));
400
ygap = i - pregap - xgap;
402
agap = ATL_MulBySize(Aelts);
407
for (i=pregap+xgap+ygap+agap; i%maxalign != 0; i++);
408
agap = i-pregap-xgap-ygap;
410
setspan = pregap + xgap + ygap + agap;
411
assert(setspan%sizeof(TYPE) == 0);
412
setsz = ATL_MulBySize(M+N+M*N);
413
nsets = (ATL_MulBySize(flushelts)+setsz-1)/setsz;
416
vmem = malloc(maxalign + nsets*setspan);
418
if (maxalign) /* start maxaligned to guarantee all alignments */
419
for (ptr_st = (size_t)vmem; ptr_st%maxalign; ptr_st++);
420
else ptr_st = (size_t) vmem;
421
X = (TYPE*) (ptr_st + pregap);
422
Y = (TYPE*) (ptr_st + pregap + xgap);
423
A = (TYPE*) (ptr_st + pregap + xgap + ygap);
425
* Set ptrs to last set in memory
427
setspan /= sizeof(TYPE);
428
a = A += (nsets-1) * setspan;
429
x = X += (nsets-1) * setspan;
430
y = Y += (nsets-1) * setspan;
431
for (i=nsets; i; i--)
436
assert(((size_t)a)%FAa == 0);
438
assert(((size_t)x)%FAx == 0);
440
assert(((size_t)y)%FAy == 0);
442
assert(((size_t)a)%MAa != 0);
444
assert(((size_t)x)%MAx != 0);
446
assert(((size_t)y)%MAy != 0);
448
Mjoin(PATL,gegen)(Yelts, 1, y, Yelts, M);
449
Mjoin(PATL,gegen)(Xelts, 1, x, Xelts, N+127*50+77);
451
Mjoin(PATL,scal)(Xelts, NONE, x, 1);
452
Mjoin(PATL,gegen)(M, N, A, lda, N*M+513*7+90);
453
a -= setspan; x -= setspan; y -= setspan;
458
for (k=0; k < 8; k++) /* loop until good timing or too many trips */
461
for (i=nreps; i; i--)
463
test_gemv(M, N, alpha, A, lda, x, 1, be, y, 1);
464
if (++j < nsets) { a -= setspan; x -= setspan; y -= setspan; }
465
else { a = A; x = X; y = Y; j=0; be = (be == beta) ? nbeta : beta;}
470
nreps = (nreps) ? nreps+nreps : 1;
473
t1 = (t1-t0) / (1.0*nreps);
475
fprintf(stdout, " M=%d, N=%d, lda=%d, nreps=%d, time=%e, mflop=%.2f\n",
476
M, N, lda, nreps, t1, Time2Flop(M, N, t1));
481
void DoTimes(int verb, size_t flshelts, ATL_INT ntim, ATL_INT nrep,
482
ATL_INT mu, ATL_INT nu, ATL_INT M, ATL_INT N, TYPE *alpha,
483
TYPE *beta, ATL_INT lda,
484
int FAa, int MAa, int FAx, int MAx, int FAy, int MAy)
489
times = malloc(ntim * sizeof(double));
493
fprintf(stdout, "GEMV: M=%d, N=%d, lda=%d, AF=[%d,%d,%d], AM=[%d,%d,%d], beta=%e, alpha=%e:\n",
494
M, N, lda, FAa, FAx, FAy, MAa, MAx, MAy, *beta, *alpha);
496
fprintf(stdout, "GEMV: M=%d, N=%d, lda=%d, AF=[%d,%d,%d], AM=[%d,%d,%d], beta=[%e,%e], alpha=[%e,%e]:\n",
497
M, N, lda, FAa, FAx, FAy, MAa, MAx, MAy,
498
*beta, beta[1], *alpha, alpha[1]);
500
for (i=0; i < ntim; i++)
501
times[i] = mvtime(verb, nrep, flshelts, M, N, alpha, beta, lda,
502
mu, nu, FAa, MAa, FAx, MAx, FAy, MAy);
503
SortDoubles(ntim, times);
504
Times2Flops(M, N, ntim, times);
507
#if defined(PentiumCPS) || defined(WALL)
508
fprintf(fpres, "%d 1\n", ntim);
510
fprintf(fpres, "%d 0\n", ntim);
512
for (i=0; i < ntim; i++)
513
fprintf(fpres, "%le\n", times[i]);
516
fprintf(stdout, "NREPS=%d, MAX=%.2f, MIN=%.2f, AVG=%.2f, MED=%.2f\n",
517
ntim, times[0], times[ntim-1], mysum(ntim, times)/ntim,
522
void PrintUsage(char *name, char *arg, int i)
525
fprintf(stderr, "BAD ARG '%s' on %dth FLAG\n", arg, i);
526
fprintf(stderr, "USAGE: %s [flags], where flags are:\n", name);
527
fprintf(stderr, " -v <#> : set verbosity level\n");
528
fprintf(stderr, " -C <#> : set flushsz = # (kbytes)\n");
529
fprintf(stderr, " -x <#> : unrolling for X in kernel is #\n");
530
fprintf(stderr, " -y <#> : unrolling for Y in kernel is #\n");
531
fprintf(stderr, " -m <#> : set # of rows of matrix to #\n");
532
fprintf(stderr, " -n <#> : set # of cols of matrix to #\n");
533
fprintf(stderr, " -l <#> : set leading dimension of array A to #\n");
534
fprintf(stderr, " -F <#> : do at least # MFLOPS for each timing interval\n");
535
fprintf(stderr, " -f <file> : output timing summary in <file>; if file exists read & report\n");
537
" -r <#> : do # repetitions of the call for each timing interval\n");
539
" -# <#> : report # timings (each interval may have multiple calls)\n");
541
" -F[x,y,a] <#> : if(# > 0) -> force op to be aligned to at least # bytes\n");
543
" if(# < 0) -> force op to be aligned to < # bytes.\n");
544
fprintf(stderr, " -b <beta> : 2 floats for complex, one for real.\n");
548
void GetFlags(int nargs, char **args, int *verb,
549
size_t *flushelts, ATL_INT *celts, ATL_INT *pgelts,
550
ATL_INT *mu, ATL_INT *nu, ATL_INT *ntim, ATL_INT *nrep,
551
enum ATLAS_TRANS *TA, ATL_INT *m, ATL_INT *n, ATL_INT *lda,
553
int *FAa, int *MAa, int *FAx, int *MAx, int *FAy, int *MAy)
555
double mfF=ATL_nkflop/1000.0, flops;
563
*pgelts = ATL_DivBySize(ATL_PAGESZ);
565
*pgelts = 4*ATL_DivBySize(1024);
567
*celts = 0.75*ATL_L1elts;
571
*flushelts = 8*1024*ATL_DivBySize(1024);
578
*FAa = *MAa = *FAx = *MAx = *FAy = *MAy = 0;
584
for (i=1; i < nargs; i++)
586
if (args[i][0] != '-')
587
PrintUsage(args[0], "No '-' preceeding flag!", i);
590
case 'f' : /* set resfile output */
592
PrintUsage(args[0], "out of flags in -f ", i-1);
595
case 'v' : /* set verbosity */
597
PrintUsage(args[0], "out of flags in -v ", i-1);
598
*verb = atoi(args[i]);
600
case 'G' : /* set GEMV blocking cache size in KB */
602
PrintUsage(args[0], "out of flags in -G ", i-1);
604
*celts = j*ATL_DivBySize(1024);
606
case 'A' : /* set transpose */
608
PrintUsage(args[0], "out of flags in -A ", i-1);
610
if (ch == 't' || ch == 'T')
612
else if (ch == 'c' || ch == 'C')
613
*TA = AtlasConjTrans;
614
else if (ch == 'z' || ch == 'Z')
619
case 'C' : /* set flushsz in KB */
621
PrintUsage(args[0], "out of flags in -C ", i-1);
624
*flushelts = il*ATL_DivBySize(1024);
626
case 'p' : /* set pagesz in KB */
628
PrintUsage(args[0], "out of flags in -p ", i-1);
630
*pgelts = j*ATL_DivBySize(1024);
632
case 'x' : /* set mu */
634
PrintUsage(args[0], "out of flags in -x ", i-1);
637
case 'y' : /* set nu */
639
PrintUsage(args[0], "out of flags in -y ", i-1);
642
case 'm' : /* set M */
644
PrintUsage(args[0], "out of flags in -m ", i-1);
647
case 'n' : /* set N */
649
PrintUsage(args[0], "out of flags in -n ", i-1);
652
case 'l' : /* set lda */
654
PrintUsage(args[0], "out of flags in -l ", i-1);
655
*lda = atoi(args[i]);
657
case 'a' : /* alias for setting alpha in r1ktime */
658
case 'b' : /* set beta */
660
PrintUsage(args[0], "out of flags in -b ", i-1);
661
*beta = atof(args[i]);
664
PrintUsage(args[0], "out of flags in -b ", i-1);
665
beta[1] = atof(args[i]);
668
case 'F' : /* set nrep by specifying MFLOPS, or force alignment */
670
if (ch == '\0') /* specifying MFLOPS */
673
PrintUsage(args[0], "out of flags in -F ", i-1);
679
if (ch != 'a' && ch != 'y' && ch != 'x')
680
PrintUsage(args[0], args[i], i);
682
PrintUsage(args[0], args[i-1], i-1);
704
case 'r' : /* set nrep directly as integer */
706
PrintUsage(args[0], "out of flags in -r ", i-1);
707
*nrep = atoi(args[i]);
709
case '#' : /* set number of timings to report */
711
PrintUsage(args[0], "out of flags in -# ", i-1);
712
*ntim = atoi(args[i]);
715
PrintUsage(args[0], args[i], i);
720
flops = Time2Flop(*m, *n, 1.0) * 1000.0; /* Get kiloFLOPS in GEMV */
721
*nrep = (mfF+flops-1)/flops;
722
if (*nrep < 1) *nrep = 1;
727
int main(int nargs, char **args)
730
ATL_INT celts, pgelts, mu, nu, ntim, nrep, m, n, lda;
731
int FAa, MAa, FAx, MAx, FAy, MAy; /* Force & Max align for ops */
741
GetFlags(nargs, args, &verb, &flushelts, &celts, &pgelts, &mu, &nu, &ntim,
742
&nrep, &TA, &m, &n, &lda, SADD beta,
743
&FAa, &MAa, &FAx, &MAx, &FAy, &MAy);
746
dres = ReadResultsFile(1, ntim, resfile);
749
fprintf(stdout, "TIMINGS READ IN FROM '%s':\n", resfile);
750
PrintResultsFromFile(stdout, dres);
754
fpres = fopen(resfile, "w");
757
DoTimes(verb, flushelts, ntim, nrep, mu, nu, m, n, SADD beta, SADD beta, lda,
758
FAa, MAa, FAx, MAx, FAy, MAy);