2
* Automatically Tuned Linear Algebra Software v3.10.1
3
* Copyright (C) 2010 R. Clint Whaley
5
* Redistribution and use in source and binary forms, with or without
6
* modification, are permitted provided that the following conditions
8
* 1. Redistributions of source code must retain the above copyright
9
* notice, this list of conditions and the following disclaimer.
10
* 2. Redistributions in binary form must reproduce the above copyright
11
* notice, this list of conditions, and the following disclaimer in the
12
* documentation and/or other materials provided with the distribution.
13
* 3. The name of the ATLAS group or the names of its contributers may
14
* not be used to endorse or promote products derived from this
15
* software without specific written permission.
17
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
19
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
20
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
21
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27
* POSSIBILITY OF SUCH DAMAGE.
30
#include "atlas_misc.h"
31
#include "atlas_tst.h"
32
#include "atlas_lvl2.h"
33
#include "atlas_level1.h"
35
int FAx=0, MAx=0, FAy=0, MAy=0, FAa=0, MAa=0;
43
struct FA_allocs *next;
46
struct FA_allocs *NewAlloc(size_t size, struct FA_allocs *next,
47
int align, int misalign)
49
* Allocates size allocation that is aligned to [align], but not aligned
50
* to [misalign]. Therefore, misalign > align. Align must minimally be sizeof
51
* while misalign may be 0 if we don't need to avoid a particular alignment.
58
const int malign = align >= misalign ? align : misalign;
60
n = size + align + align + malign;
64
cp = malloc(n + sizeof(struct FA_allocs));
66
ap = (struct FA_allocs *) (cp + n);
69
* Align to min alignment
71
ap->memA = align ? (void*) ((((size_t) cp)/align)*align + align) : cp;
73
* Misalign to misalign
74
* We often need to make sure to unaligned addresses share the same modulo
75
* so that they have the *same* degree of misalignment (so that their alignment
76
* can be fixed by simple peeling), and so in this case force the address
77
* modulo the misalign to be the exact align value.
80
ap->memA = (void*)((((size_t)ap->memA)/malign)*malign + malign + align);
86
* no-align malloc free retaining system default behavior
88
void *NA_malloc(size_t size)
92
void *NA_calloc(size_t n, size_t size)
94
return(calloc(n, size));
96
void NA_free(void *ptr)
103
* malloc/free pair that aligns data to align, but not to misalign
105
void *FA_malloc(size_t size, int align, int misalign)
107
if ((!misalign && align <= 8) || !size)
108
return(malloc(size));
111
allocQ = NewAlloc(size, allocQ, align, misalign);
112
return(allocQ->memA);
115
void *FA_calloc(size_t n, size_t size, int align, int misalign)
123
cp = FA_malloc(tsize, align, misalign);
124
if (size == sizeof(int))
125
for (ip=(int*)cp,i=0; i < n; i++)
127
else if (size == sizeof(double))
128
for (dp=(double*)cp,i=0; i < n; i++)
131
for (i=0; i < tsize; i++)
136
void FA_free(void *ptr, int align, int misalign)
138
* Part of malloc/free pair that aligns data to FALIGN
141
struct FA_allocs *ap, *prev;
144
if ((!misalign && align <= 8))
148
for (ap=allocQ; ap && ap->memA != ptr; ap = ap->next) prev = ap;
151
fprintf(stderr, "Couldn't find mem=%ld\nmemQ=\n", (size_t)ptr);
152
for (ap=allocQ; ap; ap = ap->next)
153
fprintf(stderr, " %ld, %ld\n", (size_t)ap->memA,
158
allocQ = allocQ->next;
160
prev->next = ap->next;
166
static void dumb_gemv
167
(int Conj, ATL_CINT M, ATL_CINT N, const SCALAR alpha,
168
const TYPE *A, ATL_CINT lda, const TYPE *X, ATL_CINT incX,
169
const SCALAR beta, TYPE *Y, ATL_CINT incY)
171
* If (conj) y = alpha*conj(A^T)*x + beta*y
172
* else y = alpha*(A^T)*x + beta*y
173
* Since this is transpose, and A always MxN:
182
for (j=0; j < N; j++, A += lda)
184
y0 = Mjoin(PATL,dot)(M, X, incX, A, 1);
186
if (beta == ATL_rone)
188
else if (beta != ATL_rzero)
189
Y[j] = beta*Y[j] + y0;
195
ATL_CINT lda2 = lda+lda;
196
const TYPE ra=(*alpha), ia=alpha[1], rb=(*beta), ib=beta[1];
197
TYPE ry, iy, rd, id, tmp;
199
void (*mydot)(const int N, const TYPE *X, const int incX,
200
const TYPE *Y, const int incY, TYPE *dot);
201
mydot = (Conj) ? Mjoin(PATL,dotc_sub) : Mjoin(PATL,dotu_sub);
203
for (j=0; j < N; j++, A += lda2, Y += 2)
205
mydot(M, X, incX, A, 1, dot);
207
* Apply beta to original Y
227
* Apply alpha to dot product
243
* Store alpha*A*x + beta*y to Y
250
static int CheckAns(int M, int N, TYPE *G, int incG, TYPE *T, int incT)
254
const double nadds = NN, nmuls = NN+2;
256
const double epsmul = 2*(nadds+nmuls);
258
const int incG2 = incG+incG, incT2 = incT+incT;
259
const double epsmul = 2*(nadds+4*nmuls);
261
TYPE maxerr, diff, g, t;
262
maxerr = epsmul * Mjoin(PATL,epsilon)();
264
for (i=0; i < NN; i++, G += incG, T += incT)
276
fprintf(stderr, "Y(%d): Good=%f, Computed=%f, diff=%f\n",
281
for (i=0; i < NN+NN; i += 2, G += incG2, T += incT2)
293
fprintf(stderr, "Yr(%d): Good=%f, Computed=%f, diff=%f\n",
306
fprintf(stderr, "Yi(%d): Good=%f, Computed=%f, diff=%f\n",
307
i, G[1], T[1], diff);
316
#define my_gemv Mjoin(PATL,gemvC)
318
#define my_gemv Mjoin(PATL,gemvT)
321
(ATL_CINT M, ATL_CINT N, const SCALAR alpha0, const TYPE *A, ATL_CINT lda,
322
const TYPE *X, ATL_CINT incX, const SCALAR beta0, TYPE *Y, ATL_CINT incY);
324
static int RunTest(int M, int N, int lda, int incX, int incY,
325
SCALAR alpha, SCALAR beta)
328
TYPE one[2] = {ATL_rone, ATL_rzero};
332
TYPE *A, *A0, *X, *Y, *y;
334
ATL_CINT aincY = Mabs(incY), aincX = Mabs(incX);
339
" TEST M=%d, N=%d, lda=%d, iY=%d, iX=%d al=(%.2f,%.2f) be=(%.2f,%.2f) STARTED\n",
340
M, N, lda, incY, incX, *alpha, alpha[1], *beta, beta[1]);
343
" TEST M=%d, N=%d, lda=%d, iY=%d, iX=%d al=%.2f be=%.2f STARTED\n",
344
M, N, lda, incY, incX, alpha, beta);
346
A = FA_malloc(ATL_MulBySize(lda)*N, FAa, MAa);
347
Y = FA_malloc(ATL_MulBySize(NY)*aincY, FAy, MAy);
348
X = FA_malloc(ATL_MulBySize(NX)*aincX, FAy, MAy);
349
Y0 = FA_malloc(ATL_MulBySize(NY)*aincY, FAy, MAy);
350
ATL_assert(A && Y0 && X && Y);
352
Mjoin(PATLU,set)(NX*aincX SHIFT, 4000000000.0, X, 1);
355
Mjoin(PATLU,set)(NY*aincY SHIFT, 2000000000.0, Y0, 1);
356
Mjoin(PATLU,set)(NY*aincY SHIFT, 3000000000.0, Y, 1);
358
Mjoin(PATL,gegen)(1, NY, Y0, aincY, NY*aincY);
359
Mjoin(PATL,gegen)(1, NY, Y, aincY, NY*aincY);
360
Mjoin(PATL,gegen)(1, NX, X, aincX, NY*aincY+127*50+77);
361
Mjoin(PATL,gegen)(M, N, A, lda, N*M+513*7+90);
362
if (incY < 0) Y += (NY-1) * (aincY SHIFT);
363
if (incY < 0) Y0 += (NY-1) * (aincY SHIFT);
364
if (incX < 0) X += (NX-1) * (aincX SHIFT);
367
dumb_gemv(1, M, N, alpha, A, lda, X, incX, beta, Y0, incY);
369
dumb_gemv(0, M, N, alpha, A, lda, X, incX, beta, Y0, incY);
371
my_gemv(M, N, alpha, A, lda, X, incX, beta, Y, incY);
372
ierr = CheckAns(M, N, Y0, incY, Y, incY);
375
" TEST M=%d, N=%d, al,be=%.2f,%.2f, lda=%d, incXY=%d,%d %s\n",
376
M, N, alpha, beta, lda, incX, incY, (ierr) ? "FAILED":"PASSED");
379
" TEST M=%d, N=%d, al,be=(%.2f,%2.f),(%.2f,%.2f), lda=%d, incXY=%d,%d %s\n",
380
M, N, *alpha, alpha[1], *beta, beta[1], lda, incX, incY,
381
(ierr) ? "FAILED":"PASSED");
383
FA_free(A, FAa, MAa);
386
Y -= (NY-1) * (aincY SHIFT);
387
Y0 -= (NY-1) * (aincY SHIFT);
389
FA_free(Y0, FAy, MAy);
390
FA_free(Y, FAy, MAy);
391
if (incX < 0) X -= (NX-1) * (aincX SHIFT);
392
FA_free(X, FAx, MAx);
398
int RunTests(int verb, int *Ms, int *Ns, int *incXs, int *incYs, int *ldas,
399
TYPE *alphas, TYPE *betas)
401
int iy, ix, ic, in, im;
402
ATL_INT m, n, lda, conj, incy;
405
const int NA=(*alphas), NB=(*betas);
411
assert(ldas[0] == Ms[0]);
412
for (in=1; in <= Ns[0]; in++)
415
for (im=1; im <= Ms[0]; im++)
419
for (iy=1; iy <= incYs[0]; iy++)
422
for (ix=1; ix <= incXs[0]; ix++)
425
for (ia=1; ia <= NA; ia++)
428
alpha = alphas+ia+ia;
432
for (ib=1; ib <= NA; ib++)
439
nerr += RunTest(m, n, lda, incx, incy, alpha, beta);
449
void PrintUsage(char *name, int ierr, char *flag)
452
fprintf(stderr, "Bad argument #%d: '%s'\n",
453
ierr, flag ? flag : "Not enough arguments");
455
fprintf(stderr, "ERROR: %s\n", flag);
457
fprintf(stderr, "USAGE: %s [flags]:\n", name);
458
fprintf(stderr, " -n <#> <N1> ... <N#>\n");
459
fprintf(stderr, " -N <Nstart> <Nend> <Ninc>\n");
460
fprintf(stderr, " -m <#> <M1> ... <M#>\n");
461
fprintf(stderr, " -M <Mstart> <Mend> <Minc>\n");
462
fprintf(stderr, " -l <#> <lda1> ... <lda#>\n");
463
fprintf(stderr, " -g <ldagap> : lda = M + <ldagap> foreach M\n");
464
fprintf(stderr, " -y <#> <incY1> ... <incY#>\n");
465
fprintf(stderr, " -x <#> <incX1> ... <incX#>\n");
466
fprintf(stderr, " -C <#> <conj1> ... <conj#>\n");
467
fprintf(stderr, " -a <#> <alpha1> ... <alpha#>\n");
468
fprintf(stderr, " -b <#> <beta1> ... <beta#>\n");
470
" -v [0,1] : 0 - stop on first error, else keep testing\n");
472
" -F[x,y,a] <#> : if(# > 0) -> force op to be aligned to at least # bytes\n");
474
" if(# < 0) -> force op to be aligned to < # bytes.\n");
476
exit(ierr ? ierr : -1);
481
int *GetIntList1(int ival)
483
* returns integer array with iarr[0] = 1, iarr[1] = ival
487
iarr = malloc(2*sizeof(int));
496
TYPE *GetTypeList1(const SCALAR val)
498
* Returns a TYPE array with arr[0] = 1.0, arr[1] = val
502
arr = malloc(ATL_MulBySize(2));
516
int *GetIntList2(int ival1, int ival2)
518
* returns integer array with iarr[0] = 1, iarr[1] = ival1, ival[2] = ival2
522
iarr = malloc(3*sizeof(int));
531
int *DupIntList(int *list)
533
* Duplicates list of integers, list[0] holds the length, not including 0
540
ip = malloc(sizeof(int)*n);
542
for (i=0; i < n; i++)
548
int *GetIntList(int nargs, char **args, int i, int nmul)
550
* Gets a list of integers, whose length is given by atoi(args[i])*nmul
551
* list is this length+1, since 0'th location gets atoi(args[i])
557
PrintUsage(args[0], i, NULL);
558
n = atoi(args[i]) * nmul;
560
iarr = malloc(sizeof(int)*(n+1));
564
for (k=0; k < n; k++)
567
PrintUsage(args[0], i, NULL);
568
iarr[k+1] = atoi(args[i]);
575
TYPE *GetTypeList(int nargs, char **args, int i, int nmul)
577
* Gets a list of TYPEs, whose length is given by atoi(args[i])*nmul
578
* list is this length+1, since 0'th location gets atof(args[i])
585
PrintUsage(args[0], i, NULL);
586
n = atoi(args[i]) * nmul;
588
arr = malloc(ATL_MulBySize(n+1));
592
for (k=0; k < n; k++)
595
PrintUsage(args[0], i, NULL);
596
arr[k+(1 SHIFT)] = atof(args[i]);
603
int *IntRange2IntList(int N0, int NN, int incN)
608
for (i=N0, n=0; i <= NN; i += incN) n++;
609
iarr = malloc(sizeof(int)*(n+1));
612
for (i=N0, n=1 ; i <= NN; i += incN, n++)
617
int GetFlags(int nargs, char **args, int **Ms, int **Ns, int **LDAs,
618
int **incYs, int **incXs, TYPE **ALPHAs, TYPE **BETAs)
624
const TYPE one[2] = {ATL_rone, ATL_rzero};
626
const TYPE one = ATL_rone;
629
*Ns = *Ms = *LDAs = *incYs = *incXs = NULL;
630
*ALPHAs = *BETAs = NULL;
633
for (i=1; i < nargs; i++)
635
if (args[i][0] != '-')
636
PrintUsage(args[0], i, args[i]);
642
PrintUsage(args[0], i-1, "out of flags in -g ");
643
verb = atoi(args[i]);
647
PrintUsage(args[0], i-1, "out of flags in -g ");
648
ldagap = atoi(args[i]);
653
PrintUsage(args[0], i-1, "out of flags in -N/M ");
654
ip = IntRange2IntList(atoi(args[i+1]),atoi(args[i+2]),atoi(args[i+3]));
666
ip = GetIntList(nargs, args, i, 1);
688
*ALPHAs = GetTypeList(nargs, args, i, 1 SHIFT);
689
i += (((int)((*ALPHAs)[0]))SHIFT)+1;
692
*BETAs = GetTypeList(nargs, args, i, 1 SHIFT);
693
i += (((int)((*BETAs)[0]))SHIFT)+1;
696
ch = tolower(args[i][2]);
721
PrintUsage(args[0], i, args[i]);
725
*incXs = GetIntList1(1);
727
*incYs = GetIntList1(1);
729
*ALPHAs = GetTypeList1(one);
731
*BETAs = GetTypeList1(one);
733
*Ms = GetIntList1(977);
735
*Ns = GetIntList1(77);
738
*LDAs = DupIntList(*Ms);
739
for (i=1; i <= (*LDAs)[0]; i++)
740
(*LDAs)[i] += ldagap;
742
assert((*LDAs)[0] == (*Ms)[0]);
746
int main(int nargs, char **args)
748
int *Ms, *Ns, *LDAs, *incYs, *incXs, *CONJs;
750
TYPE *ALPHAs, *BETAs;
752
verb = GetFlags(nargs, args, &Ms, &Ns, &LDAs, &incYs, &incXs,&ALPHAs,&BETAs);
753
ierr = RunTests(verb, Ms, Ns, incXs, incYs, LDAs, ALPHAs, BETAs);