2
* Automatically Tuned Linear Algebra Software v3.10.1
3
* Copyright (C) 2010 R. Clint Whaley
5
* Redistribution and use in source and binary forms, with or without
6
* modification, are permitted provided that the following conditions
8
* 1. Redistributions of source code must retain the above copyright
9
* notice, this list of conditions and the following disclaimer.
10
* 2. Redistributions in binary form must reproduce the above copyright
11
* notice, this list of conditions, and the following disclaimer in the
12
* documentation and/or other materials provided with the distribution.
13
* 3. The name of the ATLAS group or the names of its contributers may
14
* not be used to endorse or promote products derived from this
15
* software without specific written permission.
17
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
19
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
20
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
21
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27
* POSSIBILITY OF SUCH DAMAGE.
30
#include "atlas_misc.h"
31
#include "atlas_tst.h"
32
#include "atlas_lvl2.h"
33
#include "atlas_level1.h"
35
int FAx=0, MAx=0, FAy=0, MAy=0, FAa=0, MAa=0;
43
struct FA_allocs *next;
46
struct FA_allocs *NewAlloc(size_t size, struct FA_allocs *next,
47
int align, int misalign)
49
* Allocates size allocation that is aligned to [align], but not aligned
50
* to [misalign]. Therefore, misalign > align. Align must minimally be sizeof
51
* while misalign may be 0 if we don't need to avoid a particular alignment.
58
const int malign = align >= misalign ? align : misalign;
60
n = size + align + align + malign;
64
cp = malloc(n + sizeof(struct FA_allocs));
66
ap = (struct FA_allocs *) (cp + n);
69
* Align to min alignment
71
ap->memA = align ? (void*) ((((size_t) cp)/align)*align + align) : cp;
73
* Misalign to misalign
74
* We often need to make sure to unaligned addresses share the same modulo
75
* so that they have the *same* degree of misalignment (so that their alignment
76
* can be fixed by simple peeling), and so in this case force the address
77
* modulo the misalign to be the exact align value.
80
ap->memA = (void*)((((size_t)ap->memA)/malign)*malign + malign + align);
86
* no-align malloc free retaining system default behavior
88
void *NA_malloc(size_t size)
92
void *NA_calloc(size_t n, size_t size)
94
return(calloc(n, size));
96
void NA_free(void *ptr)
103
* malloc/free pair that aligns data to align, but not to misalign
105
void *FA_malloc(size_t size, int align, int misalign)
107
if ((!misalign && align <= 8) || !size)
108
return(malloc(size));
111
allocQ = NewAlloc(size, allocQ, align, misalign);
112
return(allocQ->memA);
115
void *FA_calloc(size_t n, size_t size, int align, int misalign)
123
cp = FA_malloc(tsize, align, misalign);
124
if (size == sizeof(int))
125
for (ip=(int*)cp,i=0; i < n; i++)
127
else if (size == sizeof(double))
128
for (dp=(double*)cp,i=0; i < n; i++)
131
for (i=0; i < tsize; i++)
136
void FA_free(void *ptr, int align, int misalign)
138
* Part of malloc/free pair that aligns data to FALIGN
141
struct FA_allocs *ap, *prev;
144
if ((!misalign && align <= 8))
148
for (ap=allocQ; ap && ap->memA != ptr; ap = ap->next) prev = ap;
151
fprintf(stderr, "Couldn't find mem=%ld\nmemQ=\n", (size_t)ptr);
152
for (ap=allocQ; ap; ap = ap->next)
153
fprintf(stderr, " %ld, %ld\n", (size_t)ap->memA,
158
allocQ = allocQ->next;
160
prev->next = ap->next;
166
static void dumb_gemv
167
(int Conj, ATL_CINT M, ATL_CINT N, const SCALAR alpha,
168
const TYPE *A, ATL_CINT lda, const TYPE *X, ATL_CINT incX,
169
const SCALAR beta, TYPE *Y, ATL_CINT incY)
171
* If (conj) y = alpha*conj(A^T)*x + beta*y
172
* else y = alpha*(A^T)*x + beta*y
173
* Since this is no-transpose, and A always MxN:
180
const TYPE ra=(*alpha), ia=alpha[1];
182
const int lda2 = lda+lda, incX2 = incX+incX;
185
const int lda2 = lda, incX2 = incX;
190
if (!SCALAR_IS_ONE(beta))
192
if (SCALAR_IS_ZERO(beta))
193
Mjoin(PATL,zero)(M, Y, incY);
195
Mjoin(PATL,scal)(M, beta, Y, incY);
197
if (SCALAR_IS_ZERO(alpha) || N < 1)
199
for (j=0; j < N; j++, A += lda2, X += incX2)
204
xa[0] = rx*ra - ix*ia;
205
xa[1] = rx*ia + ix*ra;
209
Mjoin(PATL,axpy)(M, xa, A, 1, Y, incY);
212
static int CheckAns(int M, int N, TYPE *G, int incG, TYPE *T, int incT)
216
const double nadds = NN, nmuls = NN+2;
218
const double epsmul = 2*(nadds+nmuls);
220
const int incG2 = incG+incG, incT2 = incT+incT;
221
const double epsmul = 2*(nadds+4*nmuls);
223
TYPE maxerr, diff, g, t;
224
maxerr = epsmul * Mjoin(PATL,epsilon)();
226
for (i=0; i < NN; i++, G += incG, T += incT)
238
fprintf(stderr, "Y(%d): Good=%f, Computed=%f, diff=%f\n",
243
for (i=0; i < NN+NN; i += 2, G += incG2, T += incT2)
255
fprintf(stderr, "Yr(%d): Good=%f, Computed=%f, diff=%f\n",
268
fprintf(stderr, "Yi(%d): Good=%f, Computed=%f, diff=%f\n",
269
i, G[1], T[1], diff);
277
#define my_gemv Mjoin(PATL,gemvN)
279
(ATL_CINT M, ATL_CINT N, const SCALAR alpha0, const TYPE *A, ATL_CINT lda,
280
const TYPE *X, ATL_CINT incX, const SCALAR beta0, TYPE *Y, ATL_CINT incY);
282
static int RunTest(int M, int N, int lda, int incX, int incY,
283
SCALAR alpha, SCALAR beta)
286
TYPE one[2] = {ATL_rone, ATL_rzero};
290
TYPE *A, *A0, *X, *Y, *y;
292
ATL_CINT aincY = Mabs(incY), aincX = Mabs(incX);
297
" TEST M=%d, N=%d, lda=%d, iY=%d, iX=%d al=(%.2f,%.2f) be=(%.2f,%.2f) STARTED\n",
298
M, N, lda, incY, incX, *alpha, alpha[1], *beta, beta[1]);
301
" TEST M=%d, N=%d, lda=%d, iY=%d, iX=%d al=%.2f be=%.2f STARTED\n",
302
M, N, lda, incY, incX, alpha, beta);
304
A = FA_malloc(ATL_MulBySize(lda)*N, FAa, MAa);
305
Y = FA_malloc(ATL_MulBySize(NY)*aincY, FAy, MAy);
306
X = FA_malloc(ATL_MulBySize(NX)*aincX, FAy, MAy);
307
Y0 = FA_malloc(ATL_MulBySize(NY)*aincY, FAy, MAy);
308
ATL_assert(A && Y0 && X && Y);
310
Mjoin(PATLU,set)(NX*aincX SHIFT, 4000000000.0, X, 1);
313
Mjoin(PATLU,set)(NY*aincY SHIFT, 2000000000.0, Y0, 1);
314
Mjoin(PATLU,set)(NY*aincY SHIFT, 3000000000.0, Y, 1);
316
Mjoin(PATL,gegen)(1, NY, Y0, aincY, NY*aincY);
317
Mjoin(PATL,gegen)(1, NY, Y, aincY, NY*aincY);
318
Mjoin(PATL,gegen)(1, NX, X, aincX, NY*aincY+127*50+77);
319
Mjoin(PATL,gegen)(M, N, A, lda, N*M+513*7+90);
320
if (incY < 0) Y += (NY-1) * (aincY SHIFT);
321
if (incY < 0) Y0 += (NY-1) * (aincY SHIFT);
322
if (incX < 0) X += (NX-1) * (aincX SHIFT);
325
dumb_gemv(1, M, N, alpha, A, lda, X, incX, beta, Y0, incY);
327
dumb_gemv(0, M, N, alpha, A, lda, X, incX, beta, Y0, incY);
329
my_gemv(M, N, alpha, A, lda, X, incX, beta, Y, incY);
330
ierr = CheckAns(M, N, Y0, incY, Y, incY);
333
" TEST M=%d, N=%d, al,be=%.2f,%.2f, lda=%d, incXY=%d,%d %s\n",
334
M, N, alpha, beta, lda, incX, incY, (ierr) ? "FAILED":"PASSED");
337
" TEST M=%d, N=%d, al,be=(%.2f,%2.f),(%.2f,%.2f), lda=%d, incXY=%d,%d %s\n",
338
M, N, *alpha, alpha[1], *beta, beta[1], lda, incX, incY,
339
(ierr) ? "FAILED":"PASSED");
341
FA_free(A, FAa, MAa);
344
Y -= (NY-1) * (aincY SHIFT);
345
Y0 -= (NY-1) * (aincY SHIFT);
347
FA_free(Y0, FAy, MAy);
348
FA_free(Y, FAy, MAy);
349
if (incX < 0) X -= (NX-1) * (aincX SHIFT);
350
FA_free(X, FAx, MAx);
356
int RunTests(int verb, int *Ms, int *Ns, int *incXs, int *incYs, int *ldas,
357
TYPE *alphas, TYPE *betas)
359
int iy, ix, ic, in, im;
360
ATL_INT m, n, lda, conj, incy;
363
const int NA=(*alphas), NB=(*betas);
369
assert(ldas[0] == Ms[0]);
370
for (in=1; in <= Ns[0]; in++)
373
for (im=1; im <= Ms[0]; im++)
377
for (iy=1; iy <= incYs[0]; iy++)
380
for (ix=1; ix <= incXs[0]; ix++)
383
for (ia=1; ia <= NA; ia++)
386
alpha = alphas+ia+ia;
390
for (ib=1; ib <= NA; ib++)
397
nerr += RunTest(m, n, lda, incx, incy, alpha, beta);
407
void PrintUsage(char *name, int ierr, char *flag)
410
fprintf(stderr, "Bad argument #%d: '%s'\n",
411
ierr, flag ? flag : "Not enough arguments");
413
fprintf(stderr, "ERROR: %s\n", flag);
415
fprintf(stderr, "USAGE: %s [flags]:\n", name);
416
fprintf(stderr, " -n <#> <N1> ... <N#>\n");
417
fprintf(stderr, " -N <Nstart> <Nend> <Ninc>\n");
418
fprintf(stderr, " -m <#> <M1> ... <M#>\n");
419
fprintf(stderr, " -M <Mstart> <Mend> <Minc>\n");
420
fprintf(stderr, " -l <#> <lda1> ... <lda#>\n");
421
fprintf(stderr, " -g <ldagap> : lda = M + <ldagap> foreach M\n");
422
fprintf(stderr, " -y <#> <incY1> ... <incY#>\n");
423
fprintf(stderr, " -x <#> <incX1> ... <incX#>\n");
424
fprintf(stderr, " -C <#> <conj1> ... <conj#>\n");
425
fprintf(stderr, " -a <#> <alpha1> ... <alpha#>\n");
426
fprintf(stderr, " -b <#> <beta1> ... <beta#>\n");
428
" -v [0,1] : 0 - stop on first error, else keep testing\n");
430
" -F[x,y,a] <#> : if(# > 0) -> force op to be aligned to at least # bytes\n");
432
" if(# < 0) -> force op to be aligned to < # bytes.\n");
434
exit(ierr ? ierr : -1);
439
int *GetIntList1(int ival)
441
* returns integer array with iarr[0] = 1, iarr[1] = ival
445
iarr = malloc(2*sizeof(int));
454
TYPE *GetTypeList1(const SCALAR val)
456
* Returns a TYPE array with arr[0] = 1.0, arr[1] = val
460
arr = malloc(ATL_MulBySize(2));
474
int *GetIntList2(int ival1, int ival2)
476
* returns integer array with iarr[0] = 1, iarr[1] = ival1, ival[2] = ival2
480
iarr = malloc(3*sizeof(int));
489
int *DupIntList(int *list)
491
* Duplicates list of integers, list[0] holds the length, not including 0
498
ip = malloc(sizeof(int)*n);
500
for (i=0; i < n; i++)
506
int *GetIntList(int nargs, char **args, int i, int nmul)
508
* Gets a list of integers, whose length is given by atoi(args[i])*nmul
509
* list is this length+1, since 0'th location gets atoi(args[i])
515
PrintUsage(args[0], i, NULL);
516
n = atoi(args[i]) * nmul;
518
iarr = malloc(sizeof(int)*(n+1));
522
for (k=0; k < n; k++)
525
PrintUsage(args[0], i, NULL);
526
iarr[k+1] = atoi(args[i]);
533
TYPE *GetTypeList(int nargs, char **args, int i, int nmul)
535
* Gets a list of TYPEs, whose length is given by atoi(args[i])*nmul
536
* list is this length+1, since 0'th location gets atof(args[i])
543
PrintUsage(args[0], i, NULL);
544
n = atoi(args[i]) * nmul;
546
arr = malloc(ATL_MulBySize(n+1));
550
for (k=0; k < n; k++)
553
PrintUsage(args[0], i, NULL);
554
arr[k+(1 SHIFT)] = atof(args[i]);
561
int *IntRange2IntList(int N0, int NN, int incN)
566
for (i=N0, n=0; i <= NN; i += incN) n++;
567
iarr = malloc(sizeof(int)*(n+1));
570
for (i=N0, n=1 ; i <= NN; i += incN, n++)
575
int GetFlags(int nargs, char **args, int **Ms, int **Ns, int **LDAs,
576
int **incYs, int **incXs, TYPE **ALPHAs, TYPE **BETAs)
582
const TYPE one[2] = {ATL_rone, ATL_rzero};
584
const TYPE one = ATL_rone;
587
*Ns = *Ms = *LDAs = *incYs = *incXs = NULL;
588
*ALPHAs = *BETAs = NULL;
591
for (i=1; i < nargs; i++)
593
if (args[i][0] != '-')
594
PrintUsage(args[0], i, args[i]);
600
PrintUsage(args[0], i-1, "out of flags in -g ");
601
verb = atoi(args[i]);
605
PrintUsage(args[0], i-1, "out of flags in -g ");
606
ldagap = atoi(args[i]);
611
PrintUsage(args[0], i-1, "out of flags in -N/M ");
612
ip = IntRange2IntList(atoi(args[i+1]),atoi(args[i+2]),atoi(args[i+3]));
624
ip = GetIntList(nargs, args, i, 1);
646
*ALPHAs = GetTypeList(nargs, args, i, 1 SHIFT);
647
i += (((int)((*ALPHAs)[0]))SHIFT)+1;
650
*BETAs = GetTypeList(nargs, args, i, 1 SHIFT);
651
i += (((int)((*BETAs)[0]))SHIFT)+1;
654
ch = tolower(args[i][2]);
679
PrintUsage(args[0], i, args[i]);
683
*incXs = GetIntList1(1);
685
*incYs = GetIntList1(1);
687
*ALPHAs = GetTypeList1(one);
689
*BETAs = GetTypeList1(one);
691
*Ms = GetIntList1(977);
693
*Ns = GetIntList1(77);
696
*LDAs = DupIntList(*Ms);
697
for (i=1; i <= (*LDAs)[0]; i++)
698
(*LDAs)[i] += ldagap;
700
assert((*LDAs)[0] == (*Ms)[0]);
704
int main(int nargs, char **args)
706
int *Ms, *Ns, *LDAs, *incYs, *incXs, *CONJs;
708
TYPE *ALPHAs, *BETAs;
710
verb = GetFlags(nargs, args, &Ms, &Ns, &LDAs, &incYs, &incXs,&ALPHAs,&BETAs);
711
ierr = RunTests(verb, Ms, Ns, incXs, incYs, LDAs, ALPHAs, BETAs);