2
* Automatically Tuned Linear Algebra Software v3.10.1
3
* Copyright (C) 2011 R. Clint Whaley
5
* Redistribution and use in source and binary forms, with or without
6
* modification, are permitted provided that the following conditions
8
* 1. Redistributions of source code must retain the above copyright
9
* notice, this list of conditions and the following disclaimer.
10
* 2. Redistributions in binary form must reproduce the above copyright
11
* notice, this list of conditions, and the following disclaimer in the
12
* documentation and/or other materials provided with the distribution.
13
* 3. The name of the ATLAS group or the names of its contributers may
14
* not be used to endorse or promote products derived from this
15
* software without specific written permission.
17
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
19
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
20
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
21
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27
* POSSIBILITY OF SUCH DAMAGE.
30
#include "atlas_misc.h"
31
#include "atlas_tst.h"
32
#include "atlas_lvl2.h"
33
#include "atlas_level1.h"
35
int FAx=0, MAx=0, FAy=0, MAy=0, FAa=0, MAa=0;
43
struct FA_allocs *next;
46
struct FA_allocs *NewAlloc(size_t size, struct FA_allocs *next,
47
int align, int misalign)
49
* Allocates size allocation that is aligned to [align], but not aligned
50
* to [misalign]. Therefore, misalign > align. Align must minimally be sizeof
51
* while misalign may be 0 if we don't need to avoid a particular alignment.
58
const int malign = align >= misalign ? align : misalign;
60
n = size + align + align + malign;
64
cp = malloc(n + sizeof(struct FA_allocs));
66
ap = (struct FA_allocs *) (cp + n);
69
* Align to min alignment
71
ap->memA = align ? (void*) ((((size_t) cp)/align)*align + align) : cp;
73
* Misalign to misalign
74
* We often need to make sure to unaligned addresses share the same modulo
75
* so that they have the *same* degree of misalignment (so that their alignment
76
* can be fixed by simple peeling), and so in this case force the address
77
* modulo the misalign to be the exact align value.
80
ap->memA = (void*)((((size_t)ap->memA)/malign)*malign + malign + align);
86
* no-align malloc free retaining system default behavior
88
void *NA_malloc(size_t size)
92
void *NA_calloc(size_t n, size_t size)
94
return(calloc(n, size));
96
void NA_free(void *ptr)
103
* malloc/free pair that aligns data to align, but not to misalign
105
void *FA_malloc(size_t size, int align, int misalign)
107
if ((!misalign && align <= 8) || !size)
108
return(malloc(size));
111
allocQ = NewAlloc(size, allocQ, align, misalign);
112
return(allocQ->memA);
115
void *FA_calloc(size_t n, size_t size, int align, int misalign)
123
cp = FA_malloc(tsize, align, misalign);
124
if (size == sizeof(int))
125
for (ip=(int*)cp,i=0; i < n; i++)
127
else if (size == sizeof(double))
128
for (dp=(double*)cp,i=0; i < n; i++)
131
for (i=0; i < tsize; i++)
136
void FA_free(void *ptr, int align, int misalign)
138
* Part of malloc/free pair that aligns data to FALIGN
141
struct FA_allocs *ap, *prev;
144
if ((!misalign && align <= 8))
148
for (ap=allocQ; ap && ap->memA != ptr; ap = ap->next) prev = ap;
151
fprintf(stderr, "Couldn't find mem=%ld\nmemQ=\n", (size_t)ptr);
152
for (ap=allocQ; ap; ap = ap->next)
153
fprintf(stderr, " %ld, %ld\n", (size_t)ap->memA,
158
allocQ = allocQ->next;
160
prev->next = ap->next;
166
static void dumb_gemv
167
(int Conj, ATL_CINT M, ATL_CINT N, const SCALAR alpha,
168
const TYPE *A, ATL_CINT lda, const TYPE *X, ATL_CINT incX,
169
const SCALAR beta, TYPE *Y, ATL_CINT incY)
171
* If (conj) y = alpha*conj(A^T)*x + beta*y
172
* else y = alpha*(A^T)*x + beta*y
173
* Since this is transpose, and A always MxN:
182
for (j=0; j < N; j++, A += lda)
184
y0 = Mjoin(PATL,dot)(M, X, incX, A, 1);
186
if (beta == ATL_rone)
188
else if (beta != ATL_rzero)
189
Y[j] = beta*Y[j] + y0;
195
ATL_CINT lda2 = lda+lda;
196
const TYPE ra=(*alpha), ia=alpha[1], rb=(*beta), ib=beta[1];
197
TYPE ry, iy, rd, id, tmp;
199
void (*mydot)(const int N, const TYPE *X, const int incX,
200
const TYPE *Y, const int incY, TYPE *dot);
201
mydot = (Conj) ? Mjoin(PATL,dotc_sub) : Mjoin(PATL,dotu_sub);
203
for (j=0; j < N; j++, A += lda2, Y += 2)
205
mydot(M, X, incX, A, 1, dot);
207
* Apply beta to original Y
227
* Apply alpha to dot product
243
* Store alpha*A*x + beta*y to Y
250
static int CheckAns(int M, int N, TYPE *G, int incG, TYPE *T, int incT)
254
const double nadds = NN, nmuls = NN+2;
256
const double epsmul = 2*(nadds+nmuls);
258
const int incG2 = incG+incG, incT2 = incT+incT;
259
const double epsmul = 2*(nadds+4*nmuls);
261
TYPE maxerr, diff, g, t;
262
maxerr = epsmul * Mjoin(PATL,epsilon)();
264
for (i=0; i < NN; i++, G += incG, T += incT)
276
fprintf(stderr, "Y(%d): Good=%f, Computed=%f, diff=%f\n",
281
for (i=0; i < NN+NN; i += 2, G += incG2, T += incT2)
293
fprintf(stderr, "Yr(%d): Good=%f, Computed=%f, diff=%f\n",
306
fprintf(stderr, "Yi(%d): Good=%f, Computed=%f, diff=%f\n",
307
i, G[1], T[1], diff);
316
#define my_gemv Mjoin(PATL,gemvC)
318
#define my_gemv Mjoin(PATL,gemvT)
320
#define ATL_rS2C(sc_) \
321
(((sc_) == ATL_rzero) ? '0' : ( ((sc_) == ATL_rone) ? '1' : 'X'))
323
#define ATL_S2C(sc_) ATL_rS2C(sc_[0]), ATL_rS2C(sc_[1])
325
#define ATL_S2C(sc_) ATL_rS2C(sc_)
328
(ATL_CINT M, ATL_CINT N, const SCALAR alpha0, const TYPE *A, ATL_CINT lda,
329
const TYPE *X, ATL_CINT incX, const SCALAR beta0, TYPE *Y, ATL_CINT incY);
331
static int RunTest(int M, int N, int lda, int incX, int incY,
332
SCALAR alpha, SCALAR beta, int II)
335
TYPE one[2] = {ATL_rone, ATL_rzero};
339
TYPE *A, *A0, *X, *Y, *y;
341
ATL_CINT aincY = Mabs(incY), aincX = Mabs(incX);
343
char *frm = "%6d %5d %5d %5d %4d %4d %c,%c %c,%c %4x %4x %4x %6s\n";
345
char *frm = "%6d %5d %5d %5d %4d %4d %c %c %4x %4x %4x %6s\n";
352
" M N lda incY incX alp bet A X Y PASS?\n");
354
"====== ===== ===== ===== ==== ==== === === ==== ==== ==== ======\n");
356
A = FA_malloc(ATL_MulBySize(lda)*N, FAa, MAa);
357
Y = FA_malloc(ATL_MulBySize(NY)*aincY, FAy, MAy);
358
X = FA_malloc(ATL_MulBySize(NX)*aincX, FAx, MAx);
359
Y0 = FA_malloc(ATL_MulBySize(NY)*aincY, FAy, MAy);
360
ATL_assert(A && Y0 && X && Y);
362
Mjoin(PATLU,set)(NX*aincX SHIFT, 4000000000.0, X, 1);
365
Mjoin(PATLU,set)(NY*aincY SHIFT, 2000000000.0, Y0, 1);
366
Mjoin(PATLU,set)(NY*aincY SHIFT, 3000000000.0, Y, 1);
368
Mjoin(PATL,gegen)(1, NY, Y0, aincY, NY*aincY);
369
printf(frm, II, M, N, lda, incY, incX, ATL_S2C(alpha), ATL_S2C(beta),
370
((size_t)A)&0xFFFF, ((size_t)X)&0xFFFF, ((size_t)Y)&0xFFFF, " START");
371
Mjoin(PATL,gegen)(1, NY, Y, aincY, NY*aincY);
372
Mjoin(PATL,gegen)(1, NX, X, aincX, NY*aincY+127*50+77);
373
Mjoin(PATL,gegen)(M, N, A, lda, N*M+513*7+90);
374
if (incY < 0) Y += (NY-1) * (aincY SHIFT);
375
if (incY < 0) Y0 += (NY-1) * (aincY SHIFT);
376
if (incX < 0) X += (NX-1) * (aincX SHIFT);
379
dumb_gemv(1, M, N, alpha, A, lda, X, incX, beta, Y0, incY);
381
dumb_gemv(0, M, N, alpha, A, lda, X, incX, beta, Y0, incY);
383
my_gemv(M, N, alpha, A, lda, X, incX, beta, Y, incY);
384
ierr = CheckAns(M, N, Y0, incY, Y, incY);
385
FA_free(A, FAa, MAa);
388
Y -= (NY-1) * (aincY SHIFT);
389
Y0 -= (NY-1) * (aincY SHIFT);
391
FA_free(Y0, FAy, MAy);
392
FA_free(Y, FAy, MAy);
393
if (incX < 0) X -= (NX-1) * (aincX SHIFT);
394
FA_free(X, FAx, MAx);
395
printf(frm, II, M, N, lda, incY, incX, ATL_S2C(alpha), ATL_S2C(beta),
396
((size_t)A)&0xFFFF, ((size_t) X)&0xFFFF, ((size_t) Y)&0xFFFF,
397
(ierr) ? "FAILED":"PASSED");
403
int RunTests(int verb, int *Ms, int *Ns, int *incXs, int *incYs, int *ldas,
404
TYPE *alphas, TYPE *betas)
406
int iy, ix, ic, in, im, iax, iay, iaa;
407
ATL_INT m, n, lda, conj, incy;
410
const int NA=(*alphas), NB=(*betas);
416
assert(ldas[0] == Ms[0]);
417
for (in=1; in <= Ns[0]; in++)
420
for (im=1; im <= Ms[0]; im++)
424
for (iy=1; iy <= incYs[0]; iy++)
427
for (ix=1; ix <= incXs[0]; ix++)
430
for (ia=1; ia <= NA; ia++)
433
alpha = alphas+ia+ia;
437
for (ib=1; ib <= NA; ib++)
444
for (iaa=0; iaa < 8; iaa++)
446
FAa = iaa*sizeof(TYPE);
447
MAa = FAa + sizeof(TYPE);
448
for (iay=0; iay < 8; iay++)
450
FAy = iay*sizeof(TYPE);
451
MAy = FAy + sizeof(TYPE);
452
for (iax=0; iax < 8; iax++)
454
FAx = iax*sizeof(TYPE);
455
MAx = FAx + sizeof(TYPE);
456
nerr += RunTest(m, n, lda, incx, incy,
468
printf("FAILED: %d of %d tests!\n", nerr, II);
470
printf("PASSED: all %d tests.\n", II);
474
void PrintUsage(char *name, int ierr, char *flag)
477
fprintf(stderr, "Bad argument #%d: '%s'\n",
478
ierr, flag ? flag : "Not enough arguments");
480
fprintf(stderr, "ERROR: %s\n", flag);
482
fprintf(stderr, "USAGE: %s [flags]:\n", name);
483
fprintf(stderr, " -n <#> <N1> ... <N#>\n");
484
fprintf(stderr, " -N <Nstart> <Nend> <Ninc>\n");
485
fprintf(stderr, " -m <#> <M1> ... <M#>\n");
486
fprintf(stderr, " -M <Mstart> <Mend> <Minc>\n");
487
fprintf(stderr, " -l <#> <lda1> ... <lda#>\n");
488
fprintf(stderr, " -g <ldagap> : lda = M + <ldagap> foreach M\n");
489
fprintf(stderr, " -y <#> <incY1> ... <incY#>\n");
490
fprintf(stderr, " -x <#> <incX1> ... <incX#>\n");
491
fprintf(stderr, " -C <#> <conj1> ... <conj#>\n");
492
fprintf(stderr, " -a <#> <alpha1> ... <alpha#>\n");
493
fprintf(stderr, " -b <#> <beta1> ... <beta#>\n");
495
" -v [0,1] : 0 - stop on first error, else keep testing\n");
497
" -F[x,y,a] <#> : if(# > 0) -> force op to be aligned to at least # bytes\n");
499
" if(# < 0) -> force op to be aligned to < # bytes.\n");
501
exit(ierr ? ierr : -1);
506
int *GetIntList1(int ival)
508
* returns integer array with iarr[0] = 1, iarr[1] = ival
512
iarr = malloc(2*sizeof(int));
521
TYPE *GetTypeList1(const SCALAR val)
523
* Returns a TYPE array with arr[0] = 1.0, arr[1] = val
527
arr = malloc(ATL_MulBySize(2));
541
int *GetIntList2(int ival1, int ival2)
543
* returns integer array with iarr[0] = 1, iarr[1] = ival1, ival[2] = ival2
547
iarr = malloc(3*sizeof(int));
556
int *DupIntList(int *list)
558
* Duplicates list of integers, list[0] holds the length, not including 0
565
ip = malloc(sizeof(int)*n);
567
for (i=0; i < n; i++)
573
int *GetIntList(int nargs, char **args, int i, int nmul)
575
* Gets a list of integers, whose length is given by atoi(args[i])*nmul
576
* list is this length+1, since 0'th location gets atoi(args[i])
582
PrintUsage(args[0], i, NULL);
583
n = atoi(args[i]) * nmul;
585
iarr = malloc(sizeof(int)*(n+1));
589
for (k=0; k < n; k++)
592
PrintUsage(args[0], i, NULL);
593
iarr[k+1] = atoi(args[i]);
600
TYPE *GetTypeList(int nargs, char **args, int i, int nmul)
602
* Gets a list of TYPEs, whose length is given by atoi(args[i])*nmul
603
* list is this length+1, since 0'th location gets atof(args[i])
610
PrintUsage(args[0], i, NULL);
611
n = atoi(args[i]) * nmul;
613
arr = malloc(ATL_MulBySize(n+1));
617
for (k=0; k < n; k++)
620
PrintUsage(args[0], i, NULL);
621
arr[k+(1 SHIFT)] = atof(args[i]);
628
int *IntRange2IntList(int N0, int NN, int incN)
633
for (i=N0, n=0; i <= NN; i += incN) n++;
634
iarr = malloc(sizeof(int)*(n+1));
637
for (i=N0, n=1 ; i <= NN; i += incN, n++)
642
int GetFlags(int nargs, char **args, int **Ms, int **Ns, int **LDAs,
643
int **incYs, int **incXs, TYPE **ALPHAs, TYPE **BETAs)
649
const TYPE one[2] = {ATL_rone, ATL_rzero};
651
const TYPE one = ATL_rone;
654
*Ns = *Ms = *LDAs = *incYs = *incXs = NULL;
655
*ALPHAs = *BETAs = NULL;
658
for (i=1; i < nargs; i++)
660
if (args[i][0] != '-')
661
PrintUsage(args[0], i, args[i]);
667
PrintUsage(args[0], i-1, "out of flags in -g ");
668
verb = atoi(args[i]);
672
PrintUsage(args[0], i-1, "out of flags in -g ");
673
ldagap = atoi(args[i]);
678
PrintUsage(args[0], i-1, "out of flags in -N/M ");
679
ip = IntRange2IntList(atoi(args[i+1]),atoi(args[i+2]),atoi(args[i+3]));
691
ip = GetIntList(nargs, args, i, 1);
713
*ALPHAs = GetTypeList(nargs, args, i, 1 SHIFT);
714
i += (((int)((*ALPHAs)[0]))SHIFT)+1;
717
*BETAs = GetTypeList(nargs, args, i, 1 SHIFT);
718
i += (((int)((*BETAs)[0]))SHIFT)+1;
721
PrintUsage(args[0], i, args[i]);
725
*incXs = GetIntList1(1);
727
*incYs = GetIntList1(1);
729
*ALPHAs = GetTypeList1(one);
731
*BETAs = GetTypeList1(one);
733
*Ms = GetIntList1(977);
735
*Ns = GetIntList1(77);
738
*LDAs = DupIntList(*Ms);
739
for (i=1; i <= (*LDAs)[0]; i++)
740
(*LDAs)[i] += ldagap;
742
assert((*LDAs)[0] == (*Ms)[0]);
746
int main(int nargs, char **args)
748
int *Ms, *Ns, *LDAs, *incYs, *incXs, *CONJs;
750
TYPE *ALPHAs, *BETAs;
752
verb = GetFlags(nargs, args, &Ms, &Ns, &LDAs, &incYs, &incXs,&ALPHAs,&BETAs);
753
ierr = RunTests(verb, Ms, Ns, incXs, incYs, LDAs, ALPHAs, BETAs);