2
* Automatically Tuned Linear Algebra Software v3.10.1
3
* Copyright (C) 2011 R. Clint Whaley
5
* Redistribution and use in source and binary forms, with or without
6
* modification, are permitted provided that the following conditions
8
* 1. Redistributions of source code must retain the above copyright
9
* notice, this list of conditions and the following disclaimer.
10
* 2. Redistributions in binary form must reproduce the above copyright
11
* notice, this list of conditions, and the following disclaimer in the
12
* documentation and/or other materials provided with the distribution.
13
* 3. The name of the ATLAS group or the names of its contributers may
14
* not be used to endorse or promote products derived from this
15
* software without specific written permission.
17
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
19
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
20
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
21
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27
* POSSIBILITY OF SUCH DAMAGE.
30
#include "atlas_misc.h"
31
#include "atlas_tst.h"
32
#include "atlas_lvl2.h"
33
#include "atlas_level1.h"
35
int FAx=0, MAx=0, FAy=0, MAy=0, FAa=0, MAa=0;
43
struct FA_allocs *next;
46
struct FA_allocs *NewAlloc(size_t size, struct FA_allocs *next,
47
int align, int misalign)
49
* Allocates size allocation that is aligned to [align], but not aligned
50
* to [misalign]. Therefore, misalign > align. Align must minimally be sizeof
51
* while misalign may be 0 if we don't need to avoid a particular alignment.
58
const int malign = align >= misalign ? align : misalign;
60
n = size + align + align + malign;
64
cp = malloc(n + sizeof(struct FA_allocs));
66
ap = (struct FA_allocs *) (cp + n);
69
* Align to min alignment
71
ap->memA = align ? (void*) ((((size_t) cp)/align)*align + align) : cp;
73
* Misalign to misalign
74
* We often need to make sure to unaligned addresses share the same modulo
75
* so that they have the *same* degree of misalignment (so that their alignment
76
* can be fixed by simple peeling), and so in this case force the address
77
* modulo the misalign to be the exact align value.
80
ap->memA = (void*)((((size_t)ap->memA)/malign)*malign + malign + align);
86
* no-align malloc free retaining system default behavior
88
void *NA_malloc(size_t size)
92
void *NA_calloc(size_t n, size_t size)
94
return(calloc(n, size));
96
void NA_free(void *ptr)
103
* malloc/free pair that aligns data to align, but not to misalign
105
void *FA_malloc(size_t size, int align, int misalign)
107
if ((!misalign && align <= 8) || !size)
108
return(malloc(size));
111
allocQ = NewAlloc(size, allocQ, align, misalign);
112
return(allocQ->memA);
115
void *FA_calloc(size_t n, size_t size, int align, int misalign)
123
cp = FA_malloc(tsize, align, misalign);
124
if (size == sizeof(int))
125
for (ip=(int*)cp,i=0; i < n; i++)
127
else if (size == sizeof(double))
128
for (dp=(double*)cp,i=0; i < n; i++)
131
for (i=0; i < tsize; i++)
136
void FA_free(void *ptr, int align, int misalign)
138
* Part of malloc/free pair that aligns data to FALIGN
141
struct FA_allocs *ap, *prev;
144
if ((!misalign && align <= 8))
148
for (ap=allocQ; ap && ap->memA != ptr; ap = ap->next) prev = ap;
151
fprintf(stderr, "Couldn't find mem=%ld\nmemQ=\n", (size_t)ptr);
152
for (ap=allocQ; ap; ap = ap->next)
153
fprintf(stderr, " %ld, %ld\n", (size_t)ap->memA,
158
allocQ = allocQ->next;
160
prev->next = ap->next;
166
static void dumb_gemv
167
(int Conj, ATL_CINT M, ATL_CINT N, const SCALAR alpha,
168
const TYPE *A, ATL_CINT lda, const TYPE *X, ATL_CINT incX,
169
const SCALAR beta, TYPE *Y, ATL_CINT incY)
171
* If (conj) y = alpha*conj(A^T)*x + beta*y
172
* else y = alpha*(A^T)*x + beta*y
173
* Since this is no-transpose, and A always MxN:
180
const TYPE ra=(*alpha), ia=alpha[1];
182
const int lda2 = lda+lda, incX2 = incX+incX;
185
const int lda2 = lda, incX2 = incX;
190
if (!SCALAR_IS_ONE(beta))
192
if (SCALAR_IS_ZERO(beta))
193
Mjoin(PATL,zero)(M, Y, incY);
195
Mjoin(PATL,scal)(M, beta, Y, incY);
197
if (SCALAR_IS_ZERO(alpha) || N < 1)
199
for (j=0; j < N; j++, A += lda2, X += incX2)
204
xa[0] = rx*ra - ix*ia;
205
xa[1] = rx*ia + ix*ra;
209
Mjoin(PATL,axpy)(M, xa, A, 1, Y, incY);
212
static int CheckAns(int M, int N, TYPE *G, int incG, TYPE *T, int incT)
216
const double nadds = NN, nmuls = NN+2;
218
const double epsmul = 2*(nadds+nmuls);
220
const int incG2 = incG+incG, incT2 = incT+incT;
221
const double epsmul = 2*(nadds+4*nmuls);
223
TYPE maxerr, diff, g, t;
224
maxerr = epsmul * Mjoin(PATL,epsilon)();
226
for (i=0; i < NN; i++, G += incG, T += incT)
238
fprintf(stderr, "Y(%d): Good=%f, Computed=%f, diff=%f\n",
243
for (i=0; i < NN+NN; i += 2, G += incG2, T += incT2)
255
fprintf(stderr, "Yr(%d): Good=%f, Computed=%f, diff=%f\n",
268
fprintf(stderr, "Yi(%d): Good=%f, Computed=%f, diff=%f\n",
269
i, G[1], T[1], diff);
277
#define my_gemv Mjoin(PATL,gemvN)
278
#define ATL_rS2C(sc_) \
279
(((sc_) == ATL_rzero) ? '0' : ( ((sc_) == ATL_rone) ? '1' : 'X'))
281
#define ATL_S2C(sc_) ATL_rS2C(sc_[0]), ATL_rS2C(sc_[1])
283
#define ATL_S2C(sc_) ATL_rS2C(sc_)
286
(ATL_CINT M, ATL_CINT N, const SCALAR alpha0, const TYPE *A, ATL_CINT lda,
287
const TYPE *X, ATL_CINT incX, const SCALAR beta0, TYPE *Y, ATL_CINT incY);
289
static int RunTest(int M, int N, int lda, int incX, int incY,
290
SCALAR alpha, SCALAR beta, int II)
293
TYPE one[2] = {ATL_rone, ATL_rzero};
297
TYPE *A, *A0, *X, *Y, *y;
299
ATL_CINT aincY = Mabs(incY), aincX = Mabs(incX);
301
char *frm = "%6d %5d %5d %5d %4d %4d %c,%c %c,%c %4x %4x %4x %6s\n";
303
char *frm = "%6d %5d %5d %5d %4d %4d %c %c %4x %4x %4x %6s\n";
310
" M N lda incY incX alp bet A X Y PASS?\n");
312
"====== ===== ===== ===== ==== ==== === === ==== ==== ==== ======\n");
314
A = FA_malloc(ATL_MulBySize(lda)*N, FAa, MAa);
315
Y = FA_malloc(ATL_MulBySize(NY)*aincY, FAy, MAy);
316
X = FA_malloc(ATL_MulBySize(NX)*aincX, FAx, MAx);
317
Y0 = FA_malloc(ATL_MulBySize(NY)*aincY, FAy, MAy);
318
ATL_assert(A && Y0 && X && Y);
320
Mjoin(PATLU,set)(NX*aincX SHIFT, 4000000000.0, X, 1);
323
Mjoin(PATLU,set)(NY*aincY SHIFT, 2000000000.0, Y0, 1);
324
Mjoin(PATLU,set)(NY*aincY SHIFT, 3000000000.0, Y, 1);
326
Mjoin(PATL,gegen)(1, NY, Y0, aincY, NY*aincY);
327
printf(frm, II, M, N, lda, incY, incX, ATL_S2C(alpha), ATL_S2C(beta),
328
((size_t)A)&0xFFFF, ((size_t)X)&0xFFFF, ((size_t)Y)&0xFFFF, " START");
329
Mjoin(PATL,gegen)(1, NY, Y, aincY, NY*aincY);
330
Mjoin(PATL,gegen)(1, NX, X, aincX, NY*aincY+127*50+77);
331
Mjoin(PATL,gegen)(M, N, A, lda, N*M+513*7+90);
332
if (incY < 0) Y += (NY-1) * (aincY SHIFT);
333
if (incY < 0) Y0 += (NY-1) * (aincY SHIFT);
334
if (incX < 0) X += (NX-1) * (aincX SHIFT);
337
dumb_gemv(1, M, N, alpha, A, lda, X, incX, beta, Y0, incY);
339
dumb_gemv(0, M, N, alpha, A, lda, X, incX, beta, Y0, incY);
341
my_gemv(M, N, alpha, A, lda, X, incX, beta, Y, incY);
342
ierr = CheckAns(M, N, Y0, incY, Y, incY);
343
FA_free(A, FAa, MAa);
346
Y -= (NY-1) * (aincY SHIFT);
347
Y0 -= (NY-1) * (aincY SHIFT);
349
FA_free(Y0, FAy, MAy);
350
FA_free(Y, FAy, MAy);
351
if (incX < 0) X -= (NX-1) * (aincX SHIFT);
352
FA_free(X, FAx, MAx);
353
printf(frm, II, M, N, lda, incY, incX, ATL_S2C(alpha), ATL_S2C(beta),
354
((size_t)A)&0xFFFF, ((size_t) X)&0xFFFF, ((size_t) Y)&0xFFFF,
355
(ierr) ? "FAILED":"PASSED");
361
int RunTests(int verb, int *Ms, int *Ns, int *incXs, int *incYs, int *ldas,
362
TYPE *alphas, TYPE *betas)
364
int iy, ix, ic, in, im, iax, iay, iaa;
365
ATL_INT m, n, lda, conj, incy;
368
const int NA=(*alphas), NB=(*betas);
374
assert(ldas[0] == Ms[0]);
375
for (in=1; in <= Ns[0]; in++)
378
for (im=1; im <= Ms[0]; im++)
382
for (iy=1; iy <= incYs[0]; iy++)
385
for (ix=1; ix <= incXs[0]; ix++)
388
for (ia=1; ia <= NA; ia++)
391
alpha = alphas+ia+ia;
395
for (ib=1; ib <= NA; ib++)
402
for (iaa=0; iaa < 8; iaa++)
404
FAa = iaa*sizeof(TYPE);
405
MAa = FAa + sizeof(TYPE);
406
for (iay=0; iay < 8; iay++)
408
FAy = iay*sizeof(TYPE);
409
MAy = FAy + sizeof(TYPE);
410
for (iax=0; iax < 8; iax++)
412
FAx = iax*sizeof(TYPE);
413
MAx = FAx + sizeof(TYPE);
414
nerr += RunTest(m, n, lda, incx, incy,
426
printf("FAILED: %d of %d tests!\n", nerr, II);
428
printf("PASSED: all %d tests.\n", II);
432
void PrintUsage(char *name, int ierr, char *flag)
435
fprintf(stderr, "Bad argument #%d: '%s'\n",
436
ierr, flag ? flag : "Not enough arguments");
438
fprintf(stderr, "ERROR: %s\n", flag);
440
fprintf(stderr, "USAGE: %s [flags]:\n", name);
441
fprintf(stderr, " -n <#> <N1> ... <N#>\n");
442
fprintf(stderr, " -N <Nstart> <Nend> <Ninc>\n");
443
fprintf(stderr, " -m <#> <M1> ... <M#>\n");
444
fprintf(stderr, " -M <Mstart> <Mend> <Minc>\n");
445
fprintf(stderr, " -l <#> <lda1> ... <lda#>\n");
446
fprintf(stderr, " -g <ldagap> : lda = M + <ldagap> foreach M\n");
447
fprintf(stderr, " -y <#> <incY1> ... <incY#>\n");
448
fprintf(stderr, " -x <#> <incX1> ... <incX#>\n");
449
fprintf(stderr, " -C <#> <conj1> ... <conj#>\n");
450
fprintf(stderr, " -a <#> <alpha1> ... <alpha#>\n");
451
fprintf(stderr, " -b <#> <beta1> ... <beta#>\n");
453
" -v [0,1] : 0 - stop on first error, else keep testing\n");
455
" -F[x,y,a] <#> : if(# > 0) -> force op to be aligned to at least # bytes\n");
457
" if(# < 0) -> force op to be aligned to < # bytes.\n");
459
exit(ierr ? ierr : -1);
464
int *GetIntList1(int ival)
466
* returns integer array with iarr[0] = 1, iarr[1] = ival
470
iarr = malloc(2*sizeof(int));
479
TYPE *GetTypeList1(const SCALAR val)
481
* Returns a TYPE array with arr[0] = 1.0, arr[1] = val
485
arr = malloc(ATL_MulBySize(2));
499
int *GetIntList2(int ival1, int ival2)
501
* returns integer array with iarr[0] = 1, iarr[1] = ival1, ival[2] = ival2
505
iarr = malloc(3*sizeof(int));
514
int *DupIntList(int *list)
516
* Duplicates list of integers, list[0] holds the length, not including 0
523
ip = malloc(sizeof(int)*n);
525
for (i=0; i < n; i++)
531
int *GetIntList(int nargs, char **args, int i, int nmul)
533
* Gets a list of integers, whose length is given by atoi(args[i])*nmul
534
* list is this length+1, since 0'th location gets atoi(args[i])
540
PrintUsage(args[0], i, NULL);
541
n = atoi(args[i]) * nmul;
543
iarr = malloc(sizeof(int)*(n+1));
547
for (k=0; k < n; k++)
550
PrintUsage(args[0], i, NULL);
551
iarr[k+1] = atoi(args[i]);
558
TYPE *GetTypeList(int nargs, char **args, int i, int nmul)
560
* Gets a list of TYPEs, whose length is given by atoi(args[i])*nmul
561
* list is this length+1, since 0'th location gets atof(args[i])
568
PrintUsage(args[0], i, NULL);
569
n = atoi(args[i]) * nmul;
571
arr = malloc(ATL_MulBySize(n+1));
575
for (k=0; k < n; k++)
578
PrintUsage(args[0], i, NULL);
579
arr[k+(1 SHIFT)] = atof(args[i]);
586
int *IntRange2IntList(int N0, int NN, int incN)
591
for (i=N0, n=0; i <= NN; i += incN) n++;
592
iarr = malloc(sizeof(int)*(n+1));
595
for (i=N0, n=1 ; i <= NN; i += incN, n++)
600
int GetFlags(int nargs, char **args, int **Ms, int **Ns, int **LDAs,
601
int **incYs, int **incXs, TYPE **ALPHAs, TYPE **BETAs)
607
const TYPE one[2] = {ATL_rone, ATL_rzero};
609
const TYPE one = ATL_rone;
612
*Ns = *Ms = *LDAs = *incYs = *incXs = NULL;
613
*ALPHAs = *BETAs = NULL;
616
for (i=1; i < nargs; i++)
618
if (args[i][0] != '-')
619
PrintUsage(args[0], i, args[i]);
625
PrintUsage(args[0], i-1, "out of flags in -g ");
626
verb = atoi(args[i]);
630
PrintUsage(args[0], i-1, "out of flags in -g ");
631
ldagap = atoi(args[i]);
636
PrintUsage(args[0], i-1, "out of flags in -N/M ");
637
ip = IntRange2IntList(atoi(args[i+1]),atoi(args[i+2]),atoi(args[i+3]));
649
ip = GetIntList(nargs, args, i, 1);
671
*ALPHAs = GetTypeList(nargs, args, i, 1 SHIFT);
672
i += (((int)((*ALPHAs)[0]))SHIFT)+1;
675
*BETAs = GetTypeList(nargs, args, i, 1 SHIFT);
676
i += (((int)((*BETAs)[0]))SHIFT)+1;
679
PrintUsage(args[0], i, args[i]);
683
*incXs = GetIntList1(1);
685
*incYs = GetIntList1(1);
687
*ALPHAs = GetTypeList1(one);
689
*BETAs = GetTypeList1(one);
691
*Ms = GetIntList1(977);
693
*Ns = GetIntList1(77);
696
*LDAs = DupIntList(*Ms);
697
for (i=1; i <= (*LDAs)[0]; i++)
698
(*LDAs)[i] += ldagap;
700
assert((*LDAs)[0] == (*Ms)[0]);
704
int main(int nargs, char **args)
706
int *Ms, *Ns, *LDAs, *incYs, *incXs, *CONJs;
708
TYPE *ALPHAs, *BETAs;
710
verb = GetFlags(nargs, args, &Ms, &Ns, &LDAs, &incYs, &incXs,&ALPHAs,&BETAs);
711
ierr = RunTests(verb, Ms, Ns, incXs, incYs, LDAs, ALPHAs, BETAs);