2
* Automatically Tuned Linear Algebra Software v3.10.1
3
* Copyright (C) 2011 R. Clint Whaley
5
* Redistribution and use in source and binary forms, with or without
6
* modification, are permitted provided that the following conditions
8
* 1. Redistributions of source code must retain the above copyright
9
* notice, this list of conditions and the following disclaimer.
10
* 2. Redistributions in binary form must reproduce the above copyright
11
* notice, this list of conditions, and the following disclaimer in the
12
* documentation and/or other materials provided with the distribution.
13
* 3. The name of the ATLAS group or the names of its contributers may
14
* not be used to endorse or promote products derived from this
15
* software without specific written permission.
17
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
19
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
20
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ATLAS GROUP OR ITS CONTRIBUTORS
21
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27
* POSSIBILITY OF SUCH DAMAGE.
30
#include "atlas_misc.h"
31
#include "atlas_tst.h"
32
#include "atlas_lvl2.h"
33
#include "atlas_level1.h"
35
int FAx=0, MAx=0, FAy=0, MAy=0, FAa=0, MAa=0;
43
struct FA_allocs *next;
46
struct FA_allocs *NewAlloc(size_t size, struct FA_allocs *next,
47
int align, int misalign)
49
* Allocates size allocation that is aligned to [align], but not aligned
50
* to [misalign]. Therefore, misalign > align. Align must minimally be sizeof
51
* while misalign may be 0 if we don't need to avoid a particular alignment.
58
const int malign = align >= misalign ? align : misalign;
60
n = size + align + align + malign;
64
cp = malloc(n + sizeof(struct FA_allocs));
66
ap = (struct FA_allocs *) (cp + n);
69
* Align to min alignment
71
ap->memA = align ? (void*) ((((size_t) cp)/align)*align + align) : cp;
73
* Misalign to misalign
74
* We often need to make sure to unaligned addresses share the same modulo
75
* so that they have the *same* degree of misalignment (so that their alignment
76
* can be fixed by simple peeling), and so in this case force the address
77
* modulo the misalign to be the exact align value.
80
ap->memA = (void*)((((size_t)ap->memA)/malign)*malign + malign + align);
86
* no-align malloc free retaining system default behavior
88
void *NA_malloc(size_t size)
92
void *NA_calloc(size_t n, size_t size)
94
return(calloc(n, size));
96
void NA_free(void *ptr)
103
* malloc/free pair that aligns data to align, but not to misalign
105
void *FA_malloc(size_t size, int align, int misalign)
107
if ((!misalign && align <= 8) || !size)
108
return(malloc(size));
111
allocQ = NewAlloc(size, allocQ, align, misalign);
112
return(allocQ->memA);
115
void *FA_calloc(size_t n, size_t size, int align, int misalign)
123
cp = FA_malloc(tsize, align, misalign);
124
if (size == sizeof(int))
125
for (ip=(int*)cp,i=0; i < n; i++)
127
else if (size == sizeof(double))
128
for (dp=(double*)cp,i=0; i < n; i++)
131
for (i=0; i < tsize; i++)
136
void FA_free(void *ptr, int align, int misalign)
138
* Part of malloc/free pair that aligns data to FALIGN
141
struct FA_allocs *ap, *prev;
144
if ((!misalign && align <= 8))
148
for (ap=allocQ; ap && ap->memA != ptr; ap = ap->next) prev = ap;
151
fprintf(stderr, "Couldn't find mem=%ld\nmemQ=\n", (size_t)ptr);
152
for (ap=allocQ; ap; ap = ap->next)
153
fprintf(stderr, " %ld, %ld\n", (size_t)ap->memA,
158
allocQ = allocQ->next;
160
prev->next = ap->next;
166
static void dumb_ger(int Conj, int M, int N, const SCALAR alpha, TYPE *X,
167
int incX, TYPE *Y, int incY, TYPE *A, int lda)
171
const TYPE ra = alpha[0], ia = alpha[1];
175
for (j=0; j < N; j++)
178
const TYPE al = alpha * Y[j*incY];
179
Mjoin(PATL,axpy)(M, al, X, incX, A+lda*j, 1);
181
const TYPE rY = Y[2*j*incY], iY = Y[2*j*incY+1];
182
tmp[0] = rY*ra - iY*ia;
183
tmp[1] = rY*ia + iY*ra;
184
if (Conj) tmp[1] = -tmp[1];
185
Mjoin(PATL,axpy)(M, tmp, X, incX, A+2*lda*j, 1);
189
static int CheckAns(int M, int N, TYPE *G, int ldg, TYPE *U, int ldu)
194
const int M2 = M, emul=4;
196
const int M2 = M<<1, emul=4*4;
197
ldg <<= 1; ldu <<= 1;
200
eps = Mjoin(PATL,epsilon)();
201
for (j=0; j < N; j++, G += ldg, U += ldu)
203
for (i=0; i < M2; i++)
206
if (diff < ATL_rzero) diff = -diff;
209
fprintf(stderr, "A(%d,%d): Good=%f, Computed=%f\n",
211
if (!ierr) ierr = i+j*M+1;
221
#define ATL_gerT Mjoin(PATL,geru)
223
#define ATL_gerT Mjoin(PATL,ger)
225
void ATL_gerT(ATL_CINT M, ATL_CINT N, const SCALAR alpha,
226
const TYPE *X, ATL_CINT incX, const TYPE *Y, ATL_CINT incY,
227
TYPE *A, ATL_CINT lda);
228
#define ATL_rS2C(sc_) \
229
(((sc_) == ATL_rzero) ? '0' : ( ((sc_) == ATL_rone) ? '1' : 'X'))
231
#define ATL_S2C(sc_) ATL_rS2C(sc_[0]), ATL_rS2C(sc_[1])
233
#define ATL_S2C(sc_) ATL_rS2C(sc_)
236
static int RunTest(int CONJ, int M, int N, int incY, int lda, int II)
239
TYPE one[2] = {ATL_rone, ATL_rzero};
241
const TYPE alpha[2] = {ATL_rzero, ATL_rzero};
242
#elif defined(ALPHAX)
243
const TYPE alpha[2] = {ATL_rone, ATL_rzero};
245
const TYPE alpha[2] = {0.5, 2.1};
250
const TYPE alpha = ATL_rzero;
251
#elif defined(ALPHAX)
252
const TYPE alpha = ATL_rone;
254
const TYPE alpha = 2.0;
257
TYPE *A, *A0, *X, *Y, *y;
258
ATL_CINT aincY = Mabs(incY), incX=1, aincX=1;
260
char *frm = "%6d %5d %5d %5d %4d %4d %c,%c %4x %4x %4x %6s\n";
262
char *frm = "%6d %5d %5d %5d %4d %4d %c %4x %4x %4x %6s\n";
269
" M N lda incY incX alp A X Y PASS?\n");
271
"====== ===== ===== ===== ==== ==== === ==== ==== ==== ======\n");
273
A = FA_malloc(ATL_MulBySize(lda)*N, FAa, MAa);
274
A0 = FA_malloc(ATL_MulBySize(M)*N, FAa, MAa);
275
Y = FA_malloc(ATL_MulBySize(NY)*aincY, FAy, MAy);
276
X = FA_malloc(ATL_MulBySize(NX), FAx, MAx);
277
ATL_assert(A && A0 && X && Y);
278
printf(frm, II, M, N, lda, incY, incX, ATL_S2C(alpha), ((size_t)A)&0xFFFF,
279
((size_t)X)&0xFFFF, ((size_t)Y)&0xFFFF, " START");
280
Mjoin(PATL,gegen)(1, NY, Y, aincY, NY*aincY);
281
Mjoin(PATL,gegen)(1, NX, X, aincX, NY*aincY+127*50+77);
282
Mjoin(PATL,gegen)(M, N, A0, M, N*M+513*7+90);
283
Mjoin(PATL,gegen)(M, N, A, lda, N*M+513*7+90);
284
if (incY < 0) Y += (NY-1) * (aincY SHIFT);
286
ATL_gerT(M, N, alpha, X, incX, Y, incY, A, lda);
287
dumb_ger(CONJ, M, N, alpha, X, 1, Y, incY, A0, M);
289
if (incY < 0) Y -= (N-1) * (aincY SHIFT);
290
FA_free(Y, FAy, MAy);
291
FA_free(X, FAx, MAx);
292
ierr = CheckAns(M, N, A0, M, A, lda);
293
FA_free(A, FAa, MAa);
294
FA_free(A0, FAa, MAa);
296
printf(frm, II, M, N, lda, incY, incX, ATL_S2C(alpha), ((size_t)A)&0xFFFF,
297
((size_t) X)&0xFFFF, ((size_t) Y)&0xFFFF, (ierr) ? "FAILED":"PASSED");
303
int RunTests(int verb, int *CONJs, int *Ms, int *Ns, int *incYs, int *ldas)
305
int iy, ix, ic, in, im, iax, iay, iaa;
306
ATL_INT m, n, lda, conj, incy;
308
assert(ldas[0] == Ms[0]);
309
for (in=1; in <= Ns[0]; in++)
312
for (im=1; im <= Ms[0]; im++)
316
for (iy=1; iy <= incYs[0]; iy++)
319
for (ic=1; ic <= CONJs[0]; ic++)
322
for (iaa=0; iaa < 8; iaa++)
324
FAa = iaa*sizeof(TYPE);
325
MAa = FAa + sizeof(TYPE);
326
for (iay=0; iay < 8; iay++)
328
FAy = iay*sizeof(TYPE);
329
MAy = FAy + sizeof(TYPE);
330
for (iax=0; iax < 8; iax++)
332
FAx = iax*sizeof(TYPE);
333
MAx = FAx + sizeof(TYPE);
334
nerr += RunTest(conj, m, n, incy, lda, II++);
345
printf("FAILED: %d of %d tests!\n", nerr, II);
347
printf("PASSED: all %d tests.\n", II);
351
void PrintUsage(char *name, int ierr, char *flag)
354
fprintf(stderr, "Bad argument #%d: '%s'\n",
355
ierr, flag ? flag : "Not enough arguments");
357
fprintf(stderr, "ERROR: %s\n", flag);
359
fprintf(stderr, "USAGE: %s [flags]:\n", name);
360
fprintf(stderr, " -n <#> <N1> ... <N#>\n");
361
fprintf(stderr, " -N <Nstart> <Nend> <Ninc>\n");
362
fprintf(stderr, " -m <#> <M1> ... <M#>\n");
363
fprintf(stderr, " -M <Mstart> <Mend> <Minc>\n");
364
fprintf(stderr, " -l <#> <lda1> ... <lda#>\n");
365
fprintf(stderr, " -g <ldagap> : lda = M + <ldagap> foreach M\n");
366
fprintf(stderr, " -y <#> <incY1> ... <incY#>\n");
367
fprintf(stderr, " -x <#> <incX1> ... <incX#>\n");
368
fprintf(stderr, " -C <#> <conj1> ... <conj#>\n");
370
" -v [0,1] : 0 - stop on first error, else keep testing\n");
372
" -F[x,y,a] <#> : if(# > 0) -> force op to be aligned to at least # bytes\n");
374
" if(# < 0) -> force op to be aligned to < # bytes.\n");
376
exit(ierr ? ierr : -1);
381
int *GetIntList1(int ival)
383
* returns integer array with iarr[0] = 1, iarr[1] = ival
387
iarr = malloc(2*sizeof(int));
396
TYPE *GetTypeList1(const SCALAR val)
398
* Returns a TYPE array with arr[0] = 1.0, arr[1] = val
402
arr = malloc(ATL_MulBySize(2));
416
int *GetIntList2(int ival1, int ival2)
418
* returns integer array with iarr[0] = 1, iarr[1] = ival1, ival[2] = ival2
422
iarr = malloc(3*sizeof(int));
431
int *DupIntList(int *list)
433
* Duplicates list of integers, list[0] holds the length, not including 0
440
ip = malloc(sizeof(int)*n);
442
for (i=0; i < n; i++)
448
int *GetIntList(int nargs, char **args, int i, int nmul)
450
* Gets a list of integers, whose length is given by atoi(args[i])*nmul
451
* list is this length+1, since 0'th location gets atoi(args[i])
457
PrintUsage(args[0], i, NULL);
458
n = atoi(args[i]) * nmul;
460
iarr = malloc(sizeof(int)*(n+1));
464
for (k=0; k < n; k++)
467
PrintUsage(args[0], i, NULL);
468
iarr[k+1] = atoi(args[i]);
475
TYPE *GetTypeList(int nargs, char **args, int i, int nmul)
477
* Gets a list of TYPEs, whose length is given by atoi(args[i])*nmul
478
* list is this length+1, since 0'th location gets atof(args[i])
485
PrintUsage(args[0], i, NULL);
486
n = atoi(args[i]) * nmul;
488
arr = malloc(ATL_MulBySize(n+1));
492
for (k=0; k < n; k++)
495
PrintUsage(args[0], i, NULL);
496
arr[k+(1 SHIFT)] = atof(args[i]);
503
int *IntRange2IntList(int N0, int NN, int incN)
508
for (i=N0, n=0; i <= NN; i += incN) n++;
509
iarr = malloc(sizeof(int)*(n+1));
512
for (i=N0, n=1 ; i <= NN; i += incN, n++)
517
int GetFlags(int nargs, char **args, int **CONJs, int **Ms, int **Ns,
518
int **LDAs, int **incYs, int **incXs)
524
*Ns = *Ms = *LDAs = *incYs = *incXs = *CONJs = NULL;
527
for (i=1; i < nargs; i++)
529
if (args[i][0] != '-')
530
PrintUsage(args[0], i, args[i]);
536
PrintUsage(args[0], i-1, "out of flags in -g ");
537
verb = atoi(args[i]);
541
PrintUsage(args[0], i-1, "out of flags in -g ");
542
ldagap = atoi(args[i]);
547
PrintUsage(args[0], i-1, "out of flags in -N/M ");
548
ip = IntRange2IntList(atoi(args[i+1]),atoi(args[i+2]),atoi(args[i+3]));
560
ip = GetIntList(nargs, args, i, 1);
582
PrintUsage(args[0], i, args[i]);
587
*CONJs = GetIntList2(0, 1);
589
*CONJs = GetIntList1(0);
592
*incXs = GetIntList1(1);
594
*incYs = GetIntList1(1);
596
*Ms = GetIntList1(977);
598
*Ns = GetIntList1(77);
601
*LDAs = DupIntList(*Ms);
602
for (i=1; i <= (*LDAs)[0]; i++)
603
(*LDAs)[i] += ldagap;
605
assert((*LDAs)[0] == (*Ms)[0]);
609
int main(int nargs, char **args)
611
int *Ms, *Ns, *LDAs, *incYs, *incXs, *CONJs;
614
verb = GetFlags(nargs, args, &CONJs, &Ms, &Ns, &LDAs, &incYs, &incXs);
615
ierr = RunTests(verb, CONJs, Ms, Ns, incYs, LDAs);