1
#include "atlas_taffinity.h"
2
#include "atlas_threads.h"
4
#include "atlas_misc.h"
6
/*
 * Prototype for the launcher used by the no-affinity timing loop in main.
 * main calls it with the same argument list it passes to ATL_goparallel:
 * a thread count, the work routine, the per-thread problem array, and NULL
 * for the combine routine.
 * NOTE(review): presumably declared here because no included header exports
 * it -- confirm against atlas_threads.h.
 */
void ATL_goparallel_noaff
7
(const unsigned int P, void *DoWork, void *opstruct, void *DoComb);
11
size_t nflops; /* number of flops to perform */
12
volatile double *V; /* 16-length array of zeros */
19
size_t nflops, /* how many flops to do */
20
volatile double *V /* 16-length array of zeros */
23
* This routine emulates an in-cache 4x4 GEMM, but using only 16 registers
24
* V is declared volatile so compiler doesn't get rid of the loop.
28
/*
 * Visible part of the InCacheGemm body (the work routines call it as
 * InCacheGemm(tp->nflops, tp->V)).  Eight accumulators c0..c7 and eight
 * operand registers a0..a3, b0..b3 give the 16 register variables.
 */
register double c0, c1, c2, c3, c4, c5, c6, c7;
29
register double a0, a1, a2, a3, b0, b1, b2, b3;
33
/* Clear every accumulator before entering the loop. */
c0 = c1 = c2 = c3 = c4 = c5 = c6 = c7 = ATL_rzero;
34
/*
 * The trip count is nflops>>5, i.e. nflops/32 rounded down.
 * NOTE(review): presumably one pass through the loop body performs 32 flops
 * (16 multiply-adds); only part of the body is visible here -- confirm.
 */
for (i=(nflops>>5); i; i--)
36
/*
 * Each visible statement pair couples one multiply-add into an accumulator
 * with a reload of an operand register from the volatile array V.
 */
c0 += a0*b0; a3 = V[3];
38
c2 += a2*b0; b1 = V[5];
40
c4 += a0*b1; b2 = V[6];
44
c0 += a0*b2; b3 = V[7];
46
c2 += a2*b2; b0 = V[4];
49
c5 += a1*b3; a1 = V[1];
50
c6 += a2*b3; a2 = V[2];
53
/* Store the eight accumulators back into V[0] through V[7]. */
*V = c0; V[1] = c1; V[2] = c2; V[3] = c3;
54
V[4] = c4; V[5] = c5; V[6] = c6; V[7] = c7;
57
/*
 * Work routine taking a launch structure.  The visible lines fetch the
 * ATL_TUNE_T problem description from lp->opstruct and run InCacheGemm with
 * its flop count and workspace.  vp is not referenced in the visible lines.
 * NOTE(review): the visible lines match TuneDoWork exactly, and main does not
 * reference this routine in the visible code -- confirm how the two differ.
 */
void TuneDoWork_gp(ATL_LAUNCHSTRUCT_t *lp, void *vp)
59
ATL_TUNE_T *tp = lp->opstruct;
61
InCacheGemm(tp->nflops, tp->V);
64
/*
 * Work routine that main hands to both ATL_goparallel and
 * ATL_goparallel_noaff.  The visible lines fetch the ATL_TUNE_T problem
 * description from lp->opstruct and run InCacheGemm with its flop count and
 * workspace.  vp is not referenced in the visible lines.
 */
void TuneDoWork(ATL_LAUNCHSTRUCT_t *lp, void *vp)
66
ATL_TUNE_T *tp = lp->opstruct;
68
InCacheGemm(tp->nflops, tp->V);
71
/*
 * Writes the command-line usage summary to stderr.
 * NOTE(review): the argument list of the fprintf call is cut off in this
 * view; presumably exe fills the %s conversion -- confirm.
 */
void PrintUsage(char *exe)
73
fprintf(stderr, "USAGE: %s [-r <reps>] -m/k/f [m/k/flops] -o outfile\n",
78
/*
 * Parses the command line.  The flop count is returned through nflop and the
 * output file name through outfile; main stores the int return value in
 * nreps.
 * NOTE(review): most of the per-flag handling is not visible in this view.
 */
int GetFlags(int nargs, char **args, size_t *nflop, char **outfile)
83
/* Default problem size: 2*300*300*300 = 54,000,000 flops. */
*nflop = 2*300 * 300 * 300; /* emulate 300x300 DGEMM */
84
/* Scan every argument after the program name. */
for (i=1; i < nargs; i++)
87
/*
 * Arguments that do not begin with a dash take this branch.
 * NOTE(review): the action taken is not visible; presumably usage is printed.
 */
if (args[i][0] != '-')
103
/*
 * The flop count is the parsed integer scaled by imul.
 * NOTE(review): atoll reports no conversion errors, so malformed input is
 * silently read as 0; imul is set in lines not visible here.
 */
*nflop = atoll(args[i]) * imul;
116
/*
 * Times nreps parallel launches of TuneDoWork through ATL_goparallel and
 * again through ATL_goparallel_noaff, prints both times and their ratio, and,
 * when an output file was requested, writes header files recording whether
 * affinity should be used.
 */
int main(int nargs, char **args)
118
/*
 * Builds without ATL_OMP_THREADS run the timing experiment.
 * NOTE(review): the matching else and endif directives are not visible here.
 */
#ifndef ATL_OMP_THREADS
120
int i, k, nreps = 200, opstride, which;
121
double t0, taff, tnoa;
122
ATL_TUNE_T ta[ATL_NTHREADS];
124
void *vp[ATL_NTHREADS];
129
nreps = GetFlags(nargs, args, &nflops, &outfile);
131
/* Fill in one problem description per thread. */
for (i=0; i < ATL_NTHREADS; i++)
134
ta[i].nthr = ATL_NTHREADS;
135
ta[i].nflops = nflops;
136
/*
 * Room for 16 doubles plus ATL_Cachelen spare bytes; vp[i] keeps the raw
 * pointer while ta[i].V receives the pointer returned by ATL_AlignPtr.
 * NOTE(review): presumably the spare bytes leave room for alignment; the
 * malloc result is not checked in the visible lines.
 */
vp[i] = malloc(sizeof(double)*16 + ATL_Cachelen);
138
ta[i].V = ATL_AlignPtr(vp[i]);
139
ATL_dzero(16, (double*)ta[i].V, 1); /* zero w/o telling compiler */
141
/* Byte distance between consecutive entries of ta. */
opstride = (int) ( ((char*)(ta+1)) - (char*)(ta) );
143
printf("FINDING WHETHER AFFINITY IS HELPFUL USING FLOPS=%e NREPS=%d\n",
144
(double)nflops, nreps);
147
/*
 * First timed run: nreps launches through ATL_goparallel.
 * NOTE(review): t0 is presumably set from ATL_walltime just before this loop;
 * that line is not visible.
 */
for (k=0; k < nreps; k++)
148
ATL_goparallel(ATL_NTHREADS, TuneDoWork, ta, NULL);
149
taff = ATL_walltime() - t0;
150
printf(" Affinity time = %e\n", (float)taff);
153
/*
 * Second timed run: the same work through ATL_goparallel_noaff.
 * NOTE(review): t0 must be reset before this loop; that line is not visible.
 */
for (k=0; k < nreps; k++)
154
ATL_goparallel_noaff(ATL_NTHREADS, TuneDoWork, ta, NULL);
155
tnoa = ATL_walltime() - t0;
156
printf(" NO affinity time = %e\n", (float)tnoa);
158
/* A ratio above 1 means the affinity run was the faster one. */
printf("Affinity speedup = %.2f\n", (float)(tnoa / taff));
160
/*
 * NOTE(review): the body of this per-thread loop is not visible; presumably
 * it frees each vp[i] -- confirm.
 */
for (i=0; i < ATL_NTHREADS; i++)
163
if (outfile) /* if this is a real run where we want to change things */
165
/*
 * Affinity is rejected only when the no-affinity time, inflated by 4%, is
 * still below the affinity time.
 */
if (tnoa*1.04 < taff)
169
"Affinity is not helpful on your system, forcing ATLAS not to use it\n");
170
/*
 * Write a guarded header defining ATL_NOAFFINITY to the requested file.
 * NOTE(review): the fopen result is not checked in the visible lines, so a
 * failed open would pass NULL to fprintf.
 */
fpout = fopen(outfile, "w");
172
fprintf(fpout, "#ifndef ATL_TAFFINITY_H\n #define ATL_TAFFINITY_H\n");
173
fprintf(fpout, " #define ATL_NOAFFINITY 1\n");
174
fprintf(fpout, "#endif\n");
176
/* Record the negative result in res/aff.h as well. */
fpout = fopen("res/aff.h", "w");
177
fprintf(fpout, "#define ATL_TAFFINITY 0\n");
180
else /* affinity was a win */
183
fpout = fopen("res/aff.h", "w");
184
fprintf(fpout, "#define ATL_TAFFINITY 1\n");
194
/*
 * NOTE(review): from here on this is presumably the ATL_OMP_THREADS branch of
 * the conditional opened above.  The visible lines run no timing and write
 * the same no-affinity headers as the rejection path above.
 */
nreps = GetFlags(nargs, args, &nflops, &outfile);
198
"Not good idea to set affinity wt OpenMP; forcing ATLAS not to use it\n");
199
/* NOTE(review): fopen results are unchecked here too. */
fpout = fopen(outfile, "w");
201
fprintf(fpout, "#ifndef ATL_TAFFINITY_H\n #define ATL_TAFFINITY_H\n");
202
fprintf(fpout, " #define ATL_NOAFFINITY 1\n");
203
fprintf(fpout, "#endif\n");
205
fpout = fopen("res/aff.h", "w");
206
fprintf(fpout, "#define ATL_TAFFINITY 0\n");