1
#include "atlas_ptmisc.h"
3
/*
 * dumb_rand: cheap pseudo-random double in [-0.5, 0.5], derived from the
 * C library rand() (which yields integers in 0..RAND_MAX).
 */
#define dumb_rand() (0.5 - (double)rand() / (double)RAND_MAX)
11
void KMM(const int, const int, const int, const SCALAR, const TYPE*,
12
const int, const TYPE*, const int, const SCALAR, TYPE*, const int);
15
/*
 * Describes one kernel timing and carries its result.  One copy is handed to
 * each timing thread; the thread stores the measured rate in mf.
 */
struct kmm_struct
{
   int mb, nb, kb;         /* C: mbxnb, At: kbxmb, B: kbXnb */
   int movA, movB, movC;   /* which mat move in flush array? */
   int FLSIZE;             /* min area to move in in bytes */
   int reps;               /* # calls to kmm in one timing */
   int LDC;                /* what should ldc be set to? */
   int iam;                /* thread rank */
   int p;                  /* total number of threads */
   double mf;              /* mflop returned by timing */
};
27
/*
 * Fragment of the kernel-timing routine that TimeOnCore invokes as
 * GetKmmMflop(mb, nb, kb, movA, movB, movC, FLSIZE, reps, LDC).
 * NOTE(review): the line carrying the return type and function name, the
 * braces, and a number of body statements are absent from this listing (the
 * embedded line numbers skip), so the comments added below describe only the
 * statements that are actually visible.
 */
CINT mb, CINT nb, CINT kb, /* C: mbxnb, At: kbxmb, B: kbXnb */
28
CINT movA, CINT movB, CINT movC, /* which mat move in flush array? */
29
int FLSIZE, /* min area to move in in bytes */
30
CINT reps, /* # calls to kmm in one timing */
31
CINT LDC /* what should ldc be set to? */
34
* Returns MFLOP rate of matmul kernel KMM
35
* LDC: if (LDC == 0), then set ldc=MB for timings.
36
* if (LDC != 0 && movC != 0), then ldc= col length in move space
41
/* NOMOVE is nonzero only when none of the three move flags is set. */
const int NOMOVE = !(movA|movB|movC);
42
int ldc, setsz, nset, i, j, incA, incB, incC, n, extra;
43
TYPE *C, *A, *B, *a, *b, *c;
51
/*
 * Single working set (presumably the NOMOVE case -- the guarding condition
 * is not visible): ldc falls back to mb when LDC is 0, and setsz counts the
 * elements of one C (ldc x nb), one A (kb x mb) and one B (kb x nb).
 */
ldc = (LDC) ? LDC : mb;
52
setsz = (ldc * nb + kb*(mb+nb));
53
/* ATL_MulBySize presumably converts an element count to bytes -- confirm. */
vp = malloc(ATL_Cachelen + ATL_MulBySize(setsz));
58
/* Fill the whole working set with values produced by dumb_rand(). */
for (i=0; i < setsz; i++) A[i] = dumb_rand();
59
/* All three increments are zero here. */
incA = incB = incC = 0;
63
if (movA && movB && movC) /* no reuse at all */
65
/* Size of one A/B/C set, then the number of sets needed to cover FLSIZE
 * (rounded up). */
setsz = ATL_MulBySize(mb*nb+kb*(mb+nb));
66
nset = (FLSIZE+setsz-1)/setsz;
68
/* setsz is switched back to an element count. */
setsz = mb*nb+kb*(mb+nb);
69
vp = malloc(ATL_Cachelen + ATL_MulBySize(setsz));
74
ldc = (LDC) ? mb*nset : mb;
75
/*
 * NOTE(review): the visible malloc above is sized from setsz (one set) while
 * this loop stores setsz*nset elements.  Assuming A points into that buffer,
 * and unless a statement missing from this listing accounts for it, this
 * writes past the allocation whenever nset > 1 -- confirm.
 */
for (n=setsz*nset,i=0; i < n; i++) A[i] = dumb_rand();
80
else if (movA && movB && !movC) /* square-case ATLAS behavior */
83
ldc = (LDC) ? LDC : mb;
84
/* A caller-supplied ldc must be at least the row count mb. */
ATL_assert(ldc >= mb);
90
else if (!movB && movA && movC) /* rank-K behavior */
100
/* Diagnostic for a move-flag combination none of the branches handles
 * (presumably the final else; its remaining arguments are not visible). */
fprintf(stderr, "%s,%d: What case are you wanting?\n",
106
/* i = size of one set; nset = number of sets covering FLSIZE, rounded up. */
i = ATL_MulBySize(setsz);
107
nset = (FLSIZE+i-1)/i;
109
vp = malloc(ATL_Cachelen + ATL_MulBySize(FLSIZE+extra));
111
/* A is taken from the raw allocation via ATL_AlignPtr. */
A = ATL_AlignPtr(vp);
115
ldc = (LDC) ? mb*nset : mb;
123
for (n=setsz*nset+extra,i=0; i < n; i++) A[i] = dumb_rand();
128
/* Timing loop header: counts i down from reps to 0, with j starting at 0. */
for (j=0,i=reps; i; i--)
130
/* Both A and B are passed with leading dimension kb. */
KMM(mb, nb, kb, alpha, a, kb, b, kb, beta, c, ldc);
139
/* Flip the sign of a nonzero beta; a zero beta stays zero. */
beta = (beta != 0.0) ? -beta : 0.0;
144
/* Elapsed time since t0 as reported by ATL_walltime (presumably seconds). */
t1 = ATL_walltime() - t0;
145
/* Rate: 2*mb*nb*kb floating-point operations per call, over reps calls,
 * divided by (t1 * 1e6). */
mf = (2.0*reps*mb*nb*kb) / (t1*1000000.0);
150
/*
 * Thread start routine (it is the function handed to pthread_create in
 * TimeOnCores).  vp is treated as a struct kmm_struct; the timing it
 * describes is run through GetKmmMflop and the result is stored in its
 * mf field.
 * NOTE(review): the braces and the return statement are not present in this
 * listing.
 */
void *TimeOnCore(void *vp)
152
struct kmm_struct *kp = vp;
154
kp->mf = GetKmmMflop(kp->mb, kp->nb, kp->kb, kp->movA, kp->movB, kp->movC,
155
kp->FLSIZE, kp->reps, kp->LDC);
159
/*
 * Runs the timing described by kb on p threads and gathers one mflop figure
 * per thread into a freshly malloc'ed array of doubles.
 * NOTE(review): several declarations (threads, mflops, i, p), the set-up of
 * each pthread attribute and of cpuset, the braces and the return statement
 * are not present in this listing; presumably the mflops array is what is
 * returned -- confirm.
 */
double *TimeOnCores(struct kmm_struct *kb)
161
struct kmm_struct *kp;
163
pthread_attr_t *attr;
164
unsigned long cpuset;
169
/* One problem description, thread handle, attribute object and result slot
 * per thread. */
kp = malloc(sizeof(struct kmm_struct)*p);
170
threads = malloc(sizeof(pthread_t)*p);
171
attr = malloc(sizeof(pthread_attr_t)*p);
172
mflops = malloc(sizeof(double)*p);
173
ATL_assert(kp && threads && attr && mflops);
174
for (i=0; i < p; i++)
176
/* Every thread gets its own copy of the caller's description. */
memcpy(kp+i, kb, sizeof(struct kmm_struct));
179
/*
 * NOTE(review): pthread_attr_setaffinity_np is documented as taking a
 * cpu_set_t; an unsigned long is passed here -- confirm this is intended.
 * The return values of this call and of pthread_create are not checked.
 */
pthread_attr_setaffinity_np(attr+i, sizeof(cpuset), &cpuset);
180
pthread_create(threads+i, attr+i, TimeOnCore, kp+i);
182
/* Wait for each thread, then collect the rate it stored in its own copy. */
for (i=0; i < p; i++)
184
pthread_join(threads[i], NULL);
185
mflops[i] = kp[i].mf;
193
/*
 * GetStat: computes the minimum, maximum and arithmetic mean of d[0..n-1].
 *
 *   n   : number of entries in d
 *   d   : values to summarize
 *   min : OUTPUT, smallest entry
 *   max : OUTPUT, largest entry
 *   avg : OUTPUT, sum of the entries divided by n
 *
 * If there is nothing to summarize (n <= 0 or d is NULL) all three outputs
 * are set to 0.0 instead of reading d[0].
 */
void GetStat(int n, double *d, double *min, double *max, double *avg)
{
   double dmin, dmax, dsum;
   int i;

   if (n <= 0 || !d)
   {
      *min = *max = *avg = 0.0;
      return;
   }
   /* Seed all three accumulators with the first entry, then fold in the rest */
   dmin = dmax = dsum = d[0];
   for (i=1; i < n; i++)
   {
      dmax = (dmax >= d[i]) ? dmax : d[i];
      dmin = (dmin <= d[i]) ? dmin : d[i];
      dsum += d[i];
   }
   *min = dmin;
   *max = dmax;
   *avg = dsum / (double)n;
}
210
/*
 * PrintUsage: reports a command-line error on stderr, lists the supported
 * flags, and terminates the program.  Does not return.
 *
 *   name : program name shown in the USAGE line
 *   iarg : index of the offending argument; also used as the exit status
 *          (an index of 0 exits with -1 instead)
 *   arg  : text of the offending argument, or NULL to print "unknown"
 *
 * NOTE(review): GetFlags also parses an option that sets the thread count
 * (kp->p) which is not listed below -- confirm its flag letter and add it.
 */
void PrintUsage(char *name, int iarg, char *arg)
{
   fprintf(stderr, "\nERROR around arg %d (%s).\n", iarg, arg ? arg:"unknown");
   fprintf(stderr, "USAGE: %s [flags], where flags are:\n", name);
   fprintf(stderr, " -B <#> : mb = nb = kb = #\n");
   fprintf(stderr, " -m <#> : mb = #\n");
   fprintf(stderr, " -n <#> : nb = #\n");
   fprintf(stderr, " -k <#> : kb = #\n");
   fprintf(stderr, " -r <#> : set the # of times to call KMM\n");
   fprintf(stderr, " -F <kb> : set flush size in kilobytes\n");
   fprintf(stderr, " -C <#> : set ldc; 0 means mb\n");
   fprintf(stderr, " -M[a,b,c] <#> : mov[A,B,C] = #\n");
   exit(iarg ? iarg : -1);
}
225
/*
 * Builds a malloc'ed struct kmm_struct from the command line.  Block sizes
 * default to 40 and the move flags to 0; each recognized option then
 * overwrites one or more fields with atoi() of the argument that follows it.
 * Any problem is reported through PrintUsage, which exits the program.
 * NOTE(review): the option-dispatch lines (which flag letter selects which
 * assignment), the bounds checks guarding the "out of arguments" calls, the
 * braces and the return statement are not present in this listing.
 * NOTE(review): atoi() gives no way to detect malformed numbers.
 */
struct kmm_struct *GetFlags(int nargs, char **args)
227
struct kmm_struct *kp;
230
kp = malloc(sizeof(struct kmm_struct));
233
/* Defaults: 40x40x40 blocks, nothing moves. */
kp->mb = kp->nb = kp->kb = 40;
234
kp->movA = kp->movB = kp->movC = 0;
238
for (i=1; i < nargs; i++)
240
/* Every option must begin with '-'. */
if (args[i][0] != '-')
241
PrintUsage(args[0], i, args[i]);
246
PrintUsage(args[0], i, "out of arguments");
247
/* The value is scaled by 1024 (the usage text gives the flush size in
 * kilobytes; FLSIZE is documented as bytes). */
kp->FLSIZE = atoi(args[i]) * 1024;
251
PrintUsage(args[0], i, "out of arguments");
252
kp->LDC = atoi(args[i]);
256
PrintUsage(args[0], i, "out of arguments");
257
kp->reps = atoi(args[i]);
261
PrintUsage(args[0], i, "out of arguments");
262
/* Thread count; this option does not appear in PrintUsage's flag list. */
kp->p = atoi(args[i]);
266
PrintUsage(args[0], i, "out of arguments");
267
kp->mb = atoi(args[i]);
271
PrintUsage(args[0], i, "out of arguments");
272
kp->nb = atoi(args[i]);
276
PrintUsage(args[0], i, "out of arguments");
277
kp->kb = atoi(args[i]);
281
PrintUsage(args[0], i, "out of arguments");
282
/* One value sets all three block dimensions. */
kp->mb = kp->nb = kp->kb = atoi(args[i]);
286
PrintUsage(args[0], i, "out of arguments");
291
/* Move flags: presumably selected by the letter following -M (the
 * selecting lines are not visible). */
kp->movC = atoi(args[i]);
295
kp->movB = atoi(args[i]);
299
kp->movA = atoi(args[i]);
302
/* i-1 is reported here, i.e. the argument before the current one. */
PrintUsage(args[0], i-1, "unknown mov matrix");
306
PrintUsage(args[0], i, args[i]);
312
/*
 * Program entry: parses the flags, times the kernel on the requested
 * threads, and prints the min/max/average rate, the per-thread rates, and
 * finally the average alone with two decimals.
 * NOTE(review): the declarations of dp, i and p, the statement that gives p
 * its value, the braces and the return statement are not present in this
 * listing.
 */
int main(int nargs, char **args)
314
struct kmm_struct *kp;
317
double min, max, avg;
318
FILE *fpout = stdout;
320
kp = GetFlags(nargs, args);
322
/* dp receives the array of per-thread results. */
dp = TimeOnCores(kp);
324
GetStat(p, dp, &min, &max, &avg);
325
fprintf(fpout, "ALL CORES: min=%le, max=%le, avg=%le\n", min, max, avg);
326
/* dp[0] is printed unconditionally; the loop appends the remaining ones. */
fprintf(fpout, "PER-CORE: %le", dp[0]);
327
for (i=1; i < p; i++)
328
fprintf(fpout, ", %le", dp[i]);
329
fprintf(fpout, "\n\n%.2f\n", avg);