/*
 * Copyright (c) 2003, 2006 Matteo Frigo
 * Copyright (c) 2003, 2006 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

/* Lots of ugly duplication from verify-lib.c, plus lots of ugliness in
   general for all of the r2r variants...oh well, for now */
32
bench_tensor *totalsz;
34
bench_tensor *pckdvecsz;
41
/* absolute value of a double */
static double dabs(double x) { return (x >= 0.0) ? x : -x; }
42
/* smaller of two doubles */
static double dmin(double x, double y) { return (y > x) ? x : y; }
44
static double raerror(R *a, R *b, int n)
47
/* compute the relative Linf error */
48
double e = 0.0, mag = 0.0;
51
for (i = 0; i < n; ++i) {
52
e = dmax(e, dabs(a[i] - b[i]));
53
mag = dmax(mag, dmin(dabs(a[i]), dabs(b[i])));
55
if (dabs(mag) < 1e-14 && dabs(e) < 1e-14)
61
BENCH_ASSERT(!isnan(e));
68
/* by2pi(m, n): the angle 2*pi*m/n in radians (K2PI is 2*pi, defined elsewhere) */
#define by2pi(m, n) ((K2PI * (m)) / (n))
71
* Improve accuracy by reducing x to range [0..1/8]
72
* before multiplication by 2 * PI.
75
static trigreal bench_sincos(trigreal m, trigreal n, int sinp)
77
/* waiting for C to get tail recursion... */
78
trigreal half_n = n * 0.5;
79
trigreal quarter_n = half_n * 0.5;
80
trigreal eighth_n = quarter_n * 0.5;
85
if (m < 0) { m = -m; /* goto cos; */ }
86
if (m > half_n) { m = n - m; goto cos; }
87
if (m > eighth_n) { m = quarter_n - m; goto sin; }
88
return sgn * COS(by2pi(m, n));
93
if (m < 0) { m = -m; goto msin; }
94
if (m > half_n) { m = n - m; goto msin; }
95
if (m > eighth_n) { m = quarter_n - m; goto cos; }
96
return sgn * SIN(by2pi(m, n));
99
static trigreal cos2pi(int m, int n)
101
return bench_sincos((trigreal)m, (trigreal)n, 0);
104
static trigreal sin2pi(int m, int n)
106
return bench_sincos((trigreal)m, (trigreal)n, 1);
109
/* cos(2*pi*i*j/n): base "00" cosine kernel (no half-sample shifts) */
static trigreal cos00(int i, int j, int n)
{
     return cos2pi(i * j, n);
}
114
/* "01" cosine kernel: output index shifted by half a sample */
static trigreal cos01(int i, int j, int n)
{
     return cos00(i, 2*j + 1, 2*n);
}
119
/* "10" cosine kernel: input index shifted by half a sample */
static trigreal cos10(int i, int j, int n)
{
     return cos00(2*i + 1, j, 2*n);
}
124
/* "11" cosine kernel: both input and output indices shifted by half a sample */
static trigreal cos11(int i, int j, int n)
{
     return cos00(2*i + 1, 2*j + 1, 4*n);
}
129
/* sin(2*pi*i*j/n): base "00" sine kernel (no half-sample shifts) */
static trigreal sin00(int i, int j, int n)
{
     return sin2pi(i * j, n);
}
134
/* "01" sine kernel: output index shifted by half a sample */
static trigreal sin01(int i, int j, int n)
{
     return sin00(i, 2*j + 1, 2*n);
}
139
/* "10" sine kernel: input index shifted by half a sample */
static trigreal sin10(int i, int j, int n)
{
     return sin00(2*i + 1, j, 2*n);
}
144
/* "11" sine kernel: both input and output indices shifted by half a sample */
static trigreal sin11(int i, int j, int n)
{
     return sin00(2*i + 1, 2*j + 1, 4*n);
}
149
static trigreal realhalf(int i, int j, int n)
158
static trigreal coshalf(int i, int j, int n)
161
return cos00(i, j, n);
163
return cos00(i, n - j, n);
166
static trigreal unity(int i, int j, int n)
174
typedef trigreal (*trigfun)(int, int, int);
176
static void rarand(R *a, int n)
180
/* generate random inputs */
181
for (i = 0; i < n; ++i) {
187
static void raadd(R *c, R *a, R *b, int n)
191
for (i = 0; i < n; ++i) {
197
static void rasub(R *c, R *a, R *b, int n)
201
for (i = 0; i < n; ++i) {
206
/* B = rotate left A + rotate right A */
207
static void rarolr(R *b, R *a, int n, int nb, int na,
210
int isL0 = 0, isL1 = 0, isR0 = 0, isR1 = 0;
213
for (ib = 0; ib < nb; ++ib) {
214
for (i = 0; i < n - 1; ++i)
215
for (ia = 0; ia < na; ++ia)
216
b[(ib * n + i) * na + ia] =
217
a[(ib * n + i + 1) * na + ia];
219
/* ugly switch to do boundary conditions for various r2r types */
221
/* periodic boundaries */
224
for (ia = 0; ia < na; ++ia) {
225
b[(ib * n + n - 1) * na + ia] =
226
a[(ib * n + 0) * na + ia];
227
b[(ib * n + 0) * na + ia] +=
228
a[(ib * n + n - 1) * na + ia];
232
case R2R_HC2R: /* ugh (hermitian halfcomplex boundaries) */
235
for (ia = 0; ia < na; ++ia) {
236
b[(ib * n + n - 1) * na + ia] = 0.0;
237
b[(ib * n + 0) * na + ia] +=
238
a[(ib * n + 1) * na + ia];
239
b[(ib * n + n/2) * na + ia] +=
240
+ a[(ib * n + n/2 - 1) * na + ia]
241
- a[(ib * n + n/2 + 1) * na + ia];
242
b[(ib * n + n/2 + 1) * na + ia] +=
243
- a[(ib * n + n/2) * na + ia];
246
for (ia = 0; ia < na; ++ia) {
247
b[(ib * n + n - 1) * na + ia] = 0.0;
248
b[(ib * n + 0) * na + ia] +=
249
a[(ib * n + 1) * na + ia];
250
b[(ib * n + n/2) * na + ia] +=
251
+ a[(ib * n + n/2) * na + ia]
252
- a[(ib * n + n/2 + 1) * na + ia];
253
b[(ib * n + n/2 + 1) * na + ia] +=
254
- a[(ib * n + n/2 + 1) * na + ia]
255
- a[(ib * n + n/2) * na + ia];
257
} else /* n <= 2 */ {
258
for (ia = 0; ia < na; ++ia) {
259
b[(ib * n + n - 1) * na + ia] =
260
a[(ib * n + 0) * na + ia];
261
b[(ib * n + 0) * na + ia] +=
262
a[(ib * n + n - 1) * na + ia];
267
/* various even/odd boundary conditions */
296
for (ia = 0; ia < na; ++ia)
297
b[(ib * n + n - 1) * na + ia] =
298
isR0 * a[(ib * n + n - 1) * na + ia]
299
+ (n > 1 ? isR1 * a[(ib * n + n - 2) * na + ia]
302
for (ia = 0; ia < na; ++ia)
303
b[(ib * n) * na + ia] +=
304
isL0 * a[(ib * n) * na + ia]
305
+ (n > 1 ? isL1 * a[(ib * n + 1) * na + ia] : 0);
309
for (i = 1; i < n; ++i)
310
for (ia = 0; ia < na; ++ia)
311
b[(ib * n + i) * na + ia] +=
312
a[(ib * n + i - 1) * na + ia];
316
static void raphase_shift(R *b, R *a, int n, int nb, int na,
317
int n0, int k0, trigfun t)
321
for (jb = 0; jb < nb; ++jb)
322
for (j = 0; j < n; ++j) {
323
trigreal c = 2.0 * t(1, j + k0, n0);
325
for (ja = 0; ja < na; ++ja) {
326
int k = (jb * n + j) * na + ja;
332
/* A = alpha * A (real, in place) */
333
static void rascale(R *a, R alpha, int n)
337
for (i = 0; i < n; ++i) {
346
/* copy real A into real B, using output stride of A and input stride of B */
353
static void cpyr0(dotens2_closure *k_,
354
int indxa, int ondxa, int indxb, int ondxb)
356
cpyr_closure *k = (cpyr_closure *)k_;
357
k->rb[indxb] = k->ra[ondxa];
358
UNUSED(indxa); UNUSED(ondxb);
361
static void cpyr(R *ra, bench_tensor *sza, R *rb, bench_tensor *szb)
365
k.ra = ra; k.rb = rb;
366
bench_dotens2(sza, szb, &k.k);
369
static void dofft(info *nfo, R *in, R *out)
371
cpyr(in, nfo->pckdsz, (R *) nfo->p->in, nfo->totalsz);
373
cpyr((R *) nfo->p->out, nfo->totalsz, out, nfo->pckdsz);
376
static double racmp(R *a, R *b, int n, const char *test, double tol)
378
double d = raerror(a, b, n);
380
fprintf(stderr, "Found relative error %e (%s)\n", d, test);
383
for (i = 0; i < n; ++i)
384
fprintf(stderr, "%8d %16.12f %16.12f\n", i,
393
/***********************************************************************/
396
int n; /* physical size */
397
int n0; /* "logical" transform size */
398
int i0, k0; /* shifts of input/output */
399
trigfun ti, ts; /* impulse/shift trig functions */
402
static void impulse_response(int rnk, dim_stuff *d, R impulse_amp,
410
for (i = 0; i < d->n; ++i) {
411
impulse_response(rnk - 1, d + 1,
412
impulse_amp * d->ti(d->i0, d->k0 + i, d->n0),
418
/***************************************************************************/
421
* Implementation of the FFT tester described in
423
 * Funda Ergün. Testing multivariate linear functions: Overcoming the
424
* generator bottleneck. In Proceedings of the Twenty-Seventh Annual
425
* ACM Symposium on the Theory of Computing, pages 407-416, Las Vegas,
426
* Nevada, 29 May--1 June 1995.
428
* Also: F. Ergun, S. R. Kumar, and D. Sivakumar, "Self-testing without
429
* the generator bottleneck," SIAM J. on Computing 29 (5), 1630-51 (2000).
432
static double rlinear(int n, info *nfo, R *inA, R *inB, R *inC, R *outA,
433
R *outB, R *outC, R *tmp, int rounds, double tol)
438
for (j = 0; j < rounds; ++j) {
444
dofft(nfo, inA, outA);
445
dofft(nfo, inB, outB);
447
rascale(outA, alpha, n);
448
rascale(outB, beta, n);
449
raadd(tmp, outA, outB, n);
450
rascale(inA, alpha, n);
451
rascale(inB, beta, n);
452
raadd(inC, inA, inB, n);
453
dofft(nfo, inC, outC);
455
e = dmax(e, racmp(outC, tmp, n, "linear", tol));
460
static double rimpulse(dim_stuff *d, R impulse_amp,
461
int n, int vecn, info *nfo,
462
R *inA, R *inB, R *inC,
463
R *outA, R *outB, R *outC,
464
R *tmp, int rounds, double tol)
471
/* test 2: check that the unit impulse is transformed properly */
473
for (i = 0; i < N; ++i) {
477
for (i = 0; i < vecn; ++i) {
478
inA[i * n] = (i+1) / (double)(vecn+1);
480
/* transform of the pls */
481
impulse_response(nfo->probsz->rnk, d, impulse_amp * inA[i * n],
485
dofft(nfo, inA, tmp);
486
e = dmax(e, racmp(tmp, outA, N, "impulse 1", tol));
488
for (j = 0; j < rounds; ++j) {
490
rasub(inC, inA, inB, N);
491
dofft(nfo, inB, outB);
492
dofft(nfo, inC, outC);
493
raadd(tmp, outB, outC, N);
494
e = dmax(e, racmp(tmp, outA, N, "impulse", tol));
499
static double t_shift(int n, int vecn, info *nfo,
500
R *inA, R *inB, R *outA, R *outB, R *tmp,
501
int rounds, double tol,
505
int nb, na, dim, N = n * vecn;
507
bench_tensor *sz = nfo->probsz;
509
/* test 3: check the time-shift property */
510
/* the paper performs more tests, but this code should be fine too */
515
/* check shifts across all SZ dimensions */
516
for (dim = 0; dim < sz->rnk; ++dim) {
517
int ncur = sz->dims[dim].n;
521
for (j = 0; j < rounds; ++j) {
524
for (i = 0; i < vecn; ++i) {
525
rarolr(inB + i * n, inA + i*n, ncur, nb,na,
528
dofft(nfo, inA, outA);
529
dofft(nfo, inB, outB);
530
for (i = 0; i < vecn; ++i)
531
raphase_shift(tmp + i * n, outA + i * n, ncur,
532
nb, na, d[dim].n0, d[dim].k0, d[dim].ts);
533
e = dmax(e, racmp(tmp, outB, N, "time shift", tol));
541
/***********************************************************************/
543
void verify_r2r(bench_problem *p, int rounds, double tol, errors *e)
545
R *inA, *inB, *inC, *outA, *outB, *outC, *tmp;
548
double impulse_amp = 1.0;
553
rounds = 20; /* default value */
555
n = tensor_sz(p->sz);
556
vecn = tensor_sz(p->vecsz);
559
d = (dim_stuff *) bench_malloc(sizeof(dim_stuff) * p->sz->rnk);
560
for (i = 0; i < p->sz->rnk; ++i) {
564
d[i].n = n0 = p->sz->dims[i].n;
565
if (p->k[i] > R2R_DHT)
566
n0 = 2 * (n0 + (p->k[i] == R2R_REDFT00 ? -1 :
567
(p->k[i] == R2R_RODFT00 ? 1 : 0)));
595
ti = cos10; impulse_amp *= 2.0;
600
ti = cos11; impulse_amp *= 2.0;
605
ti = sin00; impulse_amp *= 2.0;
610
ti = sin01; impulse_amp *= n == 1 ? 1.0 : 2.0;
615
ti = sin10; impulse_amp *= 2.0;
620
ti = sin11; impulse_amp *= 2.0;
636
inA = (R *) bench_malloc(N * sizeof(R));
637
inB = (R *) bench_malloc(N * sizeof(R));
638
inC = (R *) bench_malloc(N * sizeof(R));
639
outA = (R *) bench_malloc(N * sizeof(R));
640
outB = (R *) bench_malloc(N * sizeof(R));
641
outC = (R *) bench_malloc(N * sizeof(R));
642
tmp = (R *) bench_malloc(N * sizeof(R));
646
nfo.totalsz = tensor_append(p->vecsz, nfo.probsz);
647
nfo.pckdsz = verify_pack(nfo.totalsz, 1);
648
nfo.pckdvecsz = verify_pack(p->vecsz, tensor_sz(nfo.probsz));
650
e->i = rimpulse(d, impulse_amp, n, vecn, &nfo,
651
inA, inB, inC, outA, outB, outC, tmp, rounds, tol);
652
e->l = rlinear(N, &nfo, inA, inB, inC, outA, outB, outC, tmp, rounds,tol);
653
e->s = t_shift(n, vecn, &nfo, inA, inB, outA, outB, tmp,
656
/* grr, verify-lib.c:preserves_input() only works for complex */
657
if (!p->in_place && !p->destroy_input) {
658
bench_tensor *totalsz_swap, *pckdsz_swap;
659
totalsz_swap = tensor_copy_swapio(nfo.totalsz);
660
pckdsz_swap = tensor_copy_swapio(nfo.pckdsz);
662
for (i = 0; i < rounds; ++i) {
664
dofft(&nfo, inA, outB);
665
cpyr((R *) nfo.p->in, totalsz_swap, inB, pckdsz_swap);
666
racmp(inB, inA, N, "preserves_input", 0.0);
669
tensor_destroy(totalsz_swap);
670
tensor_destroy(pckdsz_swap);
673
tensor_destroy(nfo.totalsz);
674
tensor_destroy(nfo.pckdsz);
675
tensor_destroy(nfo.pckdvecsz);
693
/* Copy n reals from in (stride is) to out (stride os), scaling each by scale. */
static void cpyr1(int n, R *in, int is, R *out, int os, R scale)
{
     int i;
     for (i = 0; i < n; ++i)
	  out[i * os] = in[i * is] * scale;
}
700
/* Make component c of complex array a even-symmetric: a[n-i] = a[i]
   for 0 < i < n/2 (endpoints untouched). */
static void mke00(C *a, int n, int c)
{
     int i;
     for (i = 1; i + i < n; ++i)
	  a[n - i][c] = a[i][c];
}
707
static void mkre00(C *a, int n)
713
static void mkimag(C *a, int n)
716
for (i = 0; i < n; ++i)
720
/* Make component c of complex array a odd-symmetric: a[n-i] = -a[i]
   for 0 < i < n/2 (endpoints untouched). */
static void mko00(C *a, int n, int c)
{
     int i;
     for (i = 1; i + i < n; ++i)
	  a[n - i][c] = -a[i][c];
}
730
static void mkro00(C *a, int n)
736
static void mkio00(C *a, int n)
742
static void mkre01(C *a, int n) /* n should be be multiple of 4 */
747
c_re(a[n/2]) = -(c_re(a[0]) = a0);
751
static void mkro01(C *a, int n) /* n should be be multiple of 4 */
753
c_re(a[0]) = c_im(a[0]) = 0.0;
758
/* Zero out every even-indexed element of complex array a (both real
   and imaginary parts), leaving only odd-indexed entries nonzero. */
static void mkoddonly(C *a, int n)
{
     int i;
     for (i = 0; i < n; i += 2)
	  c_re(a[i]) = c_im(a[i]) = 0.0;
}
765
static void mkre10(C *a, int n)
771
static void mkio10(C *a, int n)
777
static void mkre11(C *a, int n)
784
static void mkro11(C *a, int n)
791
static void mkio11(C *a, int n)
798
static void r2r_apply(dofft_closure *k_, bench_complex *in, bench_complex *out)
800
dofft_r2r_closure *k = (dofft_r2r_closure *)k_;
801
bench_problem *p = k->p;
805
n = p->sz->dims[0].n;
806
is = p->sz->dims[0].is;
807
os = p->sz->dims[0].os;
809
ri = (bench_real *) p->in;
810
ro = (bench_real *) p->out;
814
cpyr1(n, &c_re(in[0]), 2, ri, is, 1.0);
817
cpyr1(n/2 + 1, &c_re(in[0]), 2, ri, is, 1.0);
818
cpyr1((n+1)/2 - 1, &c_im(in[n-1]), -2, ri + is*(n-1), -is, 1.0);
821
cpyr1(n, &c_re(in[0]), 2, ri, is, 1.0);
824
cpyr1(n, &c_re(in[1]), 2, ri, is, 1.0);
827
cpyr1(n, &c_re(in[0]), 2, ri, is, 1.0);
830
cpyr1(n, &c_re(in[1]), 4, ri, is, 1.0);
833
cpyr1(n, &c_re(in[1]), 2, ri, is, 1.0);
836
cpyr1(n, &c_im(in[1]), 4, ri, is, 1.0);
839
cpyr1(n, &c_re(in[1]), 4, ri, is, 1.0);
842
cpyr1(n, &c_re(in[1]), 4, ri, is, 1.0);
845
BENCH_ASSERT(0); /* not yet implemented */
852
if (k->k.recopy_input)
853
cpyr1(n, ri, is, &c_re(in[0]), 2, 1.0);
854
cpyr1(n/2 + 1, ro, os, &c_re(out[0]), 2, 1.0);
855
cpyr1((n+1)/2 - 1, ro + os*(n-1), -os, &c_im(out[1]), 2, 1.0);
858
c_im(out[n/2]) = 0.0;
859
mkhermitian1(out, n);
862
if (k->k.recopy_input) {
863
cpyr1(n/2 + 1, ri, is, &c_re(in[0]), 2, 1.0);
864
cpyr1((n+1)/2 - 1, ri + is*(n-1), -is, &c_im(in[1]), 2,1.0);
866
cpyr1(n, ro, os, &c_re(out[0]), 2, 1.0);
870
if (k->k.recopy_input)
871
cpyr1(n, ri, is, &c_re(in[0]), 2, 1.0);
872
cpyr1(n, ro, os, &c_re(out[0]), 2, 1.0);
876
if (k->k.recopy_input)
877
cpyr1(n, ri, is, &c_im(in[1]), 2, -1.0);
878
cpyr1(n, ro, os, &c_im(out[1]), 2, -1.0);
882
if (k->k.recopy_input)
883
cpyr1(n, ri, is, &c_re(in[0]), 2, 1.0);
884
cpyr1(n, ro, os, &c_re(out[1]), 4, 2.0);
888
if (k->k.recopy_input)
889
cpyr1(n, ri, is, &c_re(in[1]), 4, 2.0);
890
cpyr1(n, ro, os, &c_re(out[0]), 2, 1.0);
894
if (k->k.recopy_input)
895
cpyr1(n, ri, is, &c_re(in[1]), 2, 1.0);
896
cpyr1(n, ro, os, &c_im(out[1]), 4, -2.0);
900
if (k->k.recopy_input)
901
cpyr1(n, ri, is, &c_im(in[1]), 4, -2.0);
902
cpyr1(n, ro, os, &c_re(out[1]), 2, 1.0);
906
if (k->k.recopy_input)
907
cpyr1(n, ri, is, &c_re(in[1]), 4, 2.0);
908
cpyr1(n, ro, os, &c_re(out[1]), 4, 2.0);
912
if (k->k.recopy_input)
913
cpyr1(n, ri, is, &c_im(in[1]), 4, -2.0);
914
cpyr1(n, ro, os, &c_im(out[1]), 4, -2.0);
918
BENCH_ASSERT(0); /* not yet implemented */
922
void accuracy_r2r(bench_problem *p, int rounds, int impulse_rounds,
928
aconstrain constrain = 0;
930
BENCH_ASSERT(p->kind == PROBLEM_R2R);
931
BENCH_ASSERT(p->sz->rnk == 1);
932
BENCH_ASSERT(p->vecsz->rnk == 0);
934
k.k.apply = r2r_apply;
935
k.k.recopy_input = 0;
937
n = tensor_sz(p->sz);
940
case R2R_R2HC: constrain = mkreal; n0 = n; break;
941
case R2R_HC2R: constrain = mkhermitian1; n0 = n; break;
942
case R2R_REDFT00: constrain = mkre00; n0 = 2*(n-1); break;
943
case R2R_RODFT00: constrain = mkro00; n0 = 2*(n+1); break;
944
case R2R_REDFT01: constrain = mkre01; n0 = 4*n; break;
945
case R2R_REDFT10: constrain = mkre10; n0 = 4*n; break;
946
case R2R_RODFT01: constrain = mkro01; n0 = 4*n; break;
947
case R2R_RODFT10: constrain = mkio10; n0 = 4*n; break;
948
case R2R_REDFT11: constrain = mkre11; n0 = 8*n; break;
949
case R2R_RODFT11: constrain = mkro11; n0 = 8*n; break;
950
default: BENCH_ASSERT(0); /* not yet implemented */
954
a = (C *) bench_malloc(n0 * sizeof(C));
955
b = (C *) bench_malloc(n0 * sizeof(C));
956
accuracy_test(&k.k, constrain, -1, n0, a, b, rounds, impulse_rounds, t);