2
* Copyright (c) 2003, 2006 Matteo Frigo
3
* Copyright (c) 2003, 2006 Massachusetts Institute of Technology
5
* This program is free software; you can redistribute it and/or modify
6
* it under the terms of the GNU General Public License as published by
7
* the Free Software Foundation; either version 2 of the License, or
8
* (at your option) any later version.
10
* This program is distributed in the hope that it will be useful,
11
* but WITHOUT ANY WARRANTY; without even the implied warranty of
12
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
* GNU General Public License for more details.
15
* You should have received a copy of the GNU General Public License
16
* along with this program; if not, write to the Free Software
17
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21
/* $Id: ifftw.h,v 1.281 2006-02-10 14:18:39 athena Exp $ */
23
/* FFTW internal header file */
29
#include <stdlib.h> /* size_t */
30
#include <stdarg.h> /* va_list */
31
#include <stddef.h> /* ptrdiff_t */
34
# include <sys/types.h>
38
#if defined(_WIN32) && !defined(FREE_WINDOWS)
40
#ifndef _INTPTR_T_DEFINED
42
typedef __int64 intptr_t;
44
typedef long intptr_t;
46
#define _INTPTR_T_DEFINED
49
#ifndef _UINTPTR_T_DEFINED
51
typedef unsigned __int64 uintptr_t;
53
typedef unsigned long uintptr_t;
55
#define _UINTPTR_T_DEFINED
58
#elif defined(__linux__)
60
/* Linux-i386, Linux-Alpha, Linux-ppc */
63
#elif defined (__APPLE__)
67
#elif defined(FREE_WINDOWS)
73
/* FreeBSD, Irix, Solaris */
74
#include <sys/types.h>
76
#endif /* ifdef platform for types */
79
/* Windows annoyances -- since tests/hook.c uses some internal
80
FFTW functions, we need to given them the dllexport attribute
81
under Windows when compiling as a DLL (see api/fftw3.h). */
82
#if defined(FFTW_EXTERN)
83
# define IFFTW_EXTERN FFTW_EXTERN
84
#elif (defined(FFTW_DLL) || defined(DLL_EXPORT)) \
85
&& (defined(_WIN32) || defined(__WIN32__))
86
# define IFFTW_EXTERN extern __declspec(dllexport)
88
# define IFFTW_EXTERN extern
91
/* determine precision and name-mangling scheme */
92
#define CONCAT(prefix, name) prefix ## name
93
#if defined(FFTW_SINGLE)
95
# define X(name) CONCAT(fftwf_, name)
96
#elif defined(FFTW_LDOUBLE)
97
typedef long double R;
98
# define X(name) CONCAT(fftwl_, name)
99
# define TRIGREAL_IS_LONG_DOUBLE
102
# define X(name) CONCAT(fftw_, name)
106
integral type large enough to contain a stride (what ``int'' should
107
have been in the first place.
109
typedef ptrdiff_t INT;
111
/* dummy use of unused parameters to silence compiler warnings */
112
#define UNUSED(x) (void)x
114
#define FFT_SIGN (-1) /* sign convention for forward transforms */
116
#define REGISTER_SOLVER(p, s) X(solver_register)(p, s)
118
#define STRINGIZEx(x) #x
119
#define STRINGIZE(x) STRINGIZEx(x)
125
#if defined(HAVE_SSE) || defined(HAVE_SSE2) || defined(HAVE_ALTIVEC)
131
/* forward declarations */
132
typedef struct problem_s problem;
133
typedef struct plan_s plan;
134
typedef struct solver_s solver;
135
typedef struct planner_s planner;
136
typedef struct printer_s printer;
137
typedef struct scanner_s scanner;
139
/*-----------------------------------------------------------------------*/
142
#define MIN_ALIGNMENT 16
146
/* use alloca if available */
150
# define alloca __builtin_alloca
154
# define alloca _alloca
162
# ifndef alloca /* predefined by HP cc +Olibcalls */
163
void *alloca(size_t);
171
# ifdef MIN_ALIGNMENT
172
# define STACK_MALLOC(T, p, x) \
174
p = (T)alloca((x) + MIN_ALIGNMENT); \
175
p = (T)(((uintptr_t)p + (MIN_ALIGNMENT - 1)) & \
176
(~(uintptr_t)(MIN_ALIGNMENT - 1))); \
178
# define STACK_FREE(x)
179
# else /* HAVE_ALLOCA && !defined(MIN_ALIGNMENT) */
180
# define STACK_MALLOC(T, p, x) p = (T)alloca(x)
181
# define STACK_FREE(x)
184
#else /* ! HAVE_ALLOCA */
185
/* use malloc instead of alloca */
186
# define STACK_MALLOC(T, p, x) p = (T)MALLOC(x, OTHER)
187
# define STACK_FREE(x) X(ifree)(x)
188
#endif /* ! HAVE_ALLOCA */
190
/*-----------------------------------------------------------------------*/
191
/* define uintptr_t if it is not already defined */
193
#ifndef HAVE_UINTPTR_T
194
# if SIZEOF_VOID_P == 0
195
# error sizeof void* is unknown!
196
# elif SIZEOF_UNSIGNED_INT == SIZEOF_VOID_P
197
typedef unsigned int uintptr_t;
198
# elif SIZEOF_UNSIGNED_LONG == SIZEOF_VOID_P
199
typedef unsigned long uintptr_t;
200
# elif SIZEOF_UNSIGNED_LONG_LONG == SIZEOF_VOID_P
201
typedef unsigned long long uintptr_t;
203
# error no unsigned integer type matches void* sizeof!
207
/*-----------------------------------------------------------------------*/
208
/* We can do an optimization for copying pairs of (aligned) floats
209
when in single precision if 2*float = double. */
211
#define FFTW_2R_IS_DOUBLE (defined(FFTW_SINGLE) \
212
&& SIZEOF_FLOAT != 0 \
213
&& SIZEOF_DOUBLE == 2*SIZEOF_FLOAT)
215
#define DOUBLE_ALIGNED(p) ((((uintptr_t)(p)) % sizeof(double)) == 0)
217
/*-----------------------------------------------------------------------*/
219
IFFTW_EXTERN void X(assertion_failed)(const char *s,
220
int line, const char *file);
224
(void)((ex) || (X(assertion_failed)(#ex, __LINE__, __FILE__), 0))
227
/* check only if debug enabled */
229
(void)((ex) || (X(assertion_failed)(#ex, __LINE__, __FILE__), 0))
231
#define A(ex) /* nothing */
234
extern void X(debug)(const char *format, ...);
237
/*-----------------------------------------------------------------------*/
239
extern void *X(kernel_malloc)(size_t n);
240
extern void X(kernel_free)(void *p);
242
/*-----------------------------------------------------------------------*/
245
/* objects allocated by malloc, for statistical purposes */
259
MALLOC_WHAT_LAST /* must be last */
262
IFFTW_EXTERN void X(ifree)(void *ptr);
263
extern void X(ifree0)(void *ptr);
265
#ifdef FFTW_DEBUG_MALLOC
267
IFFTW_EXTERN void *X(malloc_debug)(size_t n, enum malloc_tag what,
268
const char *file, int line);
269
#define MALLOC(n, what) X(malloc_debug)(n, what, __FILE__, __LINE__)
270
#define NATIVE_MALLOC(n, what) MALLOC(n, what)
271
IFFTW_EXTERN void X(malloc_print_minfo)(int vrbose);
273
#else /* ! FFTW_DEBUG_MALLOC */
275
IFFTW_EXTERN void *X(malloc_plain)(size_t sz);
276
#define MALLOC(n, what) X(malloc_plain)(n)
277
#define NATIVE_MALLOC(n, what) malloc(n)
281
#if defined(FFTW_DEBUG) && defined(FFTW_DEBUG_MALLOC) && defined(HAVE_THREADS)
282
extern int X(in_thread);
283
# define IN_THREAD X(in_thread)
284
# define THREAD_ON { int in_thread_save = X(in_thread); X(in_thread) = 1
285
# define THREAD_OFF X(in_thread) = in_thread_save; }
292
/*-----------------------------------------------------------------------*/
293
/* low-resolution clock */
295
#if TIME_WITH_SYS_TIME
296
# include <sys/time.h>
300
# include <sys/time.h>
306
#ifdef HAVE_BSDGETTIMEOFDAY
307
#ifndef HAVE_GETTIMEOFDAY
308
#define gettimeofday BSDgettimeofday
309
#define HAVE_GETTIMEOFDAY 1
313
#if defined(HAVE_GETTIMEOFDAY)
314
typedef struct timeval crude_time;
316
typedef clock_t crude_time;
319
crude_time X(get_crude_time)(void);
320
double X(elapsed_since)(crude_time t0); /* time in seconds since t0 */
322
/*-----------------------------------------------------------------------*/
325
* ops counter. The total number of additions is add + fma
326
* and the total number of multiplications is mul + fma.
327
* Total flops = add + mul + 2 * fma
336
void X(ops_zero)(opcnt *dst);
337
void X(ops_other)(INT o, opcnt *dst);
338
void X(ops_cpy)(const opcnt *src, opcnt *dst);
340
void X(ops_add)(const opcnt *a, const opcnt *b, opcnt *dst);
341
void X(ops_add2)(const opcnt *a, opcnt *dst);
343
/* dst = m * a + b */
344
void X(ops_madd)(INT m, const opcnt *a, const opcnt *b, opcnt *dst);
347
void X(ops_madd2)(INT m, const opcnt *a, opcnt *dst);
350
/*-----------------------------------------------------------------------*/
352
INT X(imax)(INT a, INT b);
353
INT X(imin)(INT a, INT b);
355
/*-----------------------------------------------------------------------*/
360
#define IABS(x) (((x) < 0) ? (0 - (x)) : (x))
362
/*-----------------------------------------------------------------------*/
365
#if SIZEOF_UNSIGNED_INT >= 4
366
typedef unsigned int md5uint;
368
typedef unsigned long md5uint; /* at least 32 bits as per C standard */
371
typedef md5uint md5sig[4];
374
md5sig s; /* state and signature */
376
/* fields not meant to be used outside md5.c: */
377
unsigned char c[64]; /* stuff not yet processed */
378
unsigned l; /* total length. Should be 64 bits long, but this is
379
good enough for us */
382
void X(md5begin)(md5 *p);
383
void X(md5putb)(md5 *p, const void *d_, size_t len);
384
void X(md5puts)(md5 *p, const char *s);
385
void X(md5putc)(md5 *p, unsigned char c);
386
void X(md5int)(md5 *p, int i);
387
void X(md5INT)(md5 *p, INT i);
388
void X(md5unsigned)(md5 *p, unsigned i);
389
void X(md5end)(md5 *p);
391
/*-----------------------------------------------------------------------*/
393
#define STRUCT_HACK_KR
394
#undef STRUCT_HACK_C99
398
INT is; /* input stride */
399
INT os; /* output stride */
404
#if defined(STRUCT_HACK_KR)
406
#elif defined(STRUCT_HACK_C99)
414
Definition of rank -infinity.
415
This definition has the property that if you want rank 0 or 1,
416
you can simply test for rank <= 1. This is a common case.
418
A tensor of rank -infinity has size 0.
420
#define RNK_MINFTY ((int)(((unsigned) -1) >> 1))
421
#define FINITE_RNK(rnk) ((rnk) != RNK_MINFTY)
423
typedef enum { INPLACE_IS, INPLACE_OS } inplace_kind;
425
tensor *X(mktensor)(int rnk);
426
tensor *X(mktensor_0d)(void);
427
tensor *X(mktensor_1d)(INT n, INT is, INT os);
428
tensor *X(mktensor_2d)(INT n0, INT is0, INT os0,
429
INT n1, INT is1, INT os1);
430
tensor *X(mktensor_3d)(INT n0, INT is0, INT os0,
431
INT n1, INT is1, INT os1,
432
INT n2, INT is2, INT os2);
433
INT X(tensor_sz)(const tensor *sz);
434
void X(tensor_md5)(md5 *p, const tensor *t);
435
INT X(tensor_max_index)(const tensor *sz);
436
INT X(tensor_min_istride)(const tensor *sz);
437
INT X(tensor_min_ostride)(const tensor *sz);
438
INT X(tensor_min_stride)(const tensor *sz);
439
int X(tensor_inplace_strides)(const tensor *sz);
440
int X(tensor_inplace_strides2)(const tensor *a, const tensor *b);
441
int X(tensor_strides_decrease)(const tensor *sz, const tensor *vecsz,
443
tensor *X(tensor_copy)(const tensor *sz);
444
int X(tensor_kosherp)(const tensor *x);
446
tensor *X(tensor_copy_inplace)(const tensor *sz, inplace_kind k);
447
tensor *X(tensor_copy_except)(const tensor *sz, int except_dim);
448
tensor *X(tensor_copy_sub)(const tensor *sz, int start_dim, int rnk);
449
tensor *X(tensor_compress)(const tensor *sz);
450
tensor *X(tensor_compress_contiguous)(const tensor *sz);
451
tensor *X(tensor_append)(const tensor *a, const tensor *b);
452
void X(tensor_split)(const tensor *sz, tensor **a, int a_rnk, tensor **b);
453
int X(tensor_tornk1)(const tensor *t, INT *n, INT *is, INT *os);
454
void X(tensor_destroy)(tensor *sz);
455
void X(tensor_destroy2)(tensor *a, tensor *b);
456
void X(tensor_destroy4)(tensor *a, tensor *b, tensor *c, tensor *d);
457
void X(tensor_print)(const tensor *sz, printer *p);
458
int X(dimcmp)(const iodim *a, const iodim *b);
460
/*-----------------------------------------------------------------------*/
471
void (*hash) (const problem *ego, md5 *p);
472
void (*zero) (const problem *ego);
473
void (*print) (problem *ego, printer *p);
474
void (*destroy) (problem *ego);
478
const problem_adt *adt;
481
problem *X(mkproblem)(size_t sz, const problem_adt *adt);
482
void X(problem_destroy)(problem *ego);
484
/*-----------------------------------------------------------------------*/
487
void (*print)(printer *p, const char *format, ...);
488
void (*vprint)(printer *p, const char *format, va_list ap);
489
void (*putchr)(printer *p, char c);
490
void (*cleanup)(printer *p);
495
printer *X(mkprinter)(size_t size,
496
void (*putchr)(printer *p, char c),
497
void (*cleanup)(printer *p));
498
IFFTW_EXTERN void X(printer_destroy)(printer *p);
500
/*-----------------------------------------------------------------------*/
503
int (*scan)(scanner *sc, const char *format, ...);
504
int (*vscan)(scanner *sc, const char *format, va_list ap);
505
int (*getchr)(scanner *sc);
509
scanner *X(mkscanner)(size_t size, int (*getchr)(scanner *sc));
510
void X(scanner_destroy)(scanner *sc);
512
/*-----------------------------------------------------------------------*/
523
void (*solve)(const plan *ego, const problem *p);
524
void (*awake)(plan *ego, enum wakefulness wakefulness);
525
void (*print)(const plan *ego, printer *p);
526
void (*destroy)(plan *ego);
533
enum wakefulness wakefulness; /* used for debugging only */
534
int could_prune_now_p;
537
plan *X(mkplan)(size_t size, const plan_adt *adt);
538
void X(plan_destroy_internal)(plan *ego);
539
IFFTW_EXTERN void X(plan_awake)(plan *ego, enum wakefulness wakefulness);
540
void X(plan_null_destroy)(plan *ego);
542
/*-----------------------------------------------------------------------*/
546
plan *(*mkplan)(const solver *ego, const problem *p, planner *plnr);
550
const solver_adt *adt;
554
solver *X(mksolver)(size_t size, const solver_adt *adt);
555
void X(solver_use)(solver *ego);
556
void X(solver_destroy)(solver *ego);
557
void X(solver_register)(planner *plnr, solver *s);
560
#define MKSOLVER(type, adt) (type *)X(mksolver)(sizeof(type), adt)
562
/*-----------------------------------------------------------------------*/
565
typedef struct slvdesc_s {
570
int next_for_same_problem_kind;
573
typedef struct solution_s solution; /* opaque */
575
/* interpretation of L and U:
577
- if it returns a plan, the planner guarantees that all applicable
578
plans at least as impatient as U have been tried, and that each
579
plan in the solution is at least as impatient as L.
581
- if it returns 0, the planner guarantees to have tried all solvers
582
at least as impatient as L, and that none of them was applicable.
584
The structure is packed to fit into 64 bits.
589
unsigned hash_info:3;
590
# define BITS_FOR_TIMELIMIT 9
591
unsigned timelimit_impatience:BITS_FOR_TIMELIMIT;
594
/* abstraction break: we store the solver here to pad the
595
structure to 64 bits. Otherwise, the struct is padded to 64
596
bits anyway, and another word is allocated for slvndx. */
597
# define BITS_FOR_SLVNDX 12
598
unsigned slvndx:BITS_FOR_SLVNDX;
601
/* impatience flags */
603
BELIEVE_PCOST = 0x0001,
605
NO_DFT_R2HC = 0x0004,
607
NO_VRECURSE = 0x0010,
608
NO_INDIRECT_OP = 0x0020,
609
NO_LARGE_GENERIC = 0x0040,
610
NO_RANK_SPLITS = 0x0080,
611
NO_VRANK_SPLITS = 0x0100,
612
NO_NONTHREADED = 0x0200,
613
NO_BUFFERING = 0x0400,
614
NO_FIXED_RADIX_LARGE_N = 0x0800,
615
NO_DESTROY_INPUT = 0x1000,
617
CONSERVE_MEMORY = 0x4000,
618
NO_DHT_R2HC = 0x8000,
620
ALLOW_PRUNING = 0x20000
623
/* hashtable information */
625
BLESSING = 0x1, /* save this entry */
626
H_VALID = 0x2, /* valid hastable entry */
627
H_LIVE = 0x4 /* entry is nonempty, implies H_VALID */
630
#define PLNR_L(plnr) ((plnr)->flags.l)
631
#define PLNR_U(plnr) ((plnr)->flags.u)
632
#define PLNR_TIMELIMIT_IMPATIENCE(plnr) ((plnr)->flags.timelimit_impatience)
634
#define ESTIMATEP(plnr) (PLNR_U(plnr) & ESTIMATE)
635
#define BELIEVE_PCOSTP(plnr) (PLNR_U(plnr) & BELIEVE_PCOST)
636
#define ALLOW_PRUNINGP(plnr) (PLNR_U(plnr) & ALLOW_PRUNING)
638
#define NO_INDIRECT_OP_P(plnr) (PLNR_L(plnr) & NO_INDIRECT_OP)
639
#define NO_LARGE_GENERICP(plnr) (PLNR_L(plnr) & NO_LARGE_GENERIC)
640
#define NO_RANK_SPLITSP(plnr) (PLNR_L(plnr) & NO_RANK_SPLITS)
641
#define NO_VRANK_SPLITSP(plnr) (PLNR_L(plnr) & NO_VRANK_SPLITS)
642
#define NO_VRECURSEP(plnr) (PLNR_L(plnr) & NO_VRECURSE)
643
#define NO_DFT_R2HCP(plnr) (PLNR_L(plnr) & NO_DFT_R2HC)
644
#define NO_SLOWP(plnr) (PLNR_L(plnr) & NO_SLOW)
645
#define NO_UGLYP(plnr) (PLNR_L(plnr) & NO_UGLY)
646
#define NO_FIXED_RADIX_LARGE_NP(plnr) \
647
(PLNR_L(plnr) & NO_FIXED_RADIX_LARGE_N)
648
#define NO_NONTHREADEDP(plnr) \
649
((PLNR_L(plnr) & NO_NONTHREADED) && (plnr)->nthr > 1)
651
#define NO_DESTROY_INPUTP(plnr) (PLNR_L(plnr) & NO_DESTROY_INPUT)
652
#define NO_SIMDP(plnr) (PLNR_L(plnr) & NO_SIMD)
653
#define CONSERVE_MEMORYP(plnr) (PLNR_L(plnr) & CONSERVE_MEMORY)
654
#define NO_DHT_R2HCP(plnr) (PLNR_L(plnr) & NO_DHT_R2HC)
655
#define NO_BUFFERINGP(plnr) (PLNR_L(plnr) & NO_BUFFERING)
657
typedef enum { FORGET_ACCURSED, FORGET_EVERYTHING } amnesia;
660
/* WISDOM_NORMAL: planner may or may not use wisdom */
663
/* WISDOM_ONLY: planner must use wisdom and must avoid searching */
666
/* WISDOM_IS_BOGUS: planner must return 0 as quickly as possible */
669
/* WISDOM_IGNORE_INFEASIBLE: planner ignores infeasible wisdom */
670
WISDOM_IGNORE_INFEASIBLE,
672
/* WISDOM_IGNORE_ALL: planner ignores all */
677
void (*register_solver)(planner *ego, solver *s);
678
plan *(*mkplan)(planner *ego, problem *p);
679
void (*forget)(planner *ego, amnesia a);
680
void (*exprt)(planner *ego, printer *p); /* ``export'' is a reserved
682
int (*imprt)(planner *ego, scanner *sc);
685
/* hash table of solutions */
688
unsigned hashsiz, nelem;
691
int lookup, succ_lookup, lookup_iter;
692
int insert, insert_iter, insert_unknown;
697
const planner_adt *adt;
698
void (*hook)(struct planner_s *plnr, plan *pln,
699
const problem *p, int optimalp);
701
/* solver descriptors */
703
unsigned nslvdesc, slvdescsiz;
704
const char *cur_reg_nam;
706
int slvdescs_for_problem_kind[PROBLEM_LAST];
708
wisdom_state_t wisdom_state;
710
hashtab htab_blessed;
711
hashtab htab_unblessed;
716
crude_time start_time;
717
double timelimit; /* elapsed_since(start_time) at which to bail out */
718
int timed_out; /* whether most recent search timed out */
719
int need_timeout_check;
721
/* various statistics */
722
int nplan; /* number of plans evaluated */
723
double pcost, epcost; /* total pcost of measured/estimated plans */
724
int nprob; /* number of problems evaluated */
727
planner *X(mkplanner)(void);
728
void X(planner_destroy)(planner *ego);
731
Iterate over all solvers. Read:
733
@article{ baker93iterators,
734
author = "Henry G. Baker, Jr.",
735
title = "Iterators: Signs of Weakness in Object-Oriented Languages",
736
journal = "{ACM} {OOPS} Messenger",
742
#define FORALL_SOLVERS(ego, s, p, what) \
745
for (_cnt = 0; _cnt < ego->nslvdesc; ++_cnt) { \
746
slvdesc *p = ego->slvdescs + _cnt; \
747
solver *s = p->slv; \
752
#define FORALL_SOLVERS_OF_KIND(kind, ego, s, p, what) \
754
int _cnt = ego->slvdescs_for_problem_kind[kind]; \
755
while (_cnt >= 0) { \
756
slvdesc *p = ego->slvdescs + _cnt; \
757
solver *s = p->slv; \
759
_cnt = p->next_for_same_problem_kind; \
764
/* make plan, destroy problem */
765
plan *X(mkplan_d)(planner *ego, problem *p);
766
plan *X(mkplan_f_d)(planner *ego, problem *p,
767
unsigned l_set, unsigned u_set, unsigned u_reset);
769
/*-----------------------------------------------------------------------*/
772
/* If PRECOMPUTE_ARRAY_INDICES is defined, precompute all strides. */
773
#if (defined(__i386__) || defined(__x86_64__) || _M_IX86 >= 500) && !HAVE_K7 && !defined(FFTW_LDOUBLE)
774
#define PRECOMPUTE_ARRAY_INDICES
777
#ifdef PRECOMPUTE_ARRAY_INDICES
779
#define WS(stride, i) (stride[i])
780
extern stride X(mkstride)(INT n, INT s);
781
void X(stride_destroy)(stride p);
782
#define MAKE_VOLATILE_STRIDE(x) (void)0 /* a no-op in expession context */
786
#define WS(stride, i) (stride * i)
787
#define fftwf_mkstride(n, stride) stride
788
#define fftw_mkstride(n, stride) stride
789
#define fftwl_mkstride(n, stride) stride
790
#define fftwf_stride_destroy(p) ((void) p)
791
#define fftw_stride_destroy(p) ((void) p)
792
#define fftwl_stride_destroy(p) ((void) p)
794
/* hackery to prevent the compiler from ``optimizing'' induction
795
variables in codelet loops. */
796
extern const stride X(a_stride_guaranteed_to_be_zero);
797
#define MAKE_VOLATILE_STRIDE(x) (x) = (x) ^ X(a_stride_guaranteed_to_be_zero)
799
#endif /* PRECOMPUTE_ARRAY_INDICES */
801
/*-----------------------------------------------------------------------*/
804
struct solvtab_s { void (*reg)(planner *); const char *reg_nam; };
805
typedef struct solvtab_s solvtab[];
806
void X(solvtab_exec)(const solvtab tbl, planner *p);
807
#define SOLVTAB(s) { s, STRINGIZE(s) }
808
#define SOLVTAB_END { 0, 0 }
810
/*-----------------------------------------------------------------------*/
812
int X(pickdim)(int which_dim, const int *buddies, int nbuddies,
813
const tensor *sz, int oop, int *dp);
815
/*-----------------------------------------------------------------------*/
817
/* little language to express twiddle factors computation */
818
enum { TW_COS = 0, TW_SIN = 1, TW_CEXP = 2, TW_NEXT = 3,
819
TW_FULL = 4, TW_HALF = 5 };
827
typedef struct twid_s {
828
R *W; /* array of twiddle factors */
829
INT n, r, m; /* transform order, radix, # twiddle rows */
831
const tw_instr *instr;
833
enum wakefulness wakefulness;
836
INT X(twiddle_length)(INT r, const tw_instr *p);
837
void X(twiddle_awake)(enum wakefulness wakefulness,
838
twid **pp, const tw_instr *instr, INT n, INT r, INT m);
839
const R *X(twiddle_shift)(const twid *p, INT mstart);
841
/*-----------------------------------------------------------------------*/
843
#ifdef TRIGREAL_IS_LONG_DOUBLE
844
typedef long double trigreal;
846
typedef double trigreal;
849
typedef struct triggen_s triggen;
852
void (*cexp)(triggen *t, INT m, R *result);
853
void (*cexpl)(triggen *t, INT m, trigreal *result);
854
void (*rotate)(triggen *p, INT m, R xr, R xi, R *res);
863
triggen *X(mktriggen)(enum wakefulness wakefulness, INT n);
864
void X(triggen_destroy)(triggen *p);
866
/*-----------------------------------------------------------------------*/
869
#define MULMOD(x, y, p) \
870
(((x) <= 92681 - (y)) ? ((x) * (y)) % (p) : X(safe_mulmod)(x, y, p))
872
INT X(safe_mulmod)(INT x, INT y, INT p);
873
INT X(power_mod)(INT n, INT m, INT p);
874
INT X(find_generator)(INT p);
875
INT X(first_divisor)(INT n);
876
int X(is_prime)(INT n);
877
INT X(next_prime)(INT n);
878
int X(factors_into)(INT n, const INT *primes);
879
INT X(choose_radix)(INT r, INT n);
882
#define GENERIC_MIN_BAD 173 /* min prime for which generic becomes bad */
884
/*-----------------------------------------------------------------------*/
886
typedef struct rader_tls rader_tl;
888
void X(rader_tl_insert)(INT k1, INT k2, INT k3, R *W, rader_tl **tl);
889
R *X(rader_tl_find)(INT k1, INT k2, INT k3, rader_tl *t);
890
void X(rader_tl_delete)(R *W, rader_tl **tl);
892
/*-----------------------------------------------------------------------*/
893
/* copy/transposition routines */
895
/* lower bound to the cache size, for tiled routines */
896
#define CACHESIZE 8192
898
INT X(compute_tilesz)(INT vl, int how_many_tiles_in_cache);
900
void X(tile2d)(INT n0l, INT n0u, INT n1l, INT n1u, INT tilesz,
901
void (*f)(INT n0l, INT n0u, INT n1l, INT n1u, void *args),
903
void X(cpy1d)(R *I, R *O, INT n0, INT is0, INT os0, INT vl);
904
void X(cpy2d)(R *I, R *O,
905
INT n0, INT is0, INT os0,
906
INT n1, INT is1, INT os1,
908
void X(cpy2d_ci)(R *I, R *O,
909
INT n0, INT is0, INT os0,
910
INT n1, INT is1, INT os1,
912
void X(cpy2d_co)(R *I, R *O,
913
INT n0, INT is0, INT os0,
914
INT n1, INT is1, INT os1,
916
void X(cpy2d_tiled)(R *I, R *O,
917
INT n0, INT is0, INT os0,
918
INT n1, INT is1, INT os1,
920
void X(cpy2d_tiledbuf)(R *I, R *O,
921
INT n0, INT is0, INT os0,
922
INT n1, INT is1, INT os1,
924
void X(cpy2d_pair)(R *I0, R *I1, R *O0, R *O1,
925
INT n0, INT is0, INT os0,
926
INT n1, INT is1, INT os1);
927
void X(cpy2d_pair_ci)(R *I0, R *I1, R *O0, R *O1,
928
INT n0, INT is0, INT os0,
929
INT n1, INT is1, INT os1);
930
void X(cpy2d_pair_co)(R *I0, R *I1, R *O0, R *O1,
931
INT n0, INT is0, INT os0,
932
INT n1, INT is1, INT os1);
934
void X(transpose)(R *I, INT n, INT s0, INT s1, INT vl);
935
void X(transpose_tiled)(R *I, INT n, INT s0, INT s1, INT vl);
936
void X(transpose_tiledbuf)(R *I, INT n, INT s0, INT s1, INT vl);
938
typedef void (*transpose_func)(R *I, INT n, INT s0, INT s1, INT vl);
939
typedef void (*cpy2d_func)(R *I, R *O,
940
INT n0, INT is0, INT os0,
941
INT n1, INT is1, INT os1,
944
/*-----------------------------------------------------------------------*/
946
void X(null_awake)(plan *ego, enum wakefulness wakefulness);
947
double X(iestimate_cost)(const plan *pln);
948
double X(measure_execution_time)(plan *pln, const problem *p);
949
int X(alignment_of)(R *p);
950
unsigned X(hash)(const char *s);
951
INT X(compute_nbuf)(INT n, INT vl, INT nbuf, INT maxbufsz);
952
int X(ct_uglyp)(INT min_n, INT n, INT r);
955
R *X(taint)(R *p, INT s);
956
R *X(join_taint)(R *p1, R *p2);
957
#define TAINT(p, s) X(taint)(p, s)
958
#define UNTAINT(p) ((R *) (((uintptr_t) (p)) & ~(uintptr_t)3))
959
#define TAINTOF(p) (((uintptr_t)(p)) & 3)
960
#define JOIN_TAINT(p1, p2) X(join_taint)(p1, p2)
962
#define TAINT(p, s) (p)
963
#define UNTAINT(p) (p)
965
#define JOIN_TAINT(p1, p2) p1
968
#ifdef FFTW_DEBUG_ALIGNMENT
969
# define ASSERT_ALIGNED_DOUBLE { \
971
CK(!(((uintptr_t) &__foo) & 0x7)); \
974
# define ASSERT_ALIGNED_DOUBLE
975
#endif /* FFTW_DEBUG_ALIGNMENT */
979
/*-----------------------------------------------------------------------*/
980
/* macros used in codelets to reduce source code size */
982
typedef R E; /* internal precision of codelets. */
985
# define K(x) ((E) x##L)
987
# define K(x) ((E) x)
989
#define DK(name, value) const E name = K(value)
993
#if defined(__GNUC__) && (defined(__powerpc__) || defined(__ppc__) || defined(_POWER))
994
/* The obvious expression a * b + c does not work. If both x = a * b
995
+ c and y = a * b - c appear in the source, gcc computes t = a * b,
996
x = t + c, y = t - c, thus destroying the fma.
998
This peculiar coding seems to do the right thing on all of
999
gcc-2.95, gcc-3.1, gcc-3.2, and gcc-3.3. It does the right thing
1000
on gcc-3.4 -fno-web (because the ``web'' pass splits the variable
1001
`x' for the single-assignment form).
1003
However, gcc-4.0 is a formidable adversary which succeeds in
1004
pessimizing two fma's into one multiplication and two additions.
1005
It does it very early in the game---before the optimization passes
1006
even start. The only real workaround seems to use fake inline asm
1009
asm ("# confuse gcc %0" : "=f"(a) : "0"(a));
1012
in each of the FMA, FMS, FNMA, and FNMS functions. However, this
1013
does not solve the problem either, because two equal asm statements
1014
count as a common subexpression! One must use *different* fake asm
1018
asm ("# confuse gcc for fma %0" : "=f"(a) : "0"(a));
1021
asm ("# confuse gcc for fms %0" : "=f"(a) : "0"(a));
1025
After these changes, gcc recalcitrantly generates the fma that was
1026
in the source to begin with. However, the extra asm() cruft
1027
confuses other passes of gcc, notably the instruction scheduler.
1028
(Of course, one could also generate the fma directly via inline
1029
asm, but this confuses the scheduler even more.)
1031
Steven and I have submitted more than one bug report to the gcc
1032
mailing list over the past few years, to no effect. Thus, I give
1033
up. gcc-4.0 can go to hell. I'll wait at least until gcc-4.3 is
1034
out before touching this crap again.
1036
static __inline__ E FMA(E a, E b, E c)
1043
static __inline__ E FMS(E a, E b, E c)
1050
static __inline__ E FNMA(E a, E b, E c)
1057
static __inline__ E FNMS(E a, E b, E c)
1064
#define FMA(a, b, c) (((a) * (b)) + (c))
1065
#define FMS(a, b, c) (((a) * (b)) - (c))
1066
#define FNMA(a, b, c) (- (((a) * (b)) + (c)))
1067
#define FNMS(a, b, c) ((c) - ((a) * (b)))
1070
/* stack-alignment hackery */
1071
#ifdef __ICC /* Intel's compiler for ia32 */
1072
#define WITH_ALIGNED_STACK(what) \
1075
* Simply calling alloca seems to do the right thing. \
1076
* The size of the allocated block seems to be irrelevant. \
1083
#if defined(__GNUC__) && defined(__i386__) && !defined(WITH_ALIGNED_STACK) \
1084
&& !(defined(__MACOSX__) || defined(__APPLE__)) /* OSX ABI is aligned */
1086
* horrible hack to align the stack to a 16-byte boundary.
1088
* We assume a gcc version >= 2.95 so that
1089
* -mpreferred-stack-boundary works. Otherwise, all bets are
1090
* off. However, -mpreferred-stack-boundary does not create a
1091
* stack alignment, but it only preserves it. Unfortunately,
1092
* many versions of libc on linux call main() with the wrong
1093
* initial stack alignment, with the result that the code is now
1094
* pessimally aligned instead of having a 50% chance of being
1098
#define WITH_ALIGNED_STACK(what) \
1101
* Use alloca to allocate some memory on the stack. \
1102
* This alerts gcc that something funny is going \
1103
* on, so that it does not omit the frame pointer \
1106
(void)__builtin_alloca(16); \
1109
* Now align the stack pointer \
1111
__asm__ __volatile__ ("andl $-16, %esp"); \
1117
#ifndef WITH_ALIGNED_STACK
1118
#define WITH_ALIGNED_STACK(what) what
1121
#endif /* __IFFTW_H__ */