/*
 * Copyright (c) 2003, 2007-8 Matteo Frigo
 * Copyright (c) 2003, 2007-8 Massachusetts Institute of Technology
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 */
#ifndef FFTW_SINGLE
#error "SSE only works in single precision"
#endif
#define VL 2            /* SIMD complex vector length */
#define ALIGNMENT 8     /* alignment for LD/ST */
#define ALIGNMENTA 16   /* alignment for LDA/STA */
#define SIMD_VSTRIDE_OKA(x) ((x) == 2)
#define SIMD_STRIDE_OKPAIR SIMD_STRIDE_OK

#define RIGHT_CPU X(have_sse)
extern int RIGHT_CPU(void);
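/* Each SIMD vector V holds VL = 2 complex numbers as four packed
   single-precision floats, interleaved as {r0, i0, r1, i1}.  LD/ST
   move one complex number (a 64-bit re/im pair) per memory operand,
   which is why SIMD_VSTRIDE_OKA accepts only a vector stride of 2. */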
/* gcc compiles the following code only when __SSE__ is defined */
#if defined(__SSE__) || !defined(__GNUC__)

/* some versions of glibc's sys/cdefs.h define __inline to be empty,
   which is wrong because xmmintrin.h defines several inline
   procedures */
#undef __inline

#include <xmmintrin.h>

typedef __m128 V;
#define VADD _mm_add_ps
#define VSUB _mm_sub_ps
#define VMUL _mm_mul_ps
#define VXOR _mm_xor_ps
#define SHUFPS _mm_shuffle_ps
#define STOREH(addr, val) _mm_storeh_pi((__m64 *)(addr), val)
#define STOREL(addr, val) _mm_storel_pi((__m64 *)(addr), val)
#define UNPCKH _mm_unpackhi_ps
#define UNPCKL _mm_unpacklo_ps
#ifdef __GNUC__
# define DVK(var, val) const V var = __extension__ ({			\
     static const union fvec _var = { {val, val, val, val} };		\
     _var.v;								\
  })
# define LDK(x) x

/* we use inline asm because gcc generates slow code for
   _mm_loadh_pi().  gcc insists upon having an existing variable for
   VAL, which is however never used.  Thus, it generates code to move
   values in and out the variable.  Worse still, gcc-4.0 stores VAL on
   the stack, causing valgrind to complain about uninitialized reads. */
static inline V LD(const R *x, INT ivs, const R *aligned_like)
{
     V var;
     (void)aligned_like; /* UNUSED */
     __asm__("movlps %1, %0\n\tmovhps %2, %0"
	     : "=x"(var) : "m"(x[0]), "m"(x[ivs]));
     return var;
}
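/* What the asm computes, in intrinsic form (cf. the non-gcc branch
   below): the low 64 bits of VAR come from the complex number at x,
   the high 64 bits from the one at x + ivs.  Writing it as asm avoids
   the dummy-variable code-generation problem described above. */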
#else

# define DVK(var, val) const R var = K(val)
# define LDK(x) _mm_set_ps1(x)
# define LOADH(addr, val) _mm_loadh_pi(val, (const __m64 *)(addr))
# define LOADL0(addr, val) _mm_loadl_pi(val, (const __m64 *)(addr))

static inline V LD(const R *x, INT ivs, const R *aligned_like)
{
     V var;
     (void)aligned_like; /* UNUSED */
     var = LOADL0(x, var);
     var = LOADH(x + ivs, var);
     return var;
}
#endif
#define VFMA(a, b, c) VADD(c, VMUL(a, b))
#define VFNMS(a, b, c) VSUB(c, VMUL(a, b))
#define VFMS(a, b, c) VSUB(VMUL(a, b), c)

#define SHUFVAL(fp0,fp1,fp2,fp3) \
   (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
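/* Example: SHUFVAL(1, 0, 3, 2) = (2 << 6) | (3 << 4) | (0 << 2) | 1 = 0xB1,
   the _mm_shuffle_ps immediate that swaps adjacent floats,
   {x0, x1, x2, x3} -> {x1, x0, x3, x2} (used by FLIP_RI below to
   exchange real and imaginary parts). */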
static inline V LDA(const R *x, INT ivs, const R *aligned_like)
{
     (void)aligned_like; /* UNUSED */
     (void)ivs; /* UNUSED */
     return *(const V *)x;
}
static inline void ST(R *x, V v, INT ovs, const R *aligned_like)
{
     (void)aligned_like; /* UNUSED */
     /* WARNING: the extra_iter hack depends upon STOREL occurring
	after STOREH */
     STOREH(x + ovs, v);
     STOREL(x, v);
}
static inline void STA(R *x, V v, INT ovs, const R *aligned_like)
{
     (void)aligned_like; /* UNUSED */
     (void)ovs; /* UNUSED */
     *(V *)x = v;
}
#if 0
/* this should be faster but it isn't. */
static inline void STN2(R *x, V v0, V v1, INT ovs)
{
     STA(x, SHUFPS(v0, v1, SHUFVAL(0, 1, 0, 1)), ovs, 0);
     STA(x + ovs, SHUFPS(v0, v1, SHUFVAL(2, 3, 2, 3)), ovs, 0);
}
#endif
#define STN2(x, v0, v1, ovs) /* nop */
#define STM4(x, v, ovs, aligned_like) /* no-op */
#ifdef VISUAL_CXX_DOES_NOT_SUCK
static inline void STN4(R *x, V v0, V v1, V v2, V v3, INT ovs)
{
     V x0, x1, x2, x3;
     x0 = UNPCKL(v0, v2);
     x1 = UNPCKH(v0, v2);
     x2 = UNPCKL(v1, v3);
     x3 = UNPCKH(v1, v3);
     STA(x, UNPCKL(x0, x2), 0, 0);
     STA(x + ovs, UNPCKH(x0, x2), 0, 0);
     STA(x + 2 * ovs, UNPCKL(x1, x3), 0, 0);
     STA(x + 3 * ovs, UNPCKH(x1, x3), 0, 0);
}
#else /* Visual C++ sucks */
/*
  Straight from the mouth of the horse:

   We "reserved" the possibility of aligning arguments with
   __declspec(align(X)) passed by value by issuing this error.

   The first 3 parameters of type __m64 (or other MMX types) are
   passed in registers.  The rest would be passed on the stack.  We
   decided aligning the stack was wasteful, especially for __m128
   parameters.  Also, we thought it would be infrequent that people
   would want to pass more than 3 by value.

   If we didn't issue an error, we would have to break binary
   compatibility in the future if we decided to align the arguments.

   Hope that explains it.

   Jason Shirk, Visual C++ Compiler Team

   This posting is provided AS IS with no warranties, and confers no rights
*/
#define STN4(x, v0, v1, v2, v3, ovs)			\
{							\
     V xxx0, xxx1, xxx2, xxx3;				\
     xxx0 = UNPCKL(v0, v2);				\
     xxx1 = UNPCKH(v0, v2);				\
     xxx2 = UNPCKL(v1, v3);				\
     xxx3 = UNPCKH(v1, v3);				\
     STA(x, UNPCKL(xxx0, xxx2), 0, 0);			\
     STA(x + ovs, UNPCKH(xxx0, xxx2), 0, 0);		\
     STA(x + 2 * ovs, UNPCKL(xxx1, xxx3), 0, 0);	\
     STA(x + 3 * ovs, UNPCKH(xxx1, xxx3), 0, 0);	\
}
#endif
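/* Both STN4 variants implement a 4x4 float transpose: the first round of
   unpacks produces e.g. UNPCKL(v0, v2) = {v0[0], v2[0], v0[1], v2[1]},
   and the second round gathers {v0[k], v1[k], v2[k], v3[k]}, which is
   stored at x + k * ovs. */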
static inline V FLIP_RI(V x)
{
     return SHUFPS(x, x, SHUFVAL(1, 0, 3, 2));
}

extern const union uvec X(sse_pmpm);
static inline V VCONJ(V x)
{
     return VXOR(X(sse_pmpm).v, x);
}

static inline V VBYI(V x)
{
     return FLIP_RI(VCONJ(x));
}
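/* VBYI multiplies each complex element by i: assuming X(sse_pmpm) is the
   sign mask {+0.0, -0.0, +0.0, -0.0} (defined elsewhere), VCONJ negates
   the imaginary parts, {a, b} -> {a, -b}, and FLIP_RI then swaps to
   {-b, a}, which is exactly (a + bi) * i = -b + ai. */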
static inline V VZMUL(V tx, V sr)
{
     V tr = SHUFPS(tx, tx, SHUFVAL(0, 0, 2, 2));
     V ti = SHUFPS(tx, tx, SHUFVAL(1, 1, 3, 3));
     tr = VMUL(tr, sr);
     sr = VBYI(sr);
     return VADD(tr, VMUL(ti, sr));
}

static inline V VZMULJ(V tx, V sr)
{
     V tr = SHUFPS(tx, tx, SHUFVAL(0, 0, 2, 2));
     V ti = SHUFPS(tx, tx, SHUFVAL(1, 1, 3, 3));
     tr = VMUL(tr, sr);
     sr = VBYI(sr);
     return VSUB(tr, VMUL(ti, sr));
}

static inline V VZMULI(V tx, V sr)
{
     V tr = SHUFPS(tx, tx, SHUFVAL(0, 0, 2, 2));
     V ti = SHUFPS(tx, tx, SHUFVAL(1, 1, 3, 3));
     ti = VMUL(ti, sr);
     sr = VBYI(sr);
     return VSUB(VMUL(tr, sr), ti);
}

static inline V VZMULIJ(V tx, V sr)
{
     V tr = SHUFPS(tx, tx, SHUFVAL(0, 0, 2, 2));
     V ti = SHUFPS(tx, tx, SHUFVAL(1, 1, 3, 3));
     ti = VMUL(ti, sr);
     sr = VBYI(sr);
     return VADD(VMUL(tr, sr), ti);
}
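/* Worked example for VZMUL, with twiddle t = c + di in one complex slot
   of tx and s = a + bi in sr: tr = {c, c}, ti = {d, d}; tr * sr =
   {ca, cb}; VBYI(sr) = {-b, a}, so ti * VBYI(sr) = {-db, da}; the sum
   {ca - db, cb + da} is t * s.  Likewise VZMULJ computes conj(t) * s,
   VZMULI computes i * (t * s), and VZMULIJ computes i * (conj(t) * s). */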
#define VFMAI(b, c) VADD(c, VBYI(b))
#define VFNMSI(b, c) VSUB(c, VBYI(b))

/* twiddle storage #1: compact, slower */
#define VTW1(v,x)						\
  {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_SIN, v, x}, {TW_SIN, v+1, x}
#define TWVL1 (2 * VL)
static inline V BYTW1(const R *t, V sr)
{
     const V *twp = (const V *)t;
     V tx = twp[0];
     V tr = UNPCKL(tx, tx);
     V ti = UNPCKH(tx, tx);
     tr = VMUL(tr, sr);
     sr = VBYI(sr);
     return VADD(tr, VMUL(ti, sr));
}

static inline V BYTWJ1(const R *t, V sr)
{
     const V *twp = (const V *)t;
     V tx = twp[0];
     V tr = UNPCKL(tx, tx);
     V ti = UNPCKH(tx, tx);
     tr = VMUL(tr, sr);
     sr = VBYI(sr);
     return VSUB(tr, VMUL(ti, sr));
}
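/* With storage #1, twp[0] = {c0, c1, s0, s1}, so UNPCKL(tx, tx) =
   {c0, c0, c1, c1} and UNPCKH(tx, tx) = {s0, s0, s1, s1}: BYTW1
   multiplies element k of sr by the twiddle c_k + i s_k, exactly as
   VZMUL does, at the cost of the two unpacks. */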
/* twiddle storage #2: twice the space, faster (when in cache) */
#define VTW2(v,x)							\
  {TW_COS, v, x}, {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+1, x},	\
  {TW_SIN, v, -x}, {TW_SIN, v, x}, {TW_SIN, v+1, -x}, {TW_SIN, v+1, x}
#define TWVL2 (4 * VL)
static inline V BYTW2(const R *t, V sr)
{
     const V *twp = (const V *)t;
     V si = FLIP_RI(sr);
     V tr = twp[0], ti = twp[1];
     return VADD(VMUL(tr, sr), VMUL(ti, si));
}

static inline V BYTWJ2(const R *t, V sr)
{
     const V *twp = (const V *)t;
     V si = FLIP_RI(sr);
     V tr = twp[0], ti = twp[1];
     return VSUB(VMUL(tr, sr), VMUL(ti, si));
}
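/* Storage #2 trades space for speed: tr = {c0, c0, c1, c1} and
   ti = {-s0, s0, -s1, s1} are stored ready to use, so BYTW2 needs no
   unpacks or VBYI at run time: with si = FLIP_RI(sr) = {b, a},
   tr * sr + ti * si = {c0 a - s0 b, c0 b + s0 a}, again t * s. */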
/* twiddle storage #3 */
#define VTW3(v,x) {TW_CEXP, v, x}, {TW_CEXP, v+1, x}
#define TWVL3 (2 * VL)
/* twiddle storage for split arrays */
#define VTWS(v,x)							\
  {TW_COS, v, x}, {TW_COS, v+1, x}, {TW_COS, v+2, x}, {TW_COS, v+3, x},	\
  {TW_SIN, v, x}, {TW_SIN, v+1, x}, {TW_SIN, v+2, x}, {TW_SIN, v+3, x}
#define TWVLS (2 * VL)

#endif /* compiler supports SSE */